forked from osmocom/wireshark
Add sse4.2 optimized function ws_mempbrk_sse42()
In text protocols, like SIP, a lot of time is spent in guint8_pbrk(); assume that the text is not binary (no NULs), and use the SSE4.2 pcmpistri instruction. Also move & rename guint8_pbrk() from tvbuff.c as _ws_mempbrk. HAVE_SSE42 must be defined to use _ws_mempbrk_sse42(); it is only activated for Windows currently. Change-Id: Ic853d84805bdb6492c4f45d2bcc79a973fd9804e Reviewed-on: https://code.wireshark.org/review/1730 Reviewed-by: Anders Broman <a.broman58@gmail.com>
This commit is contained in:
parent
6669566199
commit
fcb710baec
|
@ -281,3 +281,6 @@
|
|||
#if !defined(QT_VERSION) || !defined(_SSIZE_T_DEFINED)
|
||||
typedef int ssize_t;
|
||||
#endif
|
||||
|
||||
/* Define to use _ws_mempbrk_sse42() when SSE4.2 is available (checked at run time with CPUID) */
|
||||
#define HAVE_SSE42 1
|
|
@ -739,26 +739,17 @@ fast_ensure_contiguous(tvbuff_t *tvb, const gint offset, const guint length)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
extern const guint8 *ws_mempbrk(const guint8* haystack, size_t haystacklen, const guint8 *needles);
|
||||
|
||||
static inline const guint8*
|
||||
guint8_pbrk(const guint8* haystack, size_t haystacklen, const guint8 *needles, guchar *found_needle)
|
||||
{
|
||||
gchar tmp[256] = { 0 };
|
||||
const guint8 *haystack_end;
|
||||
const guint8 *result = ws_mempbrk(haystack, haystacklen, needles);
|
||||
|
||||
while (*needles)
|
||||
tmp[*needles++] = 1;
|
||||
if (result && found_needle)
|
||||
*found_needle = *result;
|
||||
|
||||
haystack_end = haystack + haystacklen;
|
||||
while (haystack < haystack_end) {
|
||||
if (tmp[*haystack]) {
|
||||
if (found_needle)
|
||||
*found_needle = *haystack;
|
||||
return haystack;
|
||||
}
|
||||
haystack++;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
@ -2963,6 +2954,12 @@ tvb_get_nstringz0(tvbuff_t *tvb, const gint offset, const guint bufsize, guint8*
|
|||
gint
|
||||
tvb_find_line_end(tvbuff_t *tvb, const gint offset, int len, gint *next_offset, const gboolean desegment)
|
||||
{
|
||||
#ifdef WIN32
|
||||
static const char __declspec(align(16)) crlf[] = "\r\n" ;
|
||||
#else
|
||||
static const char crlf[] __attribute__((aligned(16))) = "\r\n" ;
|
||||
#endif
|
||||
|
||||
gint eob_offset;
|
||||
gint eol_offset;
|
||||
int linelen;
|
||||
|
@ -2981,7 +2978,7 @@ tvb_find_line_end(tvbuff_t *tvb, const gint offset, int len, gint *next_offset,
|
|||
/*
|
||||
* Look either for a CR or an LF.
|
||||
*/
|
||||
eol_offset = tvb_pbrk_guint8(tvb, offset, len, "\r\n", &found_needle);
|
||||
eol_offset = tvb_pbrk_guint8(tvb, offset, len, crlf, &found_needle);
|
||||
if (eol_offset == -1) {
|
||||
/*
|
||||
* No CR or LF - line is presumably continued in next packet.
|
||||
|
|
|
@ -52,6 +52,10 @@ if NEED_STRPTIME_LO
|
|||
wsutil_optional_objects += @STRPTIME_LO@
|
||||
endif
|
||||
|
||||
## if SSE42_SUPPORTED
|
||||
wsutil_optional_objects += libwsutil_sse42.la
|
||||
## endif
|
||||
|
||||
include ../Makefile.am.inc
|
||||
|
||||
include Makefile.common
|
||||
|
@ -62,6 +66,8 @@ if HAVE_WARNINGS_AS_ERRORS
|
|||
AM_CFLAGS += -Werror
|
||||
endif
|
||||
|
||||
noinst_LTLIBRARIES = libwsutil_sse42.la
|
||||
|
||||
lib_LTLIBRARIES = libwsutil.la
|
||||
# http://www.gnu.org/software/libtool/manual/html_node/Updating-version-info.html
|
||||
libwsutil_la_LDFLAGS = -version-info 0:0:0 @LDFLAGS_SHAREDLIB@
|
||||
|
@ -74,6 +80,11 @@ libwsutil_la_SOURCES = \
|
|||
$(LIBWSUTIL_SRC) \
|
||||
$(LIBWSUTIL_INCLUDES)
|
||||
|
||||
libwsutil_sse42_la_SOURCES = \
|
||||
ws_mempbrk_sse42.c
|
||||
|
||||
libwsutil_sse42_la_CFLAGS = $(AM_CFLAGS) -msse4.2
|
||||
|
||||
EXTRA_libwsutil_la_SOURCES= \
|
||||
inet_aton.c \
|
||||
inet_aton.h \
|
||||
|
|
|
@ -60,6 +60,7 @@ LIBWSUTIL_SRC = \
|
|||
tempfile.c \
|
||||
time_util.c \
|
||||
type_util.c \
|
||||
ws_mempbrk.c \
|
||||
u3.c \
|
||||
unicode-utils.c
|
||||
|
||||
|
|
|
@ -27,7 +27,8 @@ OBJECTS = file_util.obj \
|
|||
inet_pton.obj \
|
||||
$(LIBWSUTIL_SRC:.c=.obj) \
|
||||
strptime.obj \
|
||||
wsgetopt.obj
|
||||
wsgetopt.obj \
|
||||
ws_mempbrk_sse42.obj
|
||||
|
||||
# For use when making libwsutil.dll
|
||||
libwsutil.lib: libwsutil.dll
|
||||
|
|
|
@ -0,0 +1,74 @@
|
|||
/* ws_mempbrk.c
|
||||
*
|
||||
* Wireshark - Network traffic analyzer
|
||||
* By Gerald Combs <gerald@wireshark.org>
|
||||
* Copyright 1998 Gerald Combs
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <glib.h>
|
||||
#include "ws_symbol_export.h"
|
||||
#include "ws_cpuid.h"
|
||||
|
||||
#ifdef HAVE_SSE42
|
||||
extern const char *_ws_mempbrk_sse42(const char* haystack, size_t haystacklen, const char *needles);
|
||||
#endif
|
||||
|
||||
const guint8 *_ws_mempbrk(const guint8* haystack, size_t haystacklen, const guint8 *needles);
|
||||
|
||||
const guint8 *
|
||||
_ws_mempbrk(const guint8* haystack, size_t haystacklen, const guint8 *needles)
|
||||
{
|
||||
gchar tmp[256] = { 0 };
|
||||
const guint8 *haystack_end;
|
||||
|
||||
while (*needles)
|
||||
tmp[*needles++] = 1;
|
||||
|
||||
haystack_end = haystack + haystacklen;
|
||||
while (haystack < haystack_end) {
|
||||
if (tmp[*haystack])
|
||||
return haystack;
|
||||
haystack++;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Return a pointer to the first byte in haystack[0 .. haystacklen) that is
 * equal to any byte of the NUL-terminated string `needles`, or NULL when
 * there is no such byte (or when `needles` is empty).
 *
 * When built with HAVE_SSE42, haystacks of at least 16 bytes are handed to
 * _ws_mempbrk_sse42() if CPUID reports SSE4.2; everything else goes through
 * the portable _ws_mempbrk().
 */
WS_DLL_PUBLIC const guint8 *
ws_mempbrk(const guint8* haystack, size_t haystacklen, const guint8 *needles)
{
	/* An empty needle set can never match. */
	if (*needles == 0)
		return NULL;

#ifdef HAVE_SSE42
	/*
	 * The SSE4.2 path is only taken for haystacks of 16 bytes or more,
	 * so do not spend a CPUID on shorter ones.
	 */
	if (haystacklen >= 16) {
		/*
		 * Zero-filled so that the feature test below reads a defined
		 * value (and fails safe) should ws_cpuid() leave the array
		 * untouched.
		 */
		guint32 CPUInfo[4] = { 0, 0, 0, 0 };

		ws_cpuid(CPUInfo, 1);

		/* CPUID leaf 1: bit 20 of ECX (CPUInfo[2]) flags SSE4.2. */
		if (CPUInfo[2] & 0x100000) {
			/*
			 * The SSE4.2 routine uses plain char pointers; the
			 * casts only change the signedness of the pointee.
			 */
			return (const guint8 *)_ws_mempbrk_sse42(
			    (const char *)haystack, haystacklen,
			    (const char *)needles);
		}
	}
#endif

	return _ws_mempbrk(haystack, haystacklen, needles);
}
|
|
@ -0,0 +1,188 @@
|
|||
/* strcspn with SSE4.2 intrinsics
|
||||
Copyright (C) 2009-2014 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <glib.h>
|
||||
|
||||
#ifdef WIN32
|
||||
#include <tmmintrin.h>
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#include <nmmintrin.h>
|
||||
#include <string.h>
|
||||
|
||||
extern const guint8 *_ws_mempbrk(const guint8* haystack, size_t haystacklen, const guint8 *needles);
|
||||
const char *_ws_mempbrk_sse42(const char* haystack, size_t haystacklen, const char *needles);
|
||||
|
||||
/* Helper for variable shifts of SSE registers.
|
||||
Copyright (C) 2010 Free Software Foundation, Inc.
|
||||
*/
|
||||
|
||||
/* Shuffle-control table for __m128i_shift_right().

   A 16-byte window starting at entry N (0 <= N <= 15) holds the source
   indices N, N+1, ..., 15 followed by N entries of -1, i.e. the control
   bytes for "shift right by N bytes".  31 entries are exactly enough for
   the window that starts at entry 15.

   The element type is the built-in `signed char' rather than int8_t, so
   the table does not depend on <stdint.h>, which this file only includes
   for WIN32 builds.  */
static const signed char ___m128i_shift_right[31] =
  {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
  };
|
||||
|
||||
static inline __m128i
|
||||
__m128i_shift_right (__m128i value, unsigned long int offset)
|
||||
{
|
||||
/* _mm_loadu_si128() works with unaligned data, cast safe */
|
||||
return _mm_shuffle_epi8 (value,
|
||||
_mm_loadu_si128 ((__m128i *) (void *) (___m128i_shift_right + offset)));
|
||||
}
|
||||
|
||||
/* We use 0x2:
|
||||
_SIDD_SBYTE_OPS
|
||||
| _SIDD_CMP_EQUAL_ANY
|
||||
| _SIDD_POSITIVE_POLARITY
|
||||
| _SIDD_LEAST_SIGNIFICANT
|
||||
on pcmpistri to compare xmm/mem128
|
||||
|
||||
0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
X X X X X X X X X X X X X X X X
|
||||
|
||||
against xmm
|
||||
|
||||
0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
A A A A A A A A A A A A A A A A
|
||||
|
||||
to find out if the first 16byte data element has any byte A and
|
||||
the offset of the first byte. There are 3 cases:
|
||||
|
||||
1. The first 16byte data element has the byte A at the offset X.
|
||||
2. The first 16byte data element has EOS and doesn't have the byte A.
|
||||
3. The first 16byte data element is valid and doesn't have the byte A.
|
||||
|
||||
Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
|
||||
|
||||
1 X 1 0/1 0
|
||||
2 16 0 1 0
|
||||
3 16 0 0 0
|
||||
|
||||
We exit from the loop for cases 1 and 2 with jbe which branches
|
||||
when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
|
||||
X for case 1. */
|
||||
|
||||
const char *
|
||||
_ws_mempbrk_sse42(const char *s, size_t slen, const char *a)
|
||||
{
|
||||
const char *aligned;
|
||||
__m128i mask;
|
||||
int offset;
|
||||
|
||||
offset = (int) ((size_t) a & 15);
|
||||
aligned = (const char *) ((size_t) a & -16L);
|
||||
if (offset != 0)
|
||||
{
|
||||
int length;
|
||||
|
||||
/* Load masks. */
|
||||
/* cast safe - _mm_load_si128() it's 16B aligned */
|
||||
mask = __m128i_shift_right(_mm_load_si128 ((__m128i *) (void *) aligned), offset);
|
||||
|
||||
/* Find where the NULL terminator is. */
|
||||
length = _mm_cmpistri (mask, mask, 0x3a);
|
||||
if (length == 16 - offset)
|
||||
{
|
||||
/* There is no NULL terminator. */
|
||||
__m128i mask1 = _mm_load_si128 ((__m128i *) (void *) (aligned + 16));
|
||||
int index = _mm_cmpistri (mask1, mask1, 0x3a);
|
||||
length += index;
|
||||
|
||||
/* Don't use SSE4.2 if the length of A > 16. */
|
||||
if (length > 16)
|
||||
return _ws_mempbrk(s, slen, a);
|
||||
|
||||
if (index != 0)
|
||||
{
|
||||
/* Combine mask0 and mask1. We could play games with
|
||||
palignr, but frankly this data should be in L1 now
|
||||
so do the merge via an unaligned load. */
|
||||
mask = _mm_loadu_si128 ((__m128i *) (void *) a);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int length;
|
||||
|
||||
/* A is aligned. (cast safe) */
|
||||
mask = _mm_load_si128 ((__m128i *) (void *) a);
|
||||
|
||||
/* Find where the NULL terminator is. */
|
||||
length = _mm_cmpistri (mask, mask, 0x3a);
|
||||
if (length == 16)
|
||||
{
|
||||
/* There is no NULL terminator. Don't use SSE4.2 if the length
|
||||
of A > 16. */
|
||||
if (a[16] != 0)
|
||||
return _ws_mempbrk(s, slen, a);
|
||||
}
|
||||
}
|
||||
|
||||
offset = (int) ((size_t) s & 15);
|
||||
aligned = (const char *) ((size_t) s & -16L);
|
||||
if (offset != 0)
|
||||
{
|
||||
/* Check partial string. cast safe it's 16B aligned */
|
||||
__m128i value = __m128i_shift_right (_mm_load_si128 ((__m128i *) (void *) aligned), offset);
|
||||
|
||||
int length = _mm_cmpistri (mask, value, 0x2);
|
||||
/* No need to check ZFlag since ZFlag is always 1. */
|
||||
int cflag = _mm_cmpistrc (mask, value, 0x2);
|
||||
int index = _mm_cmpistri (value, value, 0x3a);
|
||||
|
||||
if (cflag)
|
||||
return s + length;
|
||||
/* Find where the NULL terminator is. */
|
||||
if (index < 16 - offset)
|
||||
{
|
||||
/* fond NUL @ 'index', need to switch to slower mempbrk */
|
||||
return _ws_mempbrk(s + index + 1, slen - index - 1, a); /* slen is bigger than 16 & index < 16 so no undeflow here */
|
||||
}
|
||||
aligned += 16;
|
||||
slen -= (16 - offset);
|
||||
}
|
||||
else
|
||||
aligned = s;
|
||||
|
||||
while (slen >= 16)
|
||||
{
|
||||
__m128i value = _mm_load_si128 ((__m128i *) (void *) aligned);
|
||||
int index = _mm_cmpistri (mask, value, 0x2);
|
||||
int cflag = _mm_cmpistrc (mask, value, 0x2);
|
||||
int zflag = _mm_cmpistrz (mask, value, 0x2);
|
||||
|
||||
if (cflag)
|
||||
return aligned + index;
|
||||
if (zflag)
|
||||
{
|
||||
/* found NUL, need to switch to slower mempbrk */
|
||||
return _ws_mempbrk(aligned, slen, a);
|
||||
}
|
||||
aligned += 16;
|
||||
slen -= 16;
|
||||
}
|
||||
|
||||
/* XXX, use mempbrk_slow here? */
|
||||
return _ws_mempbrk(aligned, slen, a);
|
||||
}
|
Loading…
Reference in New Issue