Fix crash when using the "matches" operator on non-UTF-8 data
GRegex is a thin wrapper around PCRE. Inputs (patterns and subjects) are assumed to be UTF-8 by default (unless G_REGEX_RAW is set). If the subject is not valid UTF-8, normally pcre_exec() will immediately return a failure. However, as GLib sets PCRE_NO_UTF8_CHECK when G_REGEX_RAW is not given (that is, in UTF-8 mode), pcre_exec() will skip the safety check and crash instead. Fix this by always setting G_REGEX_RAW, so that every pattern and subject is treated as raw bytes. Regression risk: patterns such as `ö.ï` will no longer match `öñï` since `ñ` is a multi-byte sequence. Patterns such as `(GET|POST) /` remain functional though. Bug: 14905 Change-Id: I6450bb83f565d377f82a5dbb01690c5f49acd96f Reviewed-on: https://code.wireshark.org/review/31935 Petri-Dish: Peter Wu <peter@lekensteyn.nl> Tested-by: Petri Dish Buildbot Reviewed-by: Anders Broman <a.broman58@gmail.com>
This commit is contained in:
parent
f2dc64e9b8
commit
0ca65a66f4
|
@ -665,30 +665,6 @@ cmp_matches(const fvalue_t *fv_a, const fvalue_t *fv_b)
|
|||
if (! regex) {
|
||||
return FALSE;
|
||||
}
|
||||
/*
|
||||
* XXX - do we want G_REGEX_RAW or not?
|
||||
*
|
||||
* If we're matching against a string, we don't want it (and
|
||||
* we want the string value encoded in UTF-8 - and, if it can't
|
||||
* be converted to UTF-8, because it's in a character encoding
|
||||
* that doesn't map every possible byte sequence to Unicode (and
|
||||
* that includes strings that are supposed to be in UTF-8 but
|
||||
* that contain invalid UTF-8 sequences!), treat the match as
|
||||
* failing.
|
||||
*
|
||||
* If we're matching against binary data, and matching a binary
|
||||
* pattern (e.g. "0xfa, 3 or more 0xff, and 0x37, in order"),
|
||||
* we'd want G_REGEX_RAW. If we're matching a text pattern,
|
||||
* it's not clear *what* the right thing to do is - if they're
|
||||
* matching against a pattern containing non-ASCII characters,
|
||||
* they might want it to match in whatever encoding the binary
|
||||
* data is, but Wireshark might not have a clue what that
|
||||
* encoding is. In addition, it's not clear how to tell
|
||||
* whether a pattern is "binary" or not, short of having
|
||||
* a different (non-PCRE) syntax for binary patterns.
|
||||
*
|
||||
* So we don't use G_REGEX_RAW for now.
|
||||
*/
|
||||
return g_regex_match_full(
|
||||
regex, /* Compiled PCRE */
|
||||
(char *)a->data, /* The data to check for the pattern... */
|
||||
|
|
|
@ -33,28 +33,6 @@ gregex_fvalue_free(fvalue_t *fv)
|
|||
}
|
||||
}
|
||||
|
||||
/* Determines whether pattern needs to match raw byte sequences */
|
||||
static gboolean
|
||||
raw_flag_needed(const gchar *pattern)
|
||||
{
|
||||
gboolean found = FALSE;
|
||||
const gchar *s = pattern;
|
||||
size_t i, len;
|
||||
|
||||
/* find any character whose hex value is two letters */
|
||||
len = strlen(s);
|
||||
for (i = 0; i < len; i++) {
|
||||
/* Upper and lower-nibble must be >= 0xA */
|
||||
if ((guchar)(s[i] & 0xF0) >= 0xA0 &&
|
||||
(guchar)(s[i] & 0x0F) >= 0x0A)
|
||||
{
|
||||
found = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
/* Generate a FT_PCRE from a parsed string pattern.
|
||||
* On failure, if err_msg is non-null, set *err_msg to point to a
|
||||
* g_malloc()ed error message. */
|
||||
|
@ -64,12 +42,16 @@ val_from_string(fvalue_t *fv, const char *pattern, gchar **err_msg)
|
|||
GError *regex_error = NULL;
|
||||
GRegexCompileFlags cflags = (GRegexCompileFlags)(G_REGEX_CASELESS | G_REGEX_OPTIMIZE);
|
||||
|
||||
/* Set RAW flag only if pattern requires matching raw byte
|
||||
sequences. Otherwise, omit it so that GRegex treats its
|
||||
input as UTF8-encoded string. */
|
||||
if (raw_flag_needed(pattern)) {
|
||||
cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW);
|
||||
}
|
||||
/*
|
||||
* As FT_BYTES and FT_PROTOCOL contain arbitrary binary data and FT_STRING
|
||||
* is not guaranteed to contain valid UTF-8, we have to disable support for
|
||||
* UTF-8 patterns and treat every pattern and subject as raw bytes.
|
||||
*
|
||||
* Should support for UTF-8 patterns be necessary, then we should compile a
|
||||
* pattern without G_REGEX_RAW. Additionally, we MUST use g_utf8_validate()
|
||||
* before calling g_regex_match_full() or risk crashes.
|
||||
*/
|
||||
cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW);
|
||||
|
||||
/* Free up the old value, if we have one */
|
||||
gregex_fvalue_free(fv);
|
||||
|
|
Loading…
Reference in New Issue