diff --git a/epan/ftypes/ftype-bytes.c b/epan/ftypes/ftype-bytes.c index c1d57f0bbd..9bfc37b637 100644 --- a/epan/ftypes/ftype-bytes.c +++ b/epan/ftypes/ftype-bytes.c @@ -665,30 +665,6 @@ cmp_matches(const fvalue_t *fv_a, const fvalue_t *fv_b) if (! regex) { return FALSE; } - /* - * XXX - do we want G_REGEX_RAW or not? - * - * If we're matching against a string, we don't want it (and - * we want the string value encoded in UTF-8 - and, if it can't - * be converted to UTF-8, because it's in a character encoding - * that doesn't map every possible byte sequence to Unicode (and - * that includes strings that are supposed to be in UTF-8 but - * that contain invalid UTF-8 sequences!), treat the match as - * failing. - * - * If we're matching against binary data, and matching a binary - * pattern (e.g. "0xfa, 3 or more 0xff, and 0x37, in order"), - * we'd want G_REGEX_RAW. If we're matching a text pattern, - * it's not clear *what* the right thing to do is - if they're - * matching against a pattern containing non-ASCII characters, - * they might want it to match in whatever encoding the binary - * data is, but Wireshark might not have a clue what that - * encoding is. In addition, it's not clear how to tell - * whether a pattern is "binary" or not, short of having - * a different (non-PCRE) syntax for binary patterns. - * - * So we don't use G_REGEX_RAW for now. - */ return g_regex_match_full( regex, /* Compiled PCRE */ (char *)a->data, /* The data to check for the pattern... */ diff --git a/epan/ftypes/ftype-pcre.c b/epan/ftypes/ftype-pcre.c index 5c9ad9f97b..ac854c73a2 100644 --- a/epan/ftypes/ftype-pcre.c +++ b/epan/ftypes/ftype-pcre.c @@ -33,28 +33,6 @@ gregex_fvalue_free(fvalue_t *fv) } } -/* Determines whether pattern needs to match raw byte sequences */ -static gboolean -raw_flag_needed(const gchar *pattern) -{ - gboolean found = FALSE; - const gchar *s = pattern; - size_t i, len; - - /* find any character whose hex value is two letters */ - len = strlen(s); - for (i = 0; i < len; i++) { - /* Upper and lower-nibble must be >= 0xA */ - if ((guchar)(s[i] & 0xF0) >= 0xA0 && - (guchar)(s[i] & 0x0F) >= 0x0A) - { - found = TRUE; - break; - } - } - return found; -} - /* Generate a FT_PCRE from a parsed string pattern. * On failure, if err_msg is non-null, set *err_msg to point to a * g_malloc()ed error message. */ @@ -64,12 +42,16 @@ val_from_string(fvalue_t *fv, const char *pattern, gchar **err_msg) GError *regex_error = NULL; GRegexCompileFlags cflags = (GRegexCompileFlags)(G_REGEX_CASELESS | G_REGEX_OPTIMIZE); - /* Set RAW flag only if pattern requires matching raw byte - sequences. Otherwise, omit it so that GRegex treats its - input as UTF8-encoded string. */ - if (raw_flag_needed(pattern)) { - cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW); - } + /* + * As FT_BYTES and FT_PROTOCOL contain arbitrary binary data and FT_STRING + * is not guaranteed to contain valid UTF-8, we have to disable support for + * UTF-8 patterns and treat every pattern and subject as raw bytes. + * + * Should support for UTF-8 patterns be necessary, then we should compile a + * pattern without G_REGEX_RAW. Additionally, we MUST use g_utf8_validate() + * before calling g_regex_match_full() or risk crashes. + */ + cflags = (GRegexCompileFlags)(cflags | G_REGEX_RAW); /* Free up the old value, if we have one */ gregex_fvalue_free(fv);