epan: Add BASE_SHOW_UTF_8_PRINTABLE

Add BASE_SHOW_UTF_8_PRINTABLE and related function tvb_utf_8_isprint
for supporting fields of bytes that are "maybe UTF-8" (default or
SHOULD be UTF-8 but could be something else, with no encoding indicator),
such as SSID fields in IEEE 802.11 (See #16208), certain OctetString
fields in Diameter or PFCP, and other places where
BASE_SHOW_ASCII_PRINTABLE is currently used. Fix #5307
This commit is contained in:
John Thacker 2022-02-03 08:28:11 -05:00 committed by A Wireshark GitLab Utility
parent ebe22f7b7b
commit 25d0c88251
9 changed files with 83 additions and 12 deletions

View File

@ -1883,6 +1883,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
tvb_uncompress_lz77huff@Base 3.1.0
tvb_uncompress_lznt1@Base 3.1.0
tvb_unicode_strsize@Base 1.9.1
tvb_utf_8_isprint@Base 3.7.0
tvb_ws_mempbrk_pattern_guint8@Base 1.99.3
tvbparse_casestring@Base 1.9.1
tvbparse_char@Base 1.9.1

View File

@ -132,7 +132,8 @@ FIELDDISPLAY --For FT_UINT{8,16,24,32,40,48,56,64} and
BASE_CUSTOM, or BASE_NONE, possibly ORed with
BASE_RANGE_STRING, BASE_EXT_STRING, BASE_VAL64_STRING,
BASE_ALLOW_ZERO, BASE_UNIT_STRING, BASE_SPECIAL_VALS,
BASE_NO_DISPLAY_VALUE, or BASE_SHOW_ASCII_PRINTABLE
BASE_NO_DISPLAY_VALUE, BASE_SHOW_ASCII_PRINTABLE, or
BASE_SHOW_UTF_8_PRINTABLE
BASE_NONE may be used with a non-NULL FIELDCONVERT when the
numeric value of the field itself is not of significance to
@ -182,8 +183,8 @@ FIELDDISPLAY --For FT_UINT{8,16,24,32,40,48,56,64} and
SEP_DOT, SEP_DASH, SEP_COLON, or SEP_SPACE to provide
a separator between bytes; BASE_NONE has no separator
between bytes. These can be ORed with BASE_ALLOW_ZERO
and BASE_SHOW_ASCII_PRINTABLE.
between bytes. These can be ORed with BASE_ALLOW_ZERO,
BASE_SHOW_ASCII_PRINTABLE, or BASE_SHOW_UTF_8_PRINTABLE.
BASE_ALLOW_ZERO displays <none> instead of <MISSING>
for a zero-sized byte array.
@ -192,6 +193,11 @@ FIELDDISPLAY --For FT_UINT{8,16,24,32,40,48,56,64} and
characters and, if so, will display the field's value
as a string, in quotes. The value will still be
filterable as a byte value.
BASE_SHOW_UTF_8_PRINTABLE will check whether the
field's value is valid UTF-8 consisting entirely of
printable characters and, if so, will display the field's
value as a string, in quotes. The value will still be
filterable as a byte value.
--For FT_IPv4:

View File

@ -59,6 +59,7 @@ static ws_enum_t all_enums[] = {
ENUM(BASE_PT_UDP),
ENUM(BASE_RANGE_STRING),
ENUM(BASE_SHOW_ASCII_PRINTABLE),
ENUM(BASE_SHOW_UTF_8_PRINTABLE),
ENUM(BASE_SPECIAL_VALS),
ENUM(BASE_UNIT_STRING),
ENUM(BASE_VAL64_STRING),

View File

@ -1082,7 +1082,18 @@ hfinfo_format_bytes(wmem_allocator_t *scope, const header_field_info *hfinfo,
gboolean is_printable;
if (bytes) {
if (hfinfo->display & BASE_SHOW_ASCII_PRINTABLE) {
if (hfinfo->display & BASE_SHOW_UTF_8_PRINTABLE) {
/*
* If all bytes are valid and printable UTF-8, show the
* bytes as a string - in quotes to indicate that it's
* a string.
*/
if (isprint_utf8_string(bytes, length)) {
str = wmem_strdup_printf(scope, "\"%.*s\"",
(int)length, bytes);
return str;
}
} else if (hfinfo->display & BASE_SHOW_ASCII_PRINTABLE) {
/*
* Check whether all bytes are printable.
*/

View File

@ -718,6 +718,8 @@ typedef enum {
#define BASE_SHOW_ASCII_PRINTABLE 0x00010000 /**< show byte array as ASCII if it's all printable characters */
#define BASE_SHOW_UTF_8_PRINTABLE 0x00020000 /**< show byte array as UTF-8 if it's all valid and printable UTF-8 characters */
/** BASE_ values that cause the field value to be displayed twice */
#define IS_BASE_DUAL(b) ((b)==BASE_DEC_HEX||(b)==BASE_HEX_DEC)

View File

@ -3884,6 +3884,18 @@ gboolean tvb_ascii_isprint(tvbuff_t *tvb, const gint offset, const gint length)
return TRUE;
}
/*
 * Report whether the requested portion of a tvbuff passes
 * isprint_utf8_string(), i.e. is accepted by that helper as an
 * all-printable UTF-8 string.
 *
 * A length of -1 means "from offset through the end of the tvbuff"; in
 * that case the number of bytes handed to the checker is the remaining
 * length computed from the tvbuff rather than the caller's value.
 */
gboolean tvb_utf_8_isprint(tvbuff_t *tvb, const gint offset, const gint length)
{
    /* Fetch the data first: tvb_get_ptr does the exception checking. */
    const guint8 *data = tvb_get_ptr(tvb, offset, length);
    guint nbytes = length;
    guint start;

    if (length != -1) {
        /* Explicit length: check exactly that many bytes. */
        return isprint_utf8_string(data, nbytes);
    }

    /*
     * "To the end" was requested. tvb_get_ptr has already checked for
     * exceptions, so the return value here is deliberately not inspected;
     * only the remaining length is of interest.
     */
    compute_offset_and_remaining(tvb, offset, &start, &nbytes);
    return isprint_utf8_string(data, nbytes);
}
static ws_mempbrk_pattern pbrk_crlf;
/*

View File

@ -809,6 +809,18 @@ WS_DLL_PUBLIC gint tvb_get_raw_bytes_as_string(tvbuff_t *tvb, const gint offset,
WS_DLL_PUBLIC gboolean tvb_ascii_isprint(tvbuff_t *tvb, const gint offset,
const gint length);
/** Checks whether the provided portion of the tvb is valid UTF-8
 * consisting entirely of printable characters.
 *
 * The characters must be complete: if the portion ends in a partial
 * sequence that could begin a valid character, this returns FALSE.
 *
 * @param tvb The tvbuff to examine
 * @param offset The offset into the tvbuff at which the portion starts
 * @param length The length of the portion; may be -1 for "all the way
 * to the end of the tvbuff"
 * @return TRUE if printable, FALSE otherwise
 *
 * @see isprint_utf8_string()
 */
WS_DLL_PUBLIC gboolean tvb_utf_8_isprint(tvbuff_t *tvb, const gint offset,
const gint length);
/**
* Given a tvbuff, an offset into the tvbuff, and a length that starts
* at that offset (which may be -1 for "all the way to the end of the

View File

@ -273,18 +273,24 @@ isprint_string(const gchar *str)
/* Check if an entire UTF-8 string is printable. */
gboolean
isprint_utf8_string(const gchar *str, guint length)
isprint_utf8_string(const gchar *str, const guint length)
{
const char *c;
const gchar *strend = str + length;
if (!g_utf8_validate (str, length, NULL)) {
if (!g_utf8_validate_len(str, length, NULL)) {
return FALSE;
}
for (c = str; *c; c = g_utf8_next_char(c)) {
if (!g_unichar_isprint(g_utf8_get_char(c))) {
while (str < strend) {
/* This returns false for G_UNICODE_CONTROL | G_UNICODE_FORMAT |
* G_UNICODE_UNASSIGNED | G_UNICODE_SURROGATE
* XXX: Could it be ok to have certain format characters, e.g.
* U+00AD SOFT HYPHEN? If so, format_text() should be changed too.
*/
if (!g_unichar_isprint(g_utf8_get_char(str))) {
return FALSE;
}
str = g_utf8_next_char(str);
}
return TRUE;

View File

@ -114,14 +114,34 @@ gchar *ascii_strup_inplace(gchar *str);
WS_DLL_PUBLIC
gboolean isprint_string(const gchar *str);
/** Check if an entire UTF-8 string consists of printable characters
/** Given a not-necessarily-null-terminated string, expected to be in
* UTF-8 but possibly containing invalid sequences (as it may have come
* from packet data), and the length of the string, determine if the
* string is valid UTF-8 consisting entirely of printable characters.
*
* This means that it:
*
* does not contain an illegal UTF-8 sequence (including overlong encodings,
* the sequences reserved for UTF-16 surrogate halves, and the values for
* code points above U+10FFFF that are no longer in Unicode)
*
* does not contain a non-printable Unicode character such as control
* characters (including internal NUL bytes)
*
* does not end in a partial sequence that could begin a valid character;
*
* does not start with a partial sequence that could end a valid character;
*
* and thus guarantees that the result of format_text() would be the same as
* that of wmem_strndup() with the same parameters.
*
* @param str The string to be checked
* @param length The number of bytes to validate
* @return TRUE if the entire string is printable, otherwise FALSE
* @return TRUE if the entire string is valid and printable UTF-8,
* otherwise FALSE
*/
WS_DLL_PUBLIC
gboolean isprint_utf8_string(const gchar *str, guint length);
gboolean isprint_utf8_string(const gchar *str, const guint length);
/** Check if an entire string consists of digits
*