epan: Add BASE_SHOW_UTF_8_PRINTABLE

Add BASE_SHOW_UTF_8_PRINTABLE and related function tvb_utf_8_isprint for supporting fields of bytes that are "maybe UTF-8" (default or SHOULD be UTF-8 but could be something else, with no encoding indicator), such as SSID fields in IEEE 802.11 (See #16208), certain OctetString fields in Diameter or PFCP, and other places where BASE_SHOW_ASCII_PRINTABLE is currently used. Fix #5307
2022-02-03 08:28:11 -05:00 · 2022-02-03 08:28:11 -05:00 · 25d0c88251
parent ebe22f7b7b
commit 25d0c88251
9 changed files with 83 additions and 12 deletions
--- a/debian/libwireshark0.symbols
+++ b/debian/libwireshark0.symbols
@ -1883,6 +1883,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
 tvb_uncompress_lz77huff@Base 3.1.0
 tvb_uncompress_lznt1@Base 3.1.0
 tvb_unicode_strsize@Base 1.9.1
+ tvb_utf_8_isprint@Base 3.7.0
 tvb_ws_mempbrk_pattern_guint8@Base 1.99.3
 tvbparse_casestring@Base 1.9.1
 tvbparse_char@Base 1.9.1
--- a/doc/README.dissector
+++ b/doc/README.dissector
@ -132,7 +132,8 @@ FIELDDISPLAY    --For FT_UINT{8,16,24,32,40,48,56,64} and
                  BASE_CUSTOM, or BASE_NONE, possibly ORed with
                  BASE_RANGE_STRING, BASE_EXT_STRING, BASE_VAL64_STRING,
                  BASE_ALLOW_ZERO, BASE_UNIT_STRING, BASE_SPECIAL_VALS,
-                  BASE_NO_DISPLAY_VALUE, or BASE_SHOW_ASCII_PRINTABLE
+                  BASE_NO_DISPLAY_VALUE, BASE_SHOW_ASCII_PRINTABLE, or
+                  BASE_SHOW_UTF_8_PRINTABLE

                  BASE_NONE may be used with a non-NULL FIELDCONVERT when the
                  numeric value of the field itself is not of significance to
@ -182,8 +183,8 @@ FIELDDISPLAY    --For FT_UINT{8,16,24,32,40,48,56,64} and

                  SEP_DOT, SEP_DASH, SEP_COLON, or SEP_SPACE to provide
                  a separator between bytes; BASE_NONE has no separator
-                  between bytes.  These can be ORed with BASE_ALLOW_ZERO
-                  and BASE_SHOW_ASCII_PRINTABLE.
+                  between bytes.  These can be ORed with BASE_ALLOW_ZERO,
+                  BASE_SHOW_ASCII_PRINTABLE, or BASE_SHOW_UTF_8_PRINTABLE.

                  BASE_ALLOW_ZERO displays <none> instead of <MISSING>
                  for a zero-sized byte array.
@ -192,6 +193,11 @@ FIELDDISPLAY    --For FT_UINT{8,16,24,32,40,48,56,64} and
                  characters and, if so, will display the field's value
                  as a string, in quotes.  The value will still be
                  filterable as a byte value.
+                  BASE_SHOW_UTF_8_PRINTABLE will check whether the
+                  field's value is valid UTF-8 consisting entirely of
+                  printable characters and, if so, will display the field's
+                  value as a string, in quotes.  The value will still be
+                  filterable as a byte value.

                --For FT_IPv4:

--- a/epan/introspection-enums.c
+++ b/epan/introspection-enums.c
@ -59,6 +59,7 @@ static ws_enum_t all_enums[] = {
    ENUM(BASE_PT_UDP),
    ENUM(BASE_RANGE_STRING),
    ENUM(BASE_SHOW_ASCII_PRINTABLE),
+    ENUM(BASE_SHOW_UTF_8_PRINTABLE),
    ENUM(BASE_SPECIAL_VALS),
    ENUM(BASE_UNIT_STRING),
    ENUM(BASE_VAL64_STRING),
--- a/epan/proto.c
+++ b/epan/proto.c
@ -1082,7 +1082,18 @@ hfinfo_format_bytes(wmem_allocator_t *scope, const header_field_info *hfinfo,
 	gboolean is_printable;

 	if (bytes) {
-		if (hfinfo->display & BASE_SHOW_ASCII_PRINTABLE) {
+		if (hfinfo->display & BASE_SHOW_UTF_8_PRINTABLE) {
+			/*
+			 * If all bytes are valid and printable UTF-8, show the
+			 * bytes as a string - in quotes to indicate that it's
+			 * a string.
+			 */
+			if (isprint_utf8_string(bytes, length)) {
+				str = wmem_strdup_printf(scope, "\"%.*s\"",
+				    (int)length, bytes);
+				return str;
+			}
+		} else if (hfinfo->display & BASE_SHOW_ASCII_PRINTABLE) {
 			/*
 			 * Check whether all bytes are printable.
 			 */
--- a/epan/proto.h
+++ b/epan/proto.h
@ -718,6 +718,8 @@ typedef enum {

 #define BASE_SHOW_ASCII_PRINTABLE 0x00010000 /**< show byte array as ASCII if it's all printable characters */

+#define BASE_SHOW_UTF_8_PRINTABLE 0x00020000 /**< show byte array as UTF-8 if it's all valid and printable UTF-8 characters */
+
 /** BASE_ values that cause the field value to be displayed twice */
 #define IS_BASE_DUAL(b) ((b)==BASE_DEC_HEX||(b)==BASE_HEX_DEC)

--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@ -3884,6 +3884,18 @@ gboolean tvb_ascii_isprint(tvbuff_t *tvb, const gint offset, const gint length)
 	return TRUE;
 }

+gboolean tvb_utf_8_isprint(tvbuff_t *tvb, const gint offset, const gint length)
+{
+	const guint8* buf = tvb_get_ptr(tvb, offset, length);
+	guint abs_offset, abs_length = length;
+
+	if (length == -1) {
+		/* tvb_get_ptr has already checked for exceptions. */
+		compute_offset_and_remaining(tvb, offset, &abs_offset, &abs_length);
+	}
+
+	return isprint_utf8_string(buf, abs_length);
+}

 static ws_mempbrk_pattern pbrk_crlf;
 /*
--- a/epan/tvbuff.h
+++ b/epan/tvbuff.h
@ -809,6 +809,18 @@ WS_DLL_PUBLIC gint tvb_get_raw_bytes_as_string(tvbuff_t *tvb, const gint offset,
 WS_DLL_PUBLIC gboolean tvb_ascii_isprint(tvbuff_t *tvb, const gint offset,
 	const gint length);

+/** Iterates over the provided portion of the tvb checking that it is
+* valid UTF-8 consisting entirely of printable characters. (The characters
+* must be complete; if the portion ends in a partial sequence that could
+* begin a valid character, this returns FALSE.) The length may be -1 for
+* "all the way to the end of the tvbuff".
+* Returns TRUE if the portion is valid, printable UTF-8, FALSE otherwise.
+*
+* @see isprint_utf8_string()
+*/
+WS_DLL_PUBLIC gboolean tvb_utf_8_isprint(tvbuff_t *tvb, const gint offset,
+	const gint length);
+
 /**
 * Given a tvbuff, an offset into the tvbuff, and a length that starts
 * at that offset (which may be -1 for "all the way to the end of the
--- a/wsutil/str_util.c
+++ b/wsutil/str_util.c
@ -273,18 +273,24 @@ isprint_string(const gchar *str)

 /* Check if an entire UTF-8 string is printable. */
 gboolean
-isprint_utf8_string(const gchar *str, guint length)
+isprint_utf8_string(const gchar *str, const guint length)
 {
-    const char *c;
+    const gchar *strend = str + length;

-    if (!g_utf8_validate (str, length, NULL)) {
+    if (!g_utf8_validate_len(str, length, NULL)) {
        return FALSE;
    }

-    for (c = str; *c; c = g_utf8_next_char(c)) {
-        if (!g_unichar_isprint(g_utf8_get_char(c))) {
+    while (str < strend) {
+        /* g_unichar_isprint() returns FALSE for G_UNICODE_CONTROL |
+         * G_UNICODE_FORMAT | G_UNICODE_UNASSIGNED | G_UNICODE_SURROGATE.
+         * XXX: Could it be ok to have certain format characters, e.g.
+         * U+00AD SOFT HYPHEN? If so, format_text() should be changed too.
+         */
+        if (!g_unichar_isprint(g_utf8_get_char(str))) {
            return FALSE;
        }
+        str = g_utf8_next_char(str);
    }

    return TRUE;
--- a/wsutil/str_util.h
+++ b/wsutil/str_util.h
@ -114,14 +114,34 @@ gchar *ascii_strup_inplace(gchar *str);
 WS_DLL_PUBLIC
 gboolean isprint_string(const gchar *str);

-/** Check if an entire UTF-8 string consists of printable characters
+/** Given a not-necessarily-null-terminated string, expected to be in
+ *  UTF-8 but possibly containing invalid sequences (as it may have come
+ *  from packet data), and the length of the string, determine if the
+ *  string is valid UTF-8 consisting entirely of printable characters.
+ *
+ *  This means that it:
+ *
+ *   does not contain an illegal UTF-8 sequence (including overlong encodings,
+ *   the sequences reserved for UTF-16 surrogate halves, and the values for
+ *   code points above U+10FFFF that are no longer in Unicode);
+ *
+ *   does not contain a non-printable Unicode character such as control
+ *   characters (including internal NUL bytes);
+ *
+ *   does not end in a partial sequence that could begin a valid character;
+ *
+ *   does not start with a partial sequence that could end a valid character;
+ *
+ * and thus guarantees that the result of format_text() would be the same as
+ * that of wmem_strndup() with the same parameters.
 *
 * @param str    The string to be checked
 * @param length The number of bytes to validate
- * @return       TRUE if the entire string is printable, otherwise FALSE
+ * @return       TRUE if the entire string is valid and printable UTF-8,
+ *               otherwise FALSE
 */
 WS_DLL_PUBLIC
-gboolean isprint_utf8_string(const gchar *str, guint length);
+gboolean isprint_utf8_string(const gchar *str, const guint length);

 /** Check if an entire string consists of digits
 *