epan: Add ENC_BOM modifier for UTF-16, UCS-2, UCS-4

Add ENC_BOM to the list of bitflag modifiers, and use it with UTF-16, UCS-2, and UCS-4 (UTF-32). If set, this means that the first 2 (or 4) octets, if present, are checked to see if they are a Little-Endian or Big-Endian BYTE ORDER MARK (U+FEFF "ZERO WIDTH NO-BREAK SPACE"). If so, those octets are skipped and the encoding is set to Little-Endian or Big-Endian depending on the endianness of the BOM. If the BOM is absent, the passed-in endianness flag is used normally. Related to #17991
2023-06-07 08:07:44 -04:00 · 2023-06-07 08:07:44 -04:00 · 1744ce4a0f
parent bda350d8fd
commit 1744ce4a0f
7 changed files with 98 additions and 34 deletions
--- a/doc/README.dissector
+++ b/doc/README.dissector
@ -1500,12 +1500,15 @@ order.
 For string fields, the encoding specifies the character set used for the
 string and the way individual code points in that character set are
 encoded. For FT_UINT_STRING fields, the byte order of the count must be
-specified; for UCS-2 and UTF-16, the byte order of the encoding must be
-specified (for counted UCS-2 and UTF-16 strings, the byte order of the
-count and the 16-bit values in the string must be the same). In other
-cases the string encoding has no endianness or the endianness is implicitly
-specified and nothing should be used. The character encodings that are
-currently supported are:
+specified. For UTF-16, UCS-2, and UCS-4, the byte order of the encoding
+must be specified, and optionally ENC_BOM can also be indicated to detect
+an initial BYTE ORDER MARK (the specified value is used if the field does
+not begin with a BOM.) For counted UTF-16, UCS-2, and UCS-4 strings, the
+byte order of the count and the characters in the string must be the same,
+unless a BOM overrides the value for the characters. In other cases the
+string encoding has no endianness or the endianness is implicitly specified
+and nothing should be used. The character encodings that are currently
+supported are:

    ENC_ASCII - ASCII (currently treated as UTF-8; in the future,
        all bytes with the 8th bit set will be treated as
@ -1514,7 +1517,7 @@ currently supported are:
    ENC_UTF_16 - UTF-16-encoded Unicode, with surrogate pairs
    ENC_UCS_2 - UCS-2-encoded subset of Unicode, with no surrogate pairs
        and thus no code points above 0xFFFF
-    ENC_UCS_4 - UCS-4-encoded Unicode
+    ENC_UCS_4 - UCS-4-encoded Unicode (aka UTF-32)
    ENC_WINDOWS_1250 - Windows-1250 code page
    ENC_WINDOWS_1251 - Windows-1251 code page
    ENC_WINDOWS_1252 - Windows-1252 code page
--- a/epan/charsets.c
+++ b/epan/charsets.c
@ -28,6 +28,9 @@
 */
 #define UNREPL UNICODE_REPLACEMENT_CHARACTER

+/* U+FEFF ZERO WIDTH NO-BREAK SPACE, commonly known as the BOM */
+#define BYTE_ORDER_MARK 0xFEFF
+
 /*
 * Wikipedia's "Character encoding" template, giving a pile of character
 * encodings and Wikipedia pages for them:
@ -699,23 +702,36 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
 * Unicode, and return a pointer to a UTF-8 string, allocated with the
 * wmem scope.
 *
- * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
+ * possibly ORed with ENC_BOM.
 *
 * Specify length in bytes.
 */
 guint8 *
-get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
+get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
 {
    gunichar2      uchar;
-    gint           i;       /* Byte counter for string */
+    gint           i = 0;       /* Byte counter for string */
    wmem_strbuf_t *strbuf;

    strbuf = wmem_strbuf_new_sized(scope, length+1);

-    for(i = 0; i + 1 < length; i += 2) {
-        if (encoding == ENC_BIG_ENDIAN){
+    if (encoding & ENC_BOM && length >= 2) {
+        if (pletoh16(ptr) == BYTE_ORDER_MARK) {
+            encoding = ENC_LITTLE_ENDIAN;
+            i += 2;
+        } else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
+            encoding = ENC_BIG_ENDIAN;
+            i += 2;
+        }
+    }
+
+    encoding = encoding & ENC_LITTLE_ENDIAN;
+
+    for(; i + 1 < length; i += 2) {
+        if (encoding == ENC_BIG_ENDIAN) {
            uchar = pntoh16(ptr + i);
-        }else{
+        } else {
            uchar = pletoh16(ptr + i);
        }
        wmem_strbuf_append_unichar_validated(strbuf, uchar);
@ -738,21 +754,34 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 *
 * See RFC 2781 section 2.2.
 *
- * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
+ * possibly ORed with ENC_BOM.
 *
 * Specify length in bytes.
 */
 guint8 *
-get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
+get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
 {
    wmem_strbuf_t *strbuf;
    gunichar2      uchar2, lead_surrogate;
    gunichar       uchar;
-    gint           i;       /* Byte counter for string */
+    gint           i = 0;       /* Byte counter for string */

    strbuf = wmem_strbuf_new_sized(scope, length+1);

-    for(i = 0; i + 1 < length; i += 2) {
+    if (encoding & ENC_BOM && length >= 2) {
+        if (pletoh16(ptr) == BYTE_ORDER_MARK) {
+            encoding = ENC_LITTLE_ENDIAN;
+            i += 2;
+        } else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
+            encoding = ENC_BIG_ENDIAN;
+            i += 2;
+        }
+    }
+
+    encoding = encoding & ENC_LITTLE_ENDIAN;
+
+    for(; i + 1 < length; i += 2) {
        if (encoding == ENC_BIG_ENDIAN)
            uchar2 = pntoh16(ptr + i);
        else
@ -831,15 +860,27 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 * Specify length in bytes
 */
 guint8 *
-get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
+get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
 {
    gunichar       uchar;
-    gint           i;       /* Byte counter for string */
+    gint           i = 0;       /* Byte counter for string */
    wmem_strbuf_t *strbuf;

    strbuf = wmem_strbuf_new_sized(scope, length+1);

-    for(i = 0; i + 3 < length; i += 4) {
+    if (encoding & ENC_BOM && length >= 4) {
+        if (pletoh32(ptr) == BYTE_ORDER_MARK) {
+            encoding = ENC_LITTLE_ENDIAN;
+            i += 4;
+        } else if (pntoh32(ptr) == BYTE_ORDER_MARK) {
+            encoding = ENC_BIG_ENDIAN;
+            i += 4;
+        }
+    }
+
+    encoding = encoding & ENC_LITTLE_ENDIAN;
+
+    for(; i + 3 < length; i += 4) {
        if (encoding == ENC_BIG_ENDIAN)
            uchar = pntoh32(ptr + i);
        else
--- a/epan/charsets.h
+++ b/epan/charsets.h
@ -128,12 +128,13 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
 * Unicode, and return a pointer to a UTF-8 string, allocated with the
 * wmem scope.
 *
- * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
+ * possibly ORed with ENC_BOM.
 *
 * Specify length in bytes.
 */
 WS_DLL_PUBLIC guint8 *
-get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
+get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);

 /*
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
@ -142,24 +143,26 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 *
 * See RFC 2781 section 2.2.
 *
- * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
+ * possibly ORed with ENC_BOM.
 *
 * Specify length in bytes.
 */
 WS_DLL_PUBLIC guint8 *
-get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
+get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);

 /*
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
 * referred to by the pointer and length as a UCS-4 encoded string, and
 * return a pointer to a UTF-8 string, allocated with the wmem scope.
 *
- * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
+ * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
+ * possibly ORed with ENC_BOM.
 *
- * Specify length in bytes
+ * Specify length in bytes.
 */
 WS_DLL_PUBLIC guint8 *
-get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
+get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);

 WS_DLL_PUBLIC guint8 *
 get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const guint8 *ptr,
--- a/epan/iana_charsets.h
+++ b/epan/iana_charsets.h
@ -187,7 +187,7 @@ WS_DLL_PUBLIC value_string_ext mibenum_vals_character_sets_ext;
    ZZZ(0, YYY(XXX, IANA_CS_UTF_7,                     1012, "UTF-7",               ENC_NA|_DEFAULT_WS_ENC)) \
    ZZZ(1, YYY(XXX, IANA_CS_UTF_16BE,                  1013, "UTF-16BE",            ENC_BIG_ENDIAN|ENC_UTF_16)) \
    ZZZ(1, YYY(XXX, IANA_CS_UTF_16LE,                  1014, "UTF-16LE",            ENC_LITTLE_ENDIAN|ENC_UTF_16)) \
-    ZZZ(1, YYY(XXX, IANA_CS_UTF_16,                    1015, "UTF-16",              ENC_LITTLE_ENDIAN|ENC_UTF_16)) \
+    ZZZ(1, YYY(XXX, IANA_CS_UTF_16,                    1015, "UTF-16",              ENC_LITTLE_ENDIAN|ENC_BOM|ENC_UTF_16)) \
    ZZZ(0, YYY(XXX, IANA_CS_CESU_8,                    1016, "CESU-8",              ENC_NA|_DEFAULT_WS_ENC)) \
    ZZZ(0, YYY(XXX, IANA_CS_UTF_32,                    1017, "UTF-32",              ENC_NA|_DEFAULT_WS_ENC)) \
    ZZZ(0, YYY(XXX, IANA_CS_UTF_32BE,                  1018, "UTF-32BE",            ENC_NA|_DEFAULT_WS_ENC)) \
@ -316,6 +316,12 @@ WS_DLL_PUBLIC value_string_ext mibenum_vals_character_sets_ext;
    ZZZ(0, YYY(XXX, IANA_CS_CP50220,                   2260, "CP50220",             ENC_NA|_DEFAULT_WS_ENC))
 /*  ZZZ(Mark,....., IANA_ENUM,                     IANA_VAL, IANA_NAME,             WIRESHARK_ENCODING */

+/* RFC 2781 suggests that 1015 "UTF-16" (UTF-16 with BOM) SHOULD be
+ * interpreted as ENC_BIG_ENDIAN if the BOM is missing, but in practice
+ * it's more common to see the encoding as Little Endian, especially if
+ * a BOM is expected.
+ */
+
 /* select all records */
 #define ICWE_SELECT_ALL(N, ...)   ICWE_SELECT_ALL_##N(__VA_ARGS__)
 #define ICWE_SELECT_ALL_0(...)    __VA_ARGS__
--- a/epan/introspection-enums.c
+++ b/epan/introspection-enums.c
@ -82,6 +82,7 @@ static ws_enum_t all_enums[] = {
    ENUM(ENC_BCD_ODD_NUM_DIG),
    ENUM(ENC_BCD_SKIP_FIRST),
    ENUM(ENC_BIG_ENDIAN),
+    ENUM(ENC_BOM),
    ENUM(ENC_CHARENCODING_MASK),
    ENUM(ENC_CP437),
    ENUM(ENC_CP855),
--- a/epan/proto.h
+++ b/epan/proto.h
@ -460,6 +460,16 @@ void proto_report_dissector_bug(const char *format, ...)
 */
 #define ENC_ZIGBEE               0x40000000

+/*
+ * This is a modifier for ENC_UTF_16, ENC_UCS_2, and ENC_UCS_4
+ * indicating that if the first two (or four, for UCS-4) octets
+ * are a big-endian or little-endian BOM, use that to determine
+ * the serialization order and ignore the ENC_LITTLE_ENDIAN or
+ * ENC_BIG_ENDIAN flag. This value can't collide with ENC_ZIGBEE
+ * because the two flags could be used simultaneously.
+ */
+#define ENC_BOM                  0x20000000
+
 /*
 * For cases where either native type or string encodings could both be
 * valid arguments, we need something to distinguish which one is being
--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@ -3137,17 +3137,17 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,

 	case ENC_UTF_16:
 		strptr = tvb_get_utf_16_string(scope, tvb, offset, length,
-		    encoding & ENC_LITTLE_ENDIAN);
+		    encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
 		break;

 	case ENC_UCS_2:
 		strptr = tvb_get_ucs_2_string(scope, tvb, offset, length,
-		    encoding & ENC_LITTLE_ENDIAN);
+		    encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
 		break;

 	case ENC_UCS_4:
 		strptr = tvb_get_ucs_4_string(scope, tvb, offset, length,
-		    encoding & ENC_LITTLE_ENDIAN);
+		    encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
 		break;

 	case ENC_ISO_8859_1:
@ -3633,17 +3633,17 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g

 	case ENC_UTF_16:
 		strptr = tvb_get_utf_16_stringz(scope, tvb, offset, lengthp,
-		    encoding & ENC_LITTLE_ENDIAN);
+		    encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
 		break;

 	case ENC_UCS_2:
 		strptr = tvb_get_ucs_2_stringz(scope, tvb, offset, lengthp,
-		    encoding & ENC_LITTLE_ENDIAN);
+		    encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
 		break;

 	case ENC_UCS_4:
 		strptr = tvb_get_ucs_4_stringz(scope, tvb, offset, lengthp,
-		    encoding & ENC_LITTLE_ENDIAN);
+		    encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
 		break;

 	case ENC_ISO_8859_1: