epan: Add ENC_BOM modifier for UTF-16, UCS-2, UCS-4

Add ENC_BOM to the list of bitflag modifiers, and use it with
UTF-16, UCS-2, and UCS-4 (UTF-32). If set, this means that the
first 2 (or 4) octets, if present, are checked to see if they are a
Little-Endian or Big-Endian BYTE ORDER MARK (U+FEFF "ZERO WIDTH NO-BREAK
SPACE"). If so, those octets are skipped and the encoding is set to
Little-Endian or Big-Endian depending on endianness of the BOM.

If the BOM is absent, the passed in Endianness flag is used normally.

Related to #17991
This commit is contained in:
John Thacker 2023-06-07 08:07:44 -04:00
parent bda350d8fd
commit 1744ce4a0f
7 changed files with 98 additions and 34 deletions

View File

@ -1500,12 +1500,15 @@ order.
For string fields, the encoding specifies the character set used for the
string and the way individual code points in that character set are
encoded. For FT_UINT_STRING fields, the byte order of the count must be
specified; for UCS-2 and UTF-16, the byte order of the encoding must be
specified (for counted UCS-2 and UTF-16 strings, the byte order of the
count and the 16-bit values in the string must be the same). In other
cases the string encoding has no endianness or the endianness is implicitly
specified and nothing should be used. The character encodings that are
currently supported are:
specified. For UTF-16, UCS-2, and UCS-4, the byte order of the encoding
must be specified, and optionally ENC_BOM can also be indicated to detect
an initial BYTE ORDER MARK (the specified value is used if the field does
not begin with a BOM). For counted UTF-16, UCS-2, and UCS-4 strings, the
byte order of the count and the characters in the string must be the same,
unless a BOM overrides the value for the characters. In other cases the
string encoding has no endianness or the endianness is implicitly specified
and nothing should be used. The character encodings that are currently
supported are:
ENC_ASCII - ASCII (currently treated as UTF-8; in the future,
all bytes with the 8th bit set will be treated as
@ -1514,7 +1517,7 @@ currently supported are:
ENC_UTF_16 - UTF-16-encoded Unicode, with surrogate pairs
ENC_UCS_2 - UCS-2-encoded subset of Unicode, with no surrogate pairs
and thus no code points above 0xFFFF
ENC_UCS_4 - UCS-4-encoded Unicode
ENC_UCS_4 - UCS-4-encoded Unicode (aka UTF-32)
ENC_WINDOWS_1250 - Windows-1250 code page
ENC_WINDOWS_1251 - Windows-1251 code page
ENC_WINDOWS_1252 - Windows-1252 code page

View File

@ -28,6 +28,9 @@
*/
#define UNREPL UNICODE_REPLACEMENT_CHARACTER
/* U+FEFF ZERO WIDTH NO-BREAK SPACE, also known informally as BOM */
#define BYTE_ORDER_MARK 0xFEFF
/*
* Wikipedia's "Character encoding" template, giving a pile of character
* encodings and Wikipedia pages for them:
@ -699,23 +702,36 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
* Unicode, and return a pointer to a UTF-8 string, allocated with the
* wmem scope.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
* possibly ORed with ENC_BOM.
*
* Specify length in bytes.
*/
guint8 *
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
{
gunichar2 uchar;
gint i; /* Byte counter for string */
gint i = 0; /* Byte counter for string */
wmem_strbuf_t *strbuf;
strbuf = wmem_strbuf_new_sized(scope, length+1);
for(i = 0; i + 1 < length; i += 2) {
if (encoding == ENC_BIG_ENDIAN){
if (encoding & ENC_BOM && length >= 2) {
if (pletoh16(ptr) == BYTE_ORDER_MARK) {
encoding = ENC_LITTLE_ENDIAN;
i += 2;
} else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
encoding = ENC_BIG_ENDIAN;
i += 2;
}
}
encoding = encoding & ENC_LITTLE_ENDIAN;
for(; i + 1 < length; i += 2) {
if (encoding == ENC_BIG_ENDIAN) {
uchar = pntoh16(ptr + i);
}else{
} else {
uchar = pletoh16(ptr + i);
}
wmem_strbuf_append_unichar_validated(strbuf, uchar);
@ -738,21 +754,34 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
*
* See RFC 2781 section 2.2.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
* possibly ORed with ENC_BOM.
*
* Specify length in bytes.
*/
guint8 *
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
{
wmem_strbuf_t *strbuf;
gunichar2 uchar2, lead_surrogate;
gunichar uchar;
gint i; /* Byte counter for string */
gint i = 0; /* Byte counter for string */
strbuf = wmem_strbuf_new_sized(scope, length+1);
for(i = 0; i + 1 < length; i += 2) {
if (encoding & ENC_BOM && length >= 2) {
if (pletoh16(ptr) == BYTE_ORDER_MARK) {
encoding = ENC_LITTLE_ENDIAN;
i += 2;
} else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
encoding = ENC_BIG_ENDIAN;
i += 2;
}
}
encoding = encoding & ENC_LITTLE_ENDIAN;
for(; i + 1 < length; i += 2) {
if (encoding == ENC_BIG_ENDIAN)
uchar2 = pntoh16(ptr + i);
else
@ -831,15 +860,27 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Specify length in bytes
*/
guint8 *
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
{
gunichar uchar;
gint i; /* Byte counter for string */
gint i = 0; /* Byte counter for string */
wmem_strbuf_t *strbuf;
strbuf = wmem_strbuf_new_sized(scope, length+1);
for(i = 0; i + 3 < length; i += 4) {
if (encoding & ENC_BOM && length >= 4) {
if (pletoh32(ptr) == BYTE_ORDER_MARK) {
encoding = ENC_LITTLE_ENDIAN;
i += 4;
} else if (pntoh32(ptr) == BYTE_ORDER_MARK) {
encoding = ENC_BIG_ENDIAN;
i += 4;
}
}
encoding = encoding & ENC_LITTLE_ENDIAN;
for(; i + 3 < length; i += 4) {
if (encoding == ENC_BIG_ENDIAN)
uchar = pntoh32(ptr + i);
else

View File

@ -128,12 +128,13 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
* Unicode, and return a pointer to a UTF-8 string, allocated with the
* wmem scope.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
* possibly ORed with ENC_BOM.
*
* Specify length in bytes.
*/
WS_DLL_PUBLIC guint8 *
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
@ -142,24 +143,26 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
*
* See RFC 2781 section 2.2.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
* possibly ORed with ENC_BOM.
*
* Specify length in bytes.
*/
WS_DLL_PUBLIC guint8 *
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as a UCS-4 encoded string, and
* return a pointer to a UTF-8 string, allocated with the wmem scope.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
* possibly ORed with ENC_BOM.
*
* Specify length in bytes
* Specify length in bytes.
*/
WS_DLL_PUBLIC guint8 *
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);
WS_DLL_PUBLIC guint8 *
get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const guint8 *ptr,

View File

@ -187,7 +187,7 @@ WS_DLL_PUBLIC value_string_ext mibenum_vals_character_sets_ext;
ZZZ(0, YYY(XXX, IANA_CS_UTF_7, 1012, "UTF-7", ENC_NA|_DEFAULT_WS_ENC)) \
ZZZ(1, YYY(XXX, IANA_CS_UTF_16BE, 1013, "UTF-16BE", ENC_BIG_ENDIAN|ENC_UTF_16)) \
ZZZ(1, YYY(XXX, IANA_CS_UTF_16LE, 1014, "UTF-16LE", ENC_LITTLE_ENDIAN|ENC_UTF_16)) \
ZZZ(1, YYY(XXX, IANA_CS_UTF_16, 1015, "UTF-16", ENC_LITTLE_ENDIAN|ENC_UTF_16)) \
ZZZ(1, YYY(XXX, IANA_CS_UTF_16, 1015, "UTF-16", ENC_LITTLE_ENDIAN|ENC_BOM|ENC_UTF_16)) \
ZZZ(0, YYY(XXX, IANA_CS_CESU_8, 1016, "CESU-8", ENC_NA|_DEFAULT_WS_ENC)) \
ZZZ(0, YYY(XXX, IANA_CS_UTF_32, 1017, "UTF-32", ENC_NA|_DEFAULT_WS_ENC)) \
ZZZ(0, YYY(XXX, IANA_CS_UTF_32BE, 1018, "UTF-32BE", ENC_NA|_DEFAULT_WS_ENC)) \
@ -316,6 +316,12 @@ WS_DLL_PUBLIC value_string_ext mibenum_vals_character_sets_ext;
ZZZ(0, YYY(XXX, IANA_CS_CP50220, 2260, "CP50220", ENC_NA|_DEFAULT_WS_ENC))
/* ZZZ(Mark,....., IANA_ENUM, IANA_VAL, IANA_NAME, WIRESHARK_ENCODING */
/* RFC 2781 suggests that 1015 "UTF-16" (UTF-16 with BOM) SHOULD be
* interpreted as ENC_BIG_ENDIAN if the BOM is missing, but in practice
* it's more common to see the encoding as Little Endian, especially if
* a BOM is expected.
*/
/* select all records */
#define ICWE_SELECT_ALL(N, ...) ICWE_SELECT_ALL_##N(__VA_ARGS__)
#define ICWE_SELECT_ALL_0(...) __VA_ARGS__

View File

@ -82,6 +82,7 @@ static ws_enum_t all_enums[] = {
ENUM(ENC_BCD_ODD_NUM_DIG),
ENUM(ENC_BCD_SKIP_FIRST),
ENUM(ENC_BIG_ENDIAN),
ENUM(ENC_BOM),
ENUM(ENC_CHARENCODING_MASK),
ENUM(ENC_CP437),
ENUM(ENC_CP855),

View File

@ -460,6 +460,16 @@ void proto_report_dissector_bug(const char *format, ...)
*/
#define ENC_ZIGBEE 0x40000000
/*
* This is a modifier for ENC_UTF_16, ENC_UCS_2, and ENC_UCS_4
* indicating that if the first two (or four, for UCS-4) octets
* are a big-endian or little-endian BOM, use that to determine
* the serialization order and ignore the ENC_LITTLE_ENDIAN or
* ENC_BIG_ENDIAN flag. Its value must not collide with ENC_ZIGBEE
* because the two flags could be used simultaneously.
*/
#define ENC_BOM 0x20000000
/*
* For cases where either native type or string encodings could both be
* valid arguments, we need something to distinguish which one is being

View File

@ -3137,17 +3137,17 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
case ENC_UTF_16:
strptr = tvb_get_utf_16_string(scope, tvb, offset, length,
encoding & ENC_LITTLE_ENDIAN);
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
break;
case ENC_UCS_2:
strptr = tvb_get_ucs_2_string(scope, tvb, offset, length,
encoding & ENC_LITTLE_ENDIAN);
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
break;
case ENC_UCS_4:
strptr = tvb_get_ucs_4_string(scope, tvb, offset, length,
encoding & ENC_LITTLE_ENDIAN);
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
break;
case ENC_ISO_8859_1:
@ -3633,17 +3633,17 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
case ENC_UTF_16:
strptr = tvb_get_utf_16_stringz(scope, tvb, offset, lengthp,
encoding & ENC_LITTLE_ENDIAN);
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
break;
case ENC_UCS_2:
strptr = tvb_get_ucs_2_stringz(scope, tvb, offset, lengthp,
encoding & ENC_LITTLE_ENDIAN);
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
break;
case ENC_UCS_4:
strptr = tvb_get_ucs_4_stringz(scope, tvb, offset, lengthp,
encoding & ENC_LITTLE_ENDIAN);
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
break;
case ENC_ISO_8859_1: