epan: Add ENC_BOM modifier for UTF-16, UCS-2, UCS-4
Add ENC_BOM to the list of bitflag modifiers, and use it with UTF-16, UCS-2, and UCS-4 (UTF-32). If set, this means that the first 2 (or 4) octets, if present, are checked to see if they are a BYTE ORDER MARK (U+FEFF "ZERO WIDTH NO-BREAK SPACE") in either byte order. If so, those octets are skipped and the encoding is set to Little-Endian or Big-Endian depending on the endianness of the BOM. If the BOM is absent, the passed-in endianness flag is used normally. Related to #17991
This commit is contained in:
parent
bda350d8fd
commit
1744ce4a0f
|
@ -1500,12 +1500,15 @@ order.
|
|||
For string fields, the encoding specifies the character set used for the
|
||||
string and the way individual code points in that character set are
|
||||
encoded. For FT_UINT_STRING fields, the byte order of the count must be
|
||||
specified; for UCS-2 and UTF-16, the byte order of the encoding must be
|
||||
specified (for counted UCS-2 and UTF-16 strings, the byte order of the
|
||||
count and the 16-bit values in the string must be the same). In other
|
||||
cases the string encoding has no endianness or the endianness is implicitly
|
||||
specified and nothing should be used. The character encodings that are
|
||||
currently supported are:
|
||||
specified. For UTF-16, UCS-2, and UCS-4, the byte order of the encoding
|
||||
must be specified, and optionally ENC_BOM can also be indicated to detect
|
||||
an initial BYTE ORDER MARK (the specified value is used if the field does
|
||||
not begin with a BOM). For counted UTF-16, UCS-2, and UCS-4 strings, the
|
||||
byte order of the count and the characters in the string must be the same,
|
||||
unless a BOM overrides the value for the characters. In other cases the
|
||||
string encoding has no endianness or the endianness is implicitly specified
|
||||
and nothing should be used. The character encodings that are currently
|
||||
supported are:
|
||||
|
||||
ENC_ASCII - ASCII (currently treated as UTF-8; in the future,
|
||||
all bytes with the 8th bit set will be treated as
|
||||
|
@ -1514,7 +1517,7 @@ currently supported are:
|
|||
ENC_UTF_16 - UTF-16-encoded Unicode, with surrogate pairs
|
||||
ENC_UCS_2 - UCS-2-encoded subset of Unicode, with no surrogate pairs
|
||||
and thus no code points above 0xFFFF
|
||||
ENC_UCS_4 - UCS-4-encoded Unicode
|
||||
ENC_UCS_4 - UCS-4-encoded Unicode (aka UTF-32)
|
||||
ENC_WINDOWS_1250 - Windows-1250 code page
|
||||
ENC_WINDOWS_1251 - Windows-1251 code page
|
||||
ENC_WINDOWS_1252 - Windows-1252 code page
|
||||
|
|
|
@ -28,6 +28,9 @@
|
|||
*/
|
||||
#define UNREPL UNICODE_REPLACEMENT_CHARACTER
|
||||
|
||||
/* U+FEFF ZERO WIDTH NO-BREAK SPACE, commonly used as the byte order mark (BOM) */
|
||||
#define BYTE_ORDER_MARK 0xFEFF
|
||||
|
||||
/*
|
||||
* Wikipedia's "Character encoding" template, giving a pile of character
|
||||
* encodings and Wikipedia pages for them:
|
||||
|
@ -699,23 +702,36 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
|
|||
* Unicode, and return a pointer to a UTF-8 string, allocated with the
|
||||
* wmem scope.
|
||||
*
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
|
||||
* possibly ORed with ENC_BOM.
|
||||
*
|
||||
* Specify length in bytes.
|
||||
*/
|
||||
guint8 *
|
||||
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
|
||||
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
|
||||
{
|
||||
gunichar2 uchar;
|
||||
gint i; /* Byte counter for string */
|
||||
gint i = 0; /* Byte counter for string */
|
||||
wmem_strbuf_t *strbuf;
|
||||
|
||||
strbuf = wmem_strbuf_new_sized(scope, length+1);
|
||||
|
||||
for(i = 0; i + 1 < length; i += 2) {
|
||||
if (encoding == ENC_BIG_ENDIAN){
|
||||
if (encoding & ENC_BOM && length >= 2) {
|
||||
if (pletoh16(ptr) == BYTE_ORDER_MARK) {
|
||||
encoding = ENC_LITTLE_ENDIAN;
|
||||
i += 2;
|
||||
} else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
|
||||
encoding = ENC_BIG_ENDIAN;
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
|
||||
encoding = encoding & ENC_LITTLE_ENDIAN;
|
||||
|
||||
for(; i + 1 < length; i += 2) {
|
||||
if (encoding == ENC_BIG_ENDIAN) {
|
||||
uchar = pntoh16(ptr + i);
|
||||
}else{
|
||||
} else {
|
||||
uchar = pletoh16(ptr + i);
|
||||
}
|
||||
wmem_strbuf_append_unichar_validated(strbuf, uchar);
|
||||
|
@ -738,21 +754,34 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
*
|
||||
* See RFC 2781 section 2.2.
|
||||
*
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
|
||||
* possibly ORed with ENC_BOM.
|
||||
*
|
||||
* Specify length in bytes.
|
||||
*/
|
||||
guint8 *
|
||||
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
|
||||
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
|
||||
{
|
||||
wmem_strbuf_t *strbuf;
|
||||
gunichar2 uchar2, lead_surrogate;
|
||||
gunichar uchar;
|
||||
gint i; /* Byte counter for string */
|
||||
gint i = 0; /* Byte counter for string */
|
||||
|
||||
strbuf = wmem_strbuf_new_sized(scope, length+1);
|
||||
|
||||
for(i = 0; i + 1 < length; i += 2) {
|
||||
if (encoding & ENC_BOM && length >= 2) {
|
||||
if (pletoh16(ptr) == BYTE_ORDER_MARK) {
|
||||
encoding = ENC_LITTLE_ENDIAN;
|
||||
i += 2;
|
||||
} else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
|
||||
encoding = ENC_BIG_ENDIAN;
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
|
||||
encoding = encoding & ENC_LITTLE_ENDIAN;
|
||||
|
||||
for(; i + 1 < length; i += 2) {
|
||||
if (encoding == ENC_BIG_ENDIAN)
|
||||
uchar2 = pntoh16(ptr + i);
|
||||
else
|
||||
|
@ -831,15 +860,27 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
* Specify length in bytes
|
||||
*/
|
||||
guint8 *
|
||||
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
|
||||
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding)
|
||||
{
|
||||
gunichar uchar;
|
||||
gint i; /* Byte counter for string */
|
||||
gint i = 0; /* Byte counter for string */
|
||||
wmem_strbuf_t *strbuf;
|
||||
|
||||
strbuf = wmem_strbuf_new_sized(scope, length+1);
|
||||
|
||||
for(i = 0; i + 3 < length; i += 4) {
|
||||
if (encoding & ENC_BOM && length >= 4) {
|
||||
if (pletoh32(ptr) == BYTE_ORDER_MARK) {
|
||||
encoding = ENC_LITTLE_ENDIAN;
|
||||
i += 4;
|
||||
} else if (pntoh32(ptr) == BYTE_ORDER_MARK) {
|
||||
encoding = ENC_BIG_ENDIAN;
|
||||
i += 4;
|
||||
}
|
||||
}
|
||||
|
||||
encoding = encoding & ENC_LITTLE_ENDIAN;
|
||||
|
||||
for(; i + 3 < length; i += 4) {
|
||||
if (encoding == ENC_BIG_ENDIAN)
|
||||
uchar = pntoh32(ptr + i);
|
||||
else
|
||||
|
|
|
@ -128,12 +128,13 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
|
|||
* Unicode, and return a pointer to a UTF-8 string, allocated with the
|
||||
* wmem scope.
|
||||
*
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
|
||||
* possibly ORed with ENC_BOM.
|
||||
*
|
||||
* Specify length in bytes.
|
||||
*/
|
||||
WS_DLL_PUBLIC guint8 *
|
||||
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
||||
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);
|
||||
|
||||
/*
|
||||
* Given a wmem scope, a pointer, and a length, treat the string of bytes
|
||||
|
@ -142,24 +143,26 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
*
|
||||
* See RFC 2781 section 2.2.
|
||||
*
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
|
||||
* possibly ORed with ENC_BOM.
|
||||
*
|
||||
* Specify length in bytes.
|
||||
*/
|
||||
WS_DLL_PUBLIC guint8 *
|
||||
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
||||
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);
|
||||
|
||||
/*
|
||||
* Given a wmem scope, a pointer, and a length, treat the string of bytes
|
||||
* referred to by the pointer and length as a UCS-4 encoded string, and
|
||||
* return a pointer to a UTF-8 string, allocated with the wmem scope.
|
||||
*
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
|
||||
* possibly ORed with ENC_BOM.
|
||||
*
|
||||
* Specify length in bytes
|
||||
* Specify length in bytes.
|
||||
*/
|
||||
WS_DLL_PUBLIC guint8 *
|
||||
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
||||
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, guint encoding);
|
||||
|
||||
WS_DLL_PUBLIC guint8 *
|
||||
get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const guint8 *ptr,
|
||||
|
|
|
@ -187,7 +187,7 @@ WS_DLL_PUBLIC value_string_ext mibenum_vals_character_sets_ext;
|
|||
ZZZ(0, YYY(XXX, IANA_CS_UTF_7, 1012, "UTF-7", ENC_NA|_DEFAULT_WS_ENC)) \
|
||||
ZZZ(1, YYY(XXX, IANA_CS_UTF_16BE, 1013, "UTF-16BE", ENC_BIG_ENDIAN|ENC_UTF_16)) \
|
||||
ZZZ(1, YYY(XXX, IANA_CS_UTF_16LE, 1014, "UTF-16LE", ENC_LITTLE_ENDIAN|ENC_UTF_16)) \
|
||||
ZZZ(1, YYY(XXX, IANA_CS_UTF_16, 1015, "UTF-16", ENC_LITTLE_ENDIAN|ENC_UTF_16)) \
|
||||
ZZZ(1, YYY(XXX, IANA_CS_UTF_16, 1015, "UTF-16", ENC_LITTLE_ENDIAN|ENC_BOM|ENC_UTF_16)) \
|
||||
ZZZ(0, YYY(XXX, IANA_CS_CESU_8, 1016, "CESU-8", ENC_NA|_DEFAULT_WS_ENC)) \
|
||||
ZZZ(0, YYY(XXX, IANA_CS_UTF_32, 1017, "UTF-32", ENC_NA|_DEFAULT_WS_ENC)) \
|
||||
ZZZ(0, YYY(XXX, IANA_CS_UTF_32BE, 1018, "UTF-32BE", ENC_NA|_DEFAULT_WS_ENC)) \
|
||||
|
@ -316,6 +316,12 @@ WS_DLL_PUBLIC value_string_ext mibenum_vals_character_sets_ext;
|
|||
ZZZ(0, YYY(XXX, IANA_CS_CP50220, 2260, "CP50220", ENC_NA|_DEFAULT_WS_ENC))
|
||||
/* ZZZ(Mark,....., IANA_ENUM, IANA_VAL, IANA_NAME, WIRESHARK_ENCODING */
|
||||
|
||||
/* RFC 2781 suggests that 1015 "UTF-16" (UTF-16 with BOM) SHOULD be
|
||||
* interpreted as ENC_BIG_ENDIAN if the BOM is missing, but in practice
|
||||
* it's more common to see the encoding as Little Endian, especially if
|
||||
* a BOM is expected.
|
||||
*/
|
||||
|
||||
/* select all records */
|
||||
#define ICWE_SELECT_ALL(N, ...) ICWE_SELECT_ALL_##N(__VA_ARGS__)
|
||||
#define ICWE_SELECT_ALL_0(...) __VA_ARGS__
|
||||
|
|
|
@ -82,6 +82,7 @@ static ws_enum_t all_enums[] = {
|
|||
ENUM(ENC_BCD_ODD_NUM_DIG),
|
||||
ENUM(ENC_BCD_SKIP_FIRST),
|
||||
ENUM(ENC_BIG_ENDIAN),
|
||||
ENUM(ENC_BOM),
|
||||
ENUM(ENC_CHARENCODING_MASK),
|
||||
ENUM(ENC_CP437),
|
||||
ENUM(ENC_CP855),
|
||||
|
|
10
epan/proto.h
10
epan/proto.h
|
@ -460,6 +460,16 @@ void proto_report_dissector_bug(const char *format, ...)
|
|||
*/
|
||||
#define ENC_ZIGBEE 0x40000000
|
||||
|
||||
/*
|
||||
* This is a modifier for ENC_UTF_16, ENC_UCS_2, and ENC_UCS_4
|
||||
* indicating that if the first two (or four, for UCS-4) octets
|
||||
* are a big-endian or little-endian BOM, use that to determine
|
||||
* the serialization order and ignore the ENC_LITTLE_ENDIAN or
|
||||
* ENC_BIG_ENDIAN flag. This can't collide with ENC_ZIGBEE because
|
||||
* both flags could be used simultaneously.
|
||||
*/
|
||||
#define ENC_BOM 0x20000000
|
||||
|
||||
/*
|
||||
* For cases where either native type or string encodings could both be
|
||||
* valid arguments, we need something to distinguish which one is being
|
||||
|
|
|
@ -3137,17 +3137,17 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
|
|||
|
||||
case ENC_UTF_16:
|
||||
strptr = tvb_get_utf_16_string(scope, tvb, offset, length,
|
||||
encoding & ENC_LITTLE_ENDIAN);
|
||||
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
|
||||
break;
|
||||
|
||||
case ENC_UCS_2:
|
||||
strptr = tvb_get_ucs_2_string(scope, tvb, offset, length,
|
||||
encoding & ENC_LITTLE_ENDIAN);
|
||||
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
|
||||
break;
|
||||
|
||||
case ENC_UCS_4:
|
||||
strptr = tvb_get_ucs_4_string(scope, tvb, offset, length,
|
||||
encoding & ENC_LITTLE_ENDIAN);
|
||||
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
|
||||
break;
|
||||
|
||||
case ENC_ISO_8859_1:
|
||||
|
@ -3633,17 +3633,17 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
|
|||
|
||||
case ENC_UTF_16:
|
||||
strptr = tvb_get_utf_16_stringz(scope, tvb, offset, lengthp,
|
||||
encoding & ENC_LITTLE_ENDIAN);
|
||||
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
|
||||
break;
|
||||
|
||||
case ENC_UCS_2:
|
||||
strptr = tvb_get_ucs_2_stringz(scope, tvb, offset, lengthp,
|
||||
encoding & ENC_LITTLE_ENDIAN);
|
||||
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
|
||||
break;
|
||||
|
||||
case ENC_UCS_4:
|
||||
strptr = tvb_get_ucs_4_stringz(scope, tvb, offset, lengthp,
|
||||
encoding & ENC_LITTLE_ENDIAN);
|
||||
encoding & (ENC_LITTLE_ENDIAN|ENC_BOM));
|
||||
break;
|
||||
|
||||
case ENC_ISO_8859_1:
|
||||
|
|
Loading…
Reference in New Issue