forked from osmocom/wireshark
Add ENC_UTF_16 and ENC_UCS_2. Note that UTF-16 and UCS-2 are not the
same, and that the routines to get "Unicode" strings are really doing UCS-2 (and not doing anything about code values that aren't valid in UCS-2 strings). Have tvb_get_ephemeral_string_enc() separate cases for ASCII and UTF-8, even though they're *currently* treated the same. For FT_UINT_STRING, treat an encoding value of TRUE as meaning "little-endian ASCII"; pass all other encodings through to tvb_get_ephemeral_string_enc(). svn path=/trunk/; revision=42592
This commit is contained in:
parent
bb524c6b64
commit
7a87d7b6a3
20
epan/proto.c
20
epan/proto.c
|
@ -1484,20 +1484,16 @@ proto_tree_new_item(field_info *new_fi, proto_tree *tree,
|
|||
* NOTE: to support code written when
|
||||
* proto_tree_add_item() took a gboolean as its
|
||||
* last argument, with FALSE meaning "big-endian"
|
||||
* and TRUE meaning "little-endian", we treat any
|
||||
* non-zero value of "encoding", except for
|
||||
* ENC_EBCDIC|ENC_BIG_ENDIAN and
|
||||
* ENC_EBCDIC|ENC_LITTLE_ENDIAN as meaning
|
||||
* "little-endian UTF-8".
|
||||
* and TRUE meaning "little-endian", if the
|
||||
* encoding value is TRUE, treat that as
|
||||
* ASCII with a little-endian length.
|
||||
*
|
||||
* At some point in the future, we might
|
||||
* support more character encodings in the
|
||||
* encoding value as well.
|
||||
* This won't work for code that passes
|
||||
* arbitrary non-zero values; that code
|
||||
* will need to be fixed.
|
||||
*/
|
||||
if (encoding != 0 &&
|
||||
encoding != (ENC_EBCDIC|ENC_BIG_ENDIAN) &&
|
||||
encoding != (ENC_EBCDIC|ENC_LITTLE_ENDIAN))
|
||||
encoding = ENC_UTF_8|ENC_LITTLE_ENDIAN;
|
||||
if (encoding == TRUE)
|
||||
encoding = ENC_ASCII|ENC_LITTLE_ENDIAN;
|
||||
n = get_uint_value(tvb, start, length, encoding);
|
||||
proto_tree_set_string_tvb(new_fi, tvb, start + length, n,
|
||||
encoding);
|
||||
|
|
50
epan/proto.h
50
epan/proto.h
|
@ -245,41 +245,43 @@ typedef struct _protocol protocol_t;
|
|||
* was with FT_UINT_STRINGs, where we had FALSE for the string length
|
||||
* being big-endian and TRUE for it being little-endian.
|
||||
*
|
||||
* This is a quick and dirty hack for bug 6084, which doesn't require
|
||||
* support for multiple character encodings in FT_UINT_STRING. We
|
||||
* introduce ENC_UTF_8 and ENC_EBCDIC, with ENC_UTF_8 being 0 and
|
||||
* ENC_EBCDIC being the unlikely value 0x0EBCD000, and treat all values
|
||||
* other than ENC_EBCDIC as UTF-8. That way, no matter how a dissector
|
||||
* not converted to use ENC_ values calculates the last argument to
|
||||
* proto_tree_add_item(), it's unlikely to get EBCDIC.
|
||||
* We now have encoding values for the character encoding. The encoding
|
||||
* values are encoded in all but the top bit (which is the byte-order
|
||||
* bit, required for FT_UINT_STRING and for UCS-2 and UTF-16 strings)
|
||||
* and the bottom bit (which we ignore for now so that programs that
|
||||
* pass TRUE for the encoding just do ASCII).
|
||||
*
|
||||
* The value for ENC_EBCDIC is subject to change in a future release (or
|
||||
* to replacement with multiple values for different flavors of EBCDIC).
|
||||
* We don't yet process ASCII and UTF-8 differently. Ultimately, for
|
||||
* ASCII, all bytes with the 8th bit set should be mapped to some "this
|
||||
* is not a valid character" code point, as ENC_ASCII should mean "this
|
||||
* is ASCII, not some extended variant thereof". We should also map
|
||||
* 0x00 to that as well - null-terminated and null-padded strings
|
||||
* never have NULs in them, but counted strings might. (Either that,
|
||||
* or the values for strings should be counted, not null-terminated.)
|
||||
* For UTF-8, invalid UTF-8 sequences should be mapped to the same
|
||||
* code point.
|
||||
*
|
||||
* We currently add some additional encodings, for various ASCII-based
|
||||
* encodings, but use the same value as ENC_UTF_8, for now, so that we
|
||||
* can mark the appropriate encoding. Ultimately, we should handle
|
||||
* those encodings by mapping them to UTF-8 for display; for ASCII,
|
||||
* all bytes with the 8th bit set should be mapped to some "this is
|
||||
* not a valid character" glyph, as ENC_ASCII should mean "this is
|
||||
* ASCII, not some extended variant thereof". Perhaps we should also
|
||||
* map control characters to the Unicode glyphs showing the name of
|
||||
* the control character in small caps, diagonally. (Unfortunately,
|
||||
* those only exist for C0, not C1.)
|
||||
* We also don't process UTF-16 or UCS-2 differently - we don't
|
||||
* handle surrogate pairs, and don't handle 2-byte values that
|
||||
* aren't valid in UTF-16 or UCS-2 strings.
|
||||
*
|
||||
* For display, perhaps we should also map control characters to the
|
||||
* Unicode glyphs showing the name of the control character in small
|
||||
* caps, diagonally. (Unfortunately, those only exist for C0, not C1.)
|
||||
*/
|
||||
#define ENC_CHARENCODING_MASK 0x7FFFFFFE /* mask out byte-order bits */
|
||||
#define ENC_UTF_8 0x00000000
|
||||
#define ENC_ASCII 0x00000000
|
||||
#define ENC_EBCDIC 0x0EBCD1C0
|
||||
#define ENC_ASCII (0 << 1) /* shift up to avoid low-order bit */
|
||||
#define ENC_UTF_8 (1 << 1)
|
||||
#define ENC_UTF_16 (2 << 1)
|
||||
#define ENC_UCS_2 (3 << 1)
|
||||
#define ENC_EBCDIC (4 << 1)
|
||||
|
||||
/*
|
||||
* TODO:
|
||||
*
|
||||
* These could probably be used by existing code:
|
||||
*
|
||||
* ENC_UTF_16 - UTF-16
|
||||
* ENC_UCS_4 - UCS-4
|
||||
* ENC_UCS_2 - UCS-2 (not the same as UTF-16!)
|
||||
* ENC_ISO_8859_1 - ISO 8859/1
|
||||
* ENC_ISO_8859_8 - ISO 8859/8
|
||||
* - "IBM MS DBCS"
|
||||
|
|
|
@ -2329,6 +2329,7 @@ tvb_get_string(tvbuff_t *tvb, const gint offset, const gint length)
|
|||
|
||||
/*
|
||||
* Unicode (UTF-16) version of tvb_get_string()
|
||||
* XXX - this is UCS-2, not UTF-16, as it doesn't handle surrogate pairs
|
||||
*
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
|
||||
*
|
||||
|
@ -2400,12 +2401,74 @@ tvb_get_ephemeral_string_enc(tvbuff_t *tvb, const gint offset,
|
|||
tvb_ensure_bytes_exist(tvb, offset, length);
|
||||
|
||||
ptr = ensure_contiguous(tvb, offset, length);
|
||||
switch (encoding & ENC_CHARENCODING_MASK) {
|
||||
|
||||
case ENC_ASCII:
|
||||
default:
|
||||
/*
|
||||
* For now, we treat bogus values as meaning
|
||||
* "ASCII" rather than reporting an error,
|
||||
* for the benefit of old dissectors written
|
||||
* when the last argument to proto_tree_add_item()
|
||||
* was a gboolean for the byte order, not an
|
||||
* encoding value, and passed non-zero values
|
||||
* other than TRUE to mean "little-endian".
|
||||
*
|
||||
* XXX - should map all octets with the 8th bit
|
||||
* set to a "substitute" UTF-8 character.
|
||||
*/
|
||||
strbuf = ep_alloc(length + 1);
|
||||
if (length != 0) {
|
||||
memcpy(strbuf, ptr, length);
|
||||
}
|
||||
break;
|
||||
|
||||
case ENC_UTF_8:
|
||||
/*
|
||||
* XXX - should map all invalid UTF-8 sequences
|
||||
* to a "substitute" UTF-8 character.
|
||||
*/
|
||||
strbuf = ep_alloc(length + 1);
|
||||
if (length != 0) {
|
||||
memcpy(strbuf, ptr, length);
|
||||
}
|
||||
break;
|
||||
|
||||
case ENC_UTF_16:
|
||||
/*
|
||||
* XXX - needs to handle surrogate pairs and to map
|
||||
* invalid characters and sequences to a "substitute"
|
||||
* UTF-8 character.
|
||||
*/
|
||||
strbuf = tvb_get_ephemeral_unicode_string(tvb, offset, length,
|
||||
encoding & ENC_LITTLE_ENDIAN);
|
||||
break;
|
||||
|
||||
case ENC_UCS_2:
|
||||
/*
|
||||
* XXX - needs to map values that are not valid UCS-2
|
||||
* characters (such as, I think, values used as the
|
||||
* components of a UTF-16 surrogate pair) to a
|
||||
* "substitute" UTF-8 character.
|
||||
*/
|
||||
strbuf = tvb_get_ephemeral_unicode_string(tvb, offset, length,
|
||||
encoding & ENC_LITTLE_ENDIAN);
|
||||
break;
|
||||
|
||||
case ENC_EBCDIC:
|
||||
/*
|
||||
* XXX - do the copy and conversion in one pass.
|
||||
*
|
||||
* XXX - multiple "dialects" of EBCDIC?
|
||||
*/
|
||||
strbuf = ep_alloc(length + 1);
|
||||
if (length != 0) {
|
||||
memcpy(strbuf, ptr, length);
|
||||
}
|
||||
if ((encoding & ENC_CHARENCODING_MASK) == ENC_EBCDIC)
|
||||
EBCDIC_to_ASCII(strbuf, length);
|
||||
break;
|
||||
}
|
||||
|
||||
strbuf[length] = '\0';
|
||||
return strbuf;
|
||||
}
|
||||
|
@ -2418,6 +2481,7 @@ tvb_get_ephemeral_string(tvbuff_t *tvb, const gint offset, const gint length)
|
|||
|
||||
/*
|
||||
* Unicode (UTF-16) version of tvb_get_ephemeral_string()
|
||||
* XXX - this is UCS-2, not UTF-16, as it doesn't handle surrogate pairs
|
||||
*
|
||||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
|
||||
*
|
||||
|
|
Loading…
Reference in New Issue