Add ENC_UTF_16 and ENC_UCS_2. Note that UTF-16 and UCS-2 are not the same, and that the routines to get "Unicode" strings are really doing UCS-2 (and not doing anything about code values that aren't valid in UCS-2 strings).

Have tvb_get_ephemeral_string_enc() separate cases for ASCII and UTF-8, even though they're *currently* treated the same.

For FT_UINT_STRING, treat an encoding value of TRUE as meaning "little-endian ASCII"; pass all other encodings through to tvb_get_ephemeral_string_enc().

svn path=/trunk/; revision=42592
2012-05-11 23:55:54 +00:00 · 2012-05-11 23:55:54 +00:00 · 7a87d7b6a3
parent bb524c6b64
commit 7a87d7b6a3
3 changed files with 103 additions and 41 deletions
--- a/epan/proto.c
+++ b/epan/proto.c
@@ -1484,20 +1484,16 @@ proto_tree_new_item(field_info *new_fi, proto_tree *tree,
 			 * NOTE: to support code written when
 			 * proto_tree_add_item() took a gboolean as its
 			 * last argument, with FALSE meaning "big-endian"
-			 * and TRUE meaning "little-endian", we any
-			 * non-zero value of "encoding", except for
-			 * ENC_EBCDIC|ENC_BIG_ENDIAN and
-			 * ENC_EBCDIC|ENC_LITTLE_ENDIAN  as meaning
-			 * "little-endian UTF-8".
+			 * and TRUE meaning "little-endian", if the
+			 * encoding value is TRUE, treat that as
+			 * ASCII with a little-endian length.
 			 *
-			 * At some point in the future, we might
-			 * support more character encodings in the
-			 * encoding value as well.
+			 * This won't work for code that passes
+			 * arbitrary non-zero values; that code
+			 * will need to be fixed.
 			 */
-			if (encoding != 0 &&
-			    encoding != (ENC_EBCDIC|ENC_BIG_ENDIAN) &&
-			    encoding != (ENC_EBCDIC|ENC_LITTLE_ENDIAN))
-				encoding = ENC_UTF_8|ENC_LITTLE_ENDIAN;
+			if (encoding == TRUE)
+				encoding = ENC_ASCII|ENC_LITTLE_ENDIAN;
 			n = get_uint_value(tvb, start, length, encoding);
 			proto_tree_set_string_tvb(new_fi, tvb, start + length, n,
 			    encoding);
--- a/epan/proto.h
+++ b/epan/proto.h
@@ -245,41 +245,43 @@ typedef struct _protocol protocol_t;
 * was with FT_UINT_STRINGs, where we had FALSE for the string length
 * being big-endian and TRUE for it being little-endian.
 *
- * This is a quick and dirty hack for bug 6084, which doesn't require
- * support for multiple character encodings in FT_UINT_STRING.  We
- * introduce ENC_UTF_8 and ENC_EBCDIC, with ENC_UTF_8 being 0 and
- * ENC_EBCDIC being the unlikely value 0x0EBCD000, and treat all values
- * other than ENC_EBCDIC as UTF-8.  That way, no matter how a dissector
- * not converted to use ENC_ values calculates the last argument to
- * proto_tree_add_item(), it's unlikely to get EBCDIC.
+ * We now have encoding values for the character encoding.  The encoding
+ * values are encoded in all but the top bit (which is the byte-order
+ * bit, required for FT_UINT_STRING and for UCS-2 and UTF-16 strings)
+ * and the bottom bit (which we ignore for now so that programs that
+ * pass TRUE for the encoding just do ASCII).
 *
- * The value for ENC_EBCDIC is subject to change in a future release (or
- * to replacement with multiple values for different flavors of EBCDIC).
+ * We don't yet process ASCII and UTF-8 differently.  Ultimately, for
+ * ASCII, all bytes with the 8th bit set should be mapped to some "this
+ * is not a valid character" code point, as ENC_ASCII should mean "this
+ * is ASCII, not some extended variant thereof".  We should also map
+ * 0x00 to that as well - null-terminated and null-padded strings
+ * never have NULs in them, but counted strings might.  (Either that,
+ * or the values for strings should be counted, not null-terminated.)
+ * For UTF-8, invalid UTF-8 sequences should be mapped to the same
+ * code point.
 *
- * We currently add some additional encodings, for various ASCII-based
- * encodings, but use the same value as ENC_UTF_8, for now, so that we
- * can mark the appropriate encoding.  Ultimately, we should handle
- * those encodings by mapping them to UTF-8 for display; for ASCII,
- * all bytes with the 8th bit set should be mapped to some "this is
- * not a valid character" glyph, as ENC_ASCII should mean "this is
- * ASCII, not some extended variant thereof".  Perhaps we should also
- * map control characters to the Unicode glyphs showing the name of
- * the control character in small caps, diagonally.  (Unfortunately,
- * those only exist for C0, not C1.)
+ * We also don't process UTF-16 or UCS-2 differently - we don't
+ * handle surrogate pairs, and don't handle 2-byte values that
+ * aren't valid in UTF-16 or UCS-2 strings.
+ *
+ * For display, perhaps we should also map control characters to the
+ * Unicode glyphs showing the name of the control character in small
+ * caps, diagonally.  (Unfortunately, those only exist for C0, not C1.)
 */
 #define ENC_CHARENCODING_MASK	0x7FFFFFFE	/* mask out byte-order bits */
-#define ENC_UTF_8		0x00000000
-#define ENC_ASCII		0x00000000
-#define ENC_EBCDIC		0x0EBCD1C0
+#define ENC_ASCII		(0 << 1)	/* shift up to avoid low-order bit */
+#define ENC_UTF_8		(1 << 1)
+#define ENC_UTF_16		(2 << 1)
+#define ENC_UCS_2		(3 << 1)
+#define ENC_EBCDIC		(4 << 1)

 /*
 * TODO:
 *
 * These could probably be used by existing code:
 *
- *	ENC_UTF_16 - UTF-16
 *	ENC_UCS_4 - UCS-4
- *	ENC_UCS_2 - UCS-2 (not the same as UTF-16!)
 *	ENC_ISO_8859_1 - ISO 8859/1
 *	ENC_ISO_8859_8 - ISO 8859/8
 *	 - "IBM MS DBCS"
--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@@ -2329,6 +2329,7 @@ tvb_get_string(tvbuff_t *tvb, const gint offset, const gint length)

 /*
 * Unicode (UTF-16) version of tvb_get_string()
+ * XXX - this is UCS-2, not UTF-16, as it doesn't handle surrogate pairs
 *
 * Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
 *
@@ -2400,12 +2401,74 @@ tvb_get_ephemeral_string_enc(tvbuff_t *tvb, const gint offset,
 	tvb_ensure_bytes_exist(tvb, offset, length);

 	ptr    = ensure_contiguous(tvb, offset, length);
+	switch (encoding & ENC_CHARENCODING_MASK) {
+
+	case ENC_ASCII:
+	default:
+		/*
+		 * For now, we treat bogus values as meaning
+		 * "ASCII" rather than reporting an error,
+		 * for the benefit of old dissectors written
+		 * when the last argument to proto_tree_add_item()
+		 * was a gboolean for the byte order, not an
+		 * encoding value, and passed non-zero values
+		 * other than TRUE to mean "little-endian".
+		 *
+		 * XXX - should map all octets with the 8th bit
+		 * set to a "substitute" UTF-8 character.
+		 */
+		strbuf = ep_alloc(length + 1);
+		if (length != 0) {
+			memcpy(strbuf, ptr, length);
+		}
+		break;
+
+	case ENC_UTF_8:
+		/*
+		 * XXX - should map all invalid UTF-8 sequences
+		 * to a "substitute" UTF-8 character.
+		 */
+		strbuf = ep_alloc(length + 1);
+		if (length != 0) {
+			memcpy(strbuf, ptr, length);
+		}
+		break;
+
+	case ENC_UTF_16:
+		/*
+		 * XXX - needs to handle surrogate pairs and to map
+		 * invalid characters and sequences to a "substitute"
+		 * UTF-8 character.
+		 */
+		strbuf = tvb_get_ephemeral_unicode_string(tvb, offset, length,
+		    encoding & ENC_LITTLE_ENDIAN);
+		break;
+
+	case ENC_UCS_2:
+		/*
+		 * XXX - needs to map values that are not valid UCS-2
+		 * characters (such as, I think, values used as the
+		 * components of a UTF-16 surrogate pair) to a
+		 * "substitute" UTF-8 character.
+		 */
+		strbuf = tvb_get_ephemeral_unicode_string(tvb, offset, length,
+		    encoding & ENC_LITTLE_ENDIAN);
+		break;
+
+	case ENC_EBCDIC:
+		/*
+		 * XXX - do the copy and conversion in one pass.
+		 *
+		 * XXX - multiple "dialects" of EBCDIC?
+		 */
 		strbuf = ep_alloc(length + 1);
 		if (length != 0) {
 			memcpy(strbuf, ptr, length);
 		}
-	if ((encoding & ENC_CHARENCODING_MASK) == ENC_EBCDIC)
 		EBCDIC_to_ASCII(strbuf, length);
+		break;
+	}
+
 	strbuf[length] = '\0';
 	return strbuf;
 }
@@ -2418,6 +2481,7 @@ tvb_get_ephemeral_string(tvbuff_t *tvb, const gint offset, const gint length)

 /*
 * Unicode (UTF-16) version of tvb_get_ephemeral_string()
+ * XXX - this is UCS-2, not UTF-16, as it doesn't handle surrogate pairs
 *
 * Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
 *