Fix the offset constant in SURROGATE_VALUE(), and add rather than OR it.

Expand a bunch of comments, discussing what various routines do and
should perhaps do.

Pull the core of tvb_get_ucs_2_string()/tvb_get_ucs_2_stringz() and
tvb_get_ucs_4_string()/tvb_get_ucs_4_stringz() into common routines, as
we did for tvb_get_utf_16_string()/tvb_get_utf_16_stringz().

svn path=/trunk/; revision=54374
This commit is contained in:
Guy Harris 2013-12-23 01:25:20 +00:00
parent 0ab7d560f3
commit 8a5d226894
1 changed files with 63 additions and 55 deletions

View File

@ -1897,6 +1897,13 @@ tvb_get_string_8859_1(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint
return (guint8 *) wmem_strbuf_get_str(str);
}
/*
* Given a string encoded using octet per character, with octets with
* the high-order bit clear being ASCII, and a translation table that
* maps values for other octets to 2-byte Unicode Basic Multilingual
* Plane characters (including REPLACEMENT CHARACTER), return a UTF-8
* string with the same characters.
*/
static guint8 *
tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length, const gunichar2 table[0x80])
{
@ -1933,18 +1940,18 @@ tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gin
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
*
* XXX - should map lead and trail surrogate values to a "substitute" UTF-8
* character?
* XXX - should map lead and trail surrogate values to REPLACEMENT
* CHARACTERs (0xFFFD)?
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
static gchar *
tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
static wmem_strbuf_t *
tvb_extract_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
{
gunichar2 uchar;
gint i; /* Byte counter for tvbuff */
wmem_strbuf_t *strbuf;
tvb_ensure_bytes_exist(tvb, offset, length);
strbuf = wmem_strbuf_new(scope, NULL);
for(i = 0; i + 1 < length; i += 2) {
@ -1960,6 +1967,16 @@ tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* XXX - if i < length, this means we were handed an odd
* number of bytes, so we're not a valid UCS-2 string.
*/
return strbuf;
}
/*
 * Wrapper around tvb_extract_ucs_2_string() that hands back the
 * character data of the string buffer the helper builds, rather than
 * the buffer itself.
 *
 * scope, tvb, offset, length and encoding are passed through to the
 * helper unchanged.
 */
static gchar *
tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
{
	/* Range check before extraction (the helper repeats the same check). */
	tvb_ensure_bytes_exist(tvb, offset, length);

	return (gchar *)wmem_strbuf_get_str(
	    tvb_extract_ucs_2_string(scope, tvb, offset, length, encoding));
}
@ -1976,7 +1993,10 @@ tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
*
* XXX - needs to map surrogate errors to a "substitute" UTF-8 character.
* XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
#define IS_LEAD_SURROGATE(uchar2) \
@ -1984,7 +2004,7 @@ tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
/* True if uchar2 lies in the UTF-16 trail surrogate range 0xDC00..0xDFFF. */
#define IS_TRAIL_SURROGATE(uchar2) \
	(!((uchar2) < 0xdc00 || (uchar2) > 0xdfff))
/*
 * Combine a UTF-16 lead/trail surrogate pair into the code point it
 * encodes.  The low 10 bits of the lead and of the trail surrogate form
 * a 20-bit value; 0x10000 (the first code point past the Basic
 * Multilingual Plane) is then added, so D800/DC00 yields U+10000 and
 * DBFF/DFFF yields U+10FFFF.
 *
 * The offset is 0x10000, not 0x100000, and it must be added rather
 * than ORed in, since the 20-bit value can itself have bit 16 set.
 */
#define SURROGATE_VALUE(lead, trail) \
	(((((lead) - 0xd800) << 10) + ((trail) - 0xdc00)) + 0x10000)
static wmem_strbuf_t *
tvb_extract_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint size, const guint encoding)
@ -2086,18 +2106,19 @@ tvb_get_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
*
* XXX - should map lead and trail surrogate values, and code points beyond
* the maximum Unicode character, to a "substitute" UTF-8 character?
* XXX - should map lead and trail surrogate values to a "substitute"
* UTF-8 character?
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
* XXX - if the number of bytes isn't a multiple of 4, should put a
* REPLACEMENT CHARACTER at the end.
*/
static gchar *
tvb_get_ucs_4_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
static wmem_strbuf_t *
tvb_extract_ucs_4_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
{
gunichar uchar;
gint i; /* Byte counter for tvbuff */
wmem_strbuf_t *strbuf;
tvb_ensure_bytes_exist(tvb, offset, length);
strbuf = wmem_strbuf_new(scope, NULL);
for(i = 0; i + 3 < length; i += 4) {
@ -2114,6 +2135,16 @@ tvb_get_ucs_4_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* of bytes that's not a multiple of 4, so we're not a valid
* UCS-4 string.
*/
return strbuf;
}
/*
 * Wrapper around tvb_extract_ucs_4_string() that hands back the
 * character data of the string buffer the helper builds, rather than
 * the buffer itself.
 *
 * scope, tvb, offset, length and encoding are passed through to the
 * helper unchanged.
 */
static gchar *
tvb_get_ucs_4_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
{
	/* Range check before extraction (the helper repeats the same check). */
	tvb_ensure_bytes_exist(tvb, offset, length);

	return (gchar *)wmem_strbuf_get_str(
	    tvb_extract_ucs_4_string(scope, tvb, offset, length, encoding));
}
@ -2152,15 +2183,19 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* other than TRUE to mean "little-endian".
*
* XXX - should map all octets with the 8th bit
* not set to a "substitute" UTF-8 character.
* set to REPLACEMENT CHARACTERs.
*/
strbuf = tvb_get_string(scope, tvb, offset, length);
break;
case ENC_UTF_8:
/*
* XXX - should map all invalid UTF-8 sequences
* to a "substitute" UTF-8 character.
* XXX - should map lead and trail surrogate value code
* points to a "substitute" UTF-8 character?
* XXX - should map code points > 10FFFF to REPLACEMENT
* CHARACTERs.
* XXX - should map invalid UTF-8 sequences to
* REPLACEMENT CHARACTERs.
*/
strbuf = tvb_get_string(scope, tvb, offset, length);
break;
@ -2347,37 +2382,23 @@ tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp)
* Version of tvb_get_stringz() that handles the Basic Multilingual Plane
* (plane 0) of Unicode, with each code point encoded in 16 bits.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes)
*
* XXX - needs to map values that are not valid UCS-2 characters (such as,
* I think, values used as the components of a UTF-16 surrogate pair) to a
* "substitute" UTF-8 character.
* Returns an allocated UTF-8 string and updates lengthp pointer with
* length of string (in bytes), including the terminating (2-byte) NUL.
*/
/*
 * Null-terminated UCS-2 variant: the byte count comes from
 * tvb_unicode_strsize() and the conversion itself is delegated to
 * tvb_extract_ucs_2_string(), so the string is converted exactly once.
 *
 * lengthp may be NULL; when it is not, it receives the byte count
 * reported by tvb_unicode_strsize().
 */
static gchar *
tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
	gint           size;    /* Number of bytes in string */
	wmem_strbuf_t *strbuf;

	size = tvb_unicode_strsize(tvb, offset);

	strbuf = tvb_extract_ucs_2_string(scope, tvb, offset, size, encoding);

	/* Only store the length when the caller asked for it. */
	if (lengthp)
		*lengthp = size;

	return (gchar*)wmem_strbuf_get_str(strbuf);
}
@ -2401,22 +2422,18 @@ tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset
strbuf = tvb_extract_utf_16_string(scope, tvb, offset, size, encoding);
if (lengthp)
*lengthp = size; /* Number of *bytes* processed */
*lengthp = size;
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
* Version of tvb_get_stringz() that handles Unicode, with each code point
* encoded in 32 bits.
* Version of tvb_get_stringz() that handles UCS-4.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes)
*
* XXX - needs to map values that are not valid Unicode characters (such as,
* I think, values used as the components of a UTF-16 surrogate pair) to a
* "substitute" UTF-8 character.
* Returns an allocated UTF-8 string and updates lengthp pointer with
* length of string (in bytes), including the terminating (4-byte) NUL.
*/
static gchar *
tvb_get_ucs_4_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
@ -2435,16 +2452,7 @@ tvb_get_ucs_4_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
size += 4;
} while(uchar != 0);
strbuf = wmem_strbuf_new(scope, NULL);
for(i = 0; i < size; i += 4) {
if (encoding == ENC_BIG_ENDIAN)
uchar = tvb_get_ntohl(tvb, offset + i);
else
uchar = tvb_get_letohl(tvb, offset + i);
wmem_strbuf_append_unichar(strbuf, uchar);
}
strbuf = tvb_extract_ucs_4_string(scope, tvb, offset, size, encoding);
if (lengthp)
*lengthp = i; /* Number of *bytes* processed */