Get rid of tvb_get_unicode_string() and tvb_get_unicode_stringz();

instead, have static routines to get UCS-2 (no surrogate pairs) and
UTF-16 (with surrogate pairs) strings, with the routines to handle
UTF-16 actually handling surrogate pairs.

Update some out-of-date comments while we're at it.

svn path=/trunk/; revision=54318
This commit is contained in:
Guy Harris 2013-12-21 01:42:41 +00:00
parent 296a286ed9
commit 92f177ec97
2 changed files with 212 additions and 101 deletions

View File

@ -1920,8 +1920,9 @@ tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gin
}
/*
* Unicode (UTF-16) version of tvb_get_string()
* XXX - this is UCS-2, not UTF-16, as it doesn't handle surrogate pairs
* Given a UCS-2 encoded string containing characters from the
* Basic Multilingual Plane (plane 0) of Unicode, return a UTF-8
* string with the same characters.
*
* Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
@ -1932,10 +1933,11 @@ tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gin
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
*
* Returns an UTF-8 string
* XXX - should map lead and trail surrogate values to a "substitute" UTF-8
* character?
*/
gchar *
tvb_get_unicode_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
static gchar *
tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
{
gunichar2 uchar;
gint i; /* Byte counter for tvbuff */
@ -1961,6 +1963,116 @@ tvb_get_unicode_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
* Given a UTF-16 encoded Unicode string, return a UTF-8 string with the
* same characters.
*
* Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Specify length in bytes
*
* If scope is NULL, memory is allocated with g_malloc() and user must
* explicitely free it with g_free().
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
*
* XXX - needs to map surrogate errors to a "substitute" UTF-8 character.
*/
#define IS_LEAD_SURROGATE(uchar2) \
((uchar2) >= 0xd800 && (uchar2) < 0xdc00)
#define IS_TRAIL_SURROGATE(uchar2) \
((uchar2) >= 0xdc00 && (uchar2) < 0xe000)
#define SURROGATE_VALUE(lead, trail) \
(((((lead) - 0xd800) << 10) + ((trail) - 0xdc00)) | 0x10000)
static wmem_strbuf_t *
tvb_extract_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint size, const guint encoding)
{
wmem_strbuf_t *strbuf;
gunichar2 uchar2, lead_surrogate;
gunichar uchar;
gint i; /* Byte counter for tvbuff */
strbuf = wmem_strbuf_new(scope, NULL);
for(i = 0; i + 1 < size; i += 2) {
if (encoding == ENC_BIG_ENDIAN)
uchar2 = tvb_get_ntohs(tvb, offset + i);
else
uchar2 = tvb_get_letohs(tvb, offset + i);
if (IS_LEAD_SURROGATE(uchar2)) {
/*
* Lead surrogate. Must be followed by
* a trail surrogate.
*/
i += 2;
if (i + 1 >= size) {
/*
* Oops, string ends with a lead surrogate.
* Ignore this for now.
* XXX - insert "substitute" character?
* Report the error in some other
* fashion?
*/
break;
}
lead_surrogate = uchar2;
if (encoding == ENC_BIG_ENDIAN)
uchar2 = tvb_get_ntohs(tvb, offset + i);
else
uchar2 = tvb_get_letohs(tvb, offset + i);
if (IS_TRAIL_SURROGATE(uchar2)) {
/* Trail surrogate. */
uchar = SURROGATE_VALUE(lead_surrogate, uchar2);
wmem_strbuf_append_unichar(strbuf, uchar);
} else {
/*
* Not a trail surrogate.
* Ignore the entire pair.
* XXX - insert "substitute" character?
* Report the error in some other
* fashion?
*/
;
}
} else {
if (IS_TRAIL_SURROGATE(uchar2)) {
/*
* Trail surrogate without a preceding
* lead surrogate. Ignore it.
* XXX - insert "substitute" character?
* Report the error in some other
* fashion?
*/
;
} else {
/*
* Non-surrogate; just append it.
*/
wmem_strbuf_append_unichar(strbuf, uchar2);
}
}
}
/*
* XXX - if i < length, this means we were handed an odd
* number of bytes, so we're not a valid UTF-16 string.
*/
return strbuf;
}
static gchar *
tvb_get_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
{
wmem_strbuf_t *strbuf;
tvb_ensure_bytes_exist(tvb, offset, length);
strbuf = tvb_extract_utf_16_string(scope, tvb, offset, length, encoding);
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
* Given a tvbuff, an offset, a length, and an encoding, allocate a
* buffer big enough to hold a non-null-terminated string of that length
@ -2035,23 +2147,12 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
break;
case ENC_UTF_16:
/*
* XXX - needs to handle surrogate pairs and to map
* invalid characters and sequences to a "substitute"
* UTF-8 character.
*/
strbuf = tvb_get_unicode_string(scope, tvb, offset, length,
strbuf = tvb_get_utf_16_string(scope, tvb, offset, length,
encoding & ENC_LITTLE_ENDIAN);
break;
case ENC_UCS_2:
/*
* XXX - needs to map values that are not valid UCS-2
* characters (such as, I think, values used as the
* components of a UTF-16 surrogate pair) to a
* "substitute" UTF-8 character.
*/
strbuf = tvb_get_unicode_string(scope, tvb, offset, length,
strbuf = tvb_get_ucs_2_string(scope, tvb, offset, length,
encoding & ENC_LITTLE_ENDIAN);
break;
@ -2123,6 +2224,95 @@ tvb_get_stringz_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gi
return tvb_get_string_unichar2(scope, tvb, offset, size, table);
}
/*
* Given a tvbuff and an offset, with the offset assumed to refer to
* a null-terminated string, find the length of that string (and throw
* an exception if the tvbuff ends before we find the null), ensure that
* the TVB is flat, and return a pointer to the string (in the TVB).
* Also return the length of the string (including the terminating null)
* through a pointer.
*
* As long as we aren't using composite TVBs, this saves the cycles used
* (often unnecessariliy) in allocating a buffer and copying the string into
* it. (If we do start using composite TVBs, we may want to replace this
* function with the _ephemeral versoin.)
*/
const guint8 *
tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp)
{
guint size;
const guint8 *strptr;
size = tvb_strsize(tvb, offset);
strptr = ensure_contiguous(tvb, offset, size);
if (lengthp)
*lengthp = size;
return strptr;
}
/*
* Version of tvb_get_stringz() that handles the Basic Multilingual Plane
* (plane 0) of Unicode, with each code point encoded in 16 bits.
*
* Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes)
*
* XXX - needs to map values that are not valid UCS-2 characters (such as,
* I think, values used as the components of a UTF-16 surrogate pair) to a
* "substitute" UTF-8 character.
*/
static gchar *
tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
gunichar2 uchar;
gint size; /* Number of UTF-16 characters */
gint i; /* Byte counter for tvbuff */
wmem_strbuf_t *strbuf;
size = tvb_unicode_strsize(tvb, offset);
strbuf = wmem_strbuf_new(scope, NULL);
for(i = 0; i < size; i += 2) {
if (encoding == ENC_BIG_ENDIAN)
uchar = tvb_get_ntohs(tvb, offset + i);
else
uchar = tvb_get_letohs(tvb, offset + i);
wmem_strbuf_append_unichar(strbuf, uchar);
}
if (lengthp)
*lengthp = i; /* Number of *bytes* processed */
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
* Version of tvb_get_stringz() that handles UTF-16.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Returns an allocated UTF-8 string and updates lengthp pointer with
* length of string (in bytes), including the terminating (2-byte) NUL.
*/
static gchar *
tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
gint size;
wmem_strbuf_t *strbuf;
size = tvb_unicode_strsize(tvb, offset);
strbuf = tvb_extract_utf_16_string(scope, tvb, offset, size, encoding);
if (lengthp)
*lengthp = size; /* Number of *bytes* processed */
return (gchar*)wmem_strbuf_get_str(strbuf);
}
guint8 *
tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
@ -2182,23 +2372,12 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
break;
case ENC_UTF_16:
/*
* XXX - needs to handle surrogate pairs and to map
* invalid characters and sequences to a "substitute"
* UTF-8 character.
*/
strptr = tvb_get_unicode_stringz(scope, tvb, offset, lengthp,
strptr = tvb_get_utf_16_stringz(scope, tvb, offset, lengthp,
encoding & ENC_LITTLE_ENDIAN);
break;
case ENC_UCS_2:
/*
* XXX - needs to map values that are not valid UCS-2
* characters (such as, I think, values used as the
* components of a UTF-16 surrogate pair) to a
* "substitute" UTF-8 character.
*/
strptr = tvb_get_unicode_stringz(scope, tvb, offset, lengthp,
strptr = tvb_get_ucs_2_stringz(scope, tvb, offset, lengthp,
encoding & ENC_LITTLE_ENDIAN);
break;
@ -2220,66 +2399,6 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
return strptr;
}
/*
* Given a tvbuff and an offset, with the offset assumed to refer to
* a null-terminated string, find the length of that string (and throw
* an exception if the tvbuff ends before we find the null), ensure that
* the TVB is flat, and return a pointer to the string (in the TVB).
* Also return the length of the string (including the terminating null)
* through a pointer.
*
* As long as we aren't using composite TVBs, this saves the cycles used
* (often unnecessariliy) in allocating a buffer and copying the string into
* it. (If we do start using composite TVBs, we may want to replace this
* function with the _ephemeral versoin.)
*/
const guint8 *
tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp)
{
guint size;
const guint8 *strptr;
size = tvb_strsize(tvb, offset);
strptr = ensure_contiguous(tvb, offset, size);
if (lengthp)
*lengthp = size;
return strptr;
}
/*
* Unicode (UTF-16) version of tvb_get_stringz()
*
* Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes)
*/
gchar *
tvb_get_unicode_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
gunichar2 uchar;
gint size; /* Number of UTF-16 characters */
gint i; /* Byte counter for tvbuff */
wmem_strbuf_t *strbuf;
size = tvb_unicode_strsize(tvb, offset);
strbuf = wmem_strbuf_new(scope, NULL);
for(i = 0; i < size; i += 2) {
if (encoding == ENC_BIG_ENDIAN)
uchar = tvb_get_ntohs(tvb, offset + i);
else
uchar = tvb_get_letohs(tvb, offset + i);
wmem_strbuf_append_unichar(strbuf, uchar);
}
if (lengthp)
*lengthp = i; /* Number of *bytes* processed */
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/* Looks for a stringz (NUL-terminated string) in tvbuff and copies
* no more than bufsize number of bytes, including terminating NUL, to buffer.
* Returns length of string (not including terminating NUL), or -1 if the string was

View File

@ -487,10 +487,8 @@ extern gchar *tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset,
*
* tvb_get_string() returns a string allocated.
*
* tvb_get_unicode_string() Unicode (UTF-16) version of above.
*
* tvb_get_string_enc() takes a string encoding as well, and converts to UTF-8
* from the encoding (only UTF-8 and EBCDIC supported).
* from the encoding.
*
* If scope is set to NULL it is the user's responsibility to g_free()
* the memory allocated by tvb_memdup(). Otherwise memory is
@ -498,8 +496,6 @@ extern gchar *tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset,
*/
WS_DLL_PUBLIC guint8 *tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb,
const gint offset, const gint length);
WS_DLL_PUBLIC gchar *tvb_get_unicode_string(wmem_allocator_t *scope,
tvbuff_t *tvb, const gint offset, gint length, const guint encoding);
WS_DLL_PUBLIC guint8 *tvb_get_string_enc(wmem_allocator_t *scope,
tvbuff_t *tvb, const gint offset, const gint length, const guint encoding);
@ -515,15 +511,13 @@ WS_DLL_PUBLIC guint8 *tvb_get_string_enc(wmem_allocator_t *scope,
* tvb_get_stringz() returns a string
*
* tvb_get_stringz_enc() takes a string encoding as well, and converts to
* UTF-8 from the encoding (only UTF-8 and EBCDIC supported)
* UTF-8 from the encoding.
*
* tvb_get_const_stringz() returns a constant (unmodifiable) string that does
* not need to be freed, instead it will automatically be
* freed once the next packet is dissected. It is slightly
* more efficient than the other routines.
*
* tvb_get_unicode_stringz() Unicode (UTF-16) version of above
*
* If scope is set to NULL it is the user's responsibility to g_free()
* the memory allocated by tvb_memdup(). Otherwise memory is
* automatically freed when the scope lifetime is reached.
@ -534,8 +528,6 @@ WS_DLL_PUBLIC guint8 *tvb_get_stringz_enc(wmem_allocator_t *scope,
tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding);
WS_DLL_PUBLIC const guint8 *tvb_get_const_stringz(tvbuff_t *tvb,
const gint offset, gint *lengthp);
WS_DLL_PUBLIC gchar *tvb_get_unicode_stringz(wmem_allocator_t *scope,
tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding);
/** Looks for a stringz (NUL-terminated string) in tvbuff and copies
* no more than bufsize number of bytes, including terminating NUL, to buffer.