Don't do the byte-with-8th-bit-set-to-REPLACEMENT-CHARACTER mapping for

UTF-8 strings.

Add that mapping for null-terminated ASCII strings.

Factor out some common parts of comments about string routines, and
clean up some other comments.

svn path=/trunk/; revision=54868
This commit is contained in:
Guy Harris 2014-01-21 01:23:29 +00:00
parent 6517e3ba4b
commit 9cdf8dd5f5
2 changed files with 137 additions and 102 deletions

View File

@ -1850,23 +1850,29 @@ tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset, const gint size)
#define UNREPL 0x00FFFD
/*
* Given a tvbuff, an offset, and a length, allocate a buffer big enough
* to hold a string of length characters plus a trailing '\0'. Copy length
* characters, starting at offset, from the tvbuff into the buffer and return
* a pointer to the buffer.
* Characters with the highest bit set will be converted to the Unicode
* Replacement Character. The resulting buffer contains a valid UTF-8
* string of length+1 characters (not necessarily length+1 bytes since
* the replacement char is two bytes long).
* All string functions below take a scope as an argument.
*
*
* If scope is NULL, memory is allocated with g_malloc() and user must
* explicitly free it with g_free().
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
* Throws an exception if the tvbuff ends before the string does.
*
* All functions throw an exception if the tvbuff ends before the string
* does.
*/
guint8 *
tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length)
/*
* Given a tvbuff, an offset, and a length, treat the string of bytes
* referred to by them as an ASCII string, with all bytes with the
* high-order bit set being invalid, and return a pointer to a
* UTF-8 string.
*
* Octets with the highest bit set will be converted to the Unicode
* REPLACEMENT CHARACTER.
*/
static guint8 *
tvb_get_ascii_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length)
{
wmem_strbuf_t *str;
@ -1879,9 +1885,8 @@ tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length)
if (ch < 0x80)
wmem_strbuf_append_c(str, ch);
else {
else
wmem_strbuf_append_unichar(str, UNREPL);
}
offset++;
length--;
}
@ -1892,6 +1897,31 @@ tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length)
return (guint8 *) wmem_strbuf_get_str(str);
}
/*
 * Given a tvbuff, an offset, and a length, copy that many bytes out of
 * the tvbuff into a freshly allocated buffer, append a terminating '\0',
 * and return a pointer to the buffer.  The bytes are treated as already
 * being UTF-8 and are copied unmodified.
 *
 * XXX - should map invalid UTF-8 sequences to UNREPL.
 */
static guint8 *
tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length)
{
	guint8 *copy;

	/* Check the range before anything is allocated; makes sure length = -1 fails. */
	tvb_ensure_bytes_exist(tvb, offset, length);

	/* One extra byte for the terminating NUL. */
	copy = (guint8 *)wmem_alloc(scope, length + 1);
	tvb_memcpy(tvb, copy, offset, length);
	copy[length] = '\0';

	return copy;
}
/*
* Given a tvbuff, an offset, and a length, treat the string of bytes
* referred to by them as an ISO 8859/1 string, and return a pointer
* to a UTF-8 string.
*/
static guint8 *
tvb_get_string_8859_1(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length)
{
@ -1922,11 +1952,13 @@ tvb_get_string_8859_1(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint
}
/*
* Given a string encoded using octet per character, with octets with
* the high-order bit clear being ASCII, and a translation table that
* maps values for other octets to 2-byte Unicode Basic Multilingual
* Plane characters (including REPLACEMENT CHARACTER), return a UTF-8
* string with the same characters.
* Given a tvbuff, an offset, and a length, and a translation table,
* treat the string of bytes referred to by them as a string encoded
* using one octet per character, with octets with the high-order bit
* clear being ASCII and octets with the high-order bit set being
* mapped by the translation table to 2-byte Unicode Basic Multilingual
* Plane characters (including REPLACEMENT CHARACTER), and return a
* pointer to a UTF-8 string.
*/
static guint8 *
tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length, const gunichar2 table[0x80])
@ -1951,18 +1983,14 @@ tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gin
}
/*
* Given a UCS-2 encoded string containing characters from the
* Basic Multilingual Plane (plane 0) of Unicode, return a UTF-8
* string with the same characters.
* Given a tvbuff, an offset, and a length, treat the string of bytes
* referred to by them as a UCS-2 encoded string containing characters
* from the Basic Multilingual Plane (plane 0) of Unicode, and return a
* pointer to a UTF-8 string.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes
*
* If scope is NULL, memory is allocated with g_malloc() and user must
* explicitly free it with g_free().
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
* Specify length in bytes.
*
* XXX - should map lead and trail surrogate values to REPLACEMENT
* CHARACTERs (0xFFFD)?
@ -2006,24 +2034,19 @@ tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
}
/*
* Given a UTF-16 encoded Unicode string, return a UTF-8 string with the
* same characters.
* Given a tvbuff, an offset, and a length, treat the string of bytes
* referred to by them as a UTF-16 encoded string, and return a pointer
* to a UTF-8 string.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes
*
* If scope is NULL, memory is allocated with g_malloc() and user must
* explicitly free it with g_free().
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
* Specify length in bytes.
*
* XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
static wmem_strbuf_t *
tvb_extract_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint size, const guint encoding)
{
@ -2113,18 +2136,14 @@ tvb_get_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
}
/*
* Given a UCS-4-encoded Unicode string, return a UTF-8 string with the
* same characters.
* Given a tvbuff, an offset, and a length, treat the string of bytes
* referred to by them as a UCS-4 encoded string, and return a pointer
* to a UTF-8 string.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Specify length in bytes
*
* If scope is NULL, memory is allocated with g_malloc() and user must
* explicitly free it with g_free().
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
*
* XXX - should map lead and trail surrogate values to a "substitute"
* UTF-8 character?
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
@ -2347,13 +2366,6 @@ tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope, tvbuff_t *tvb,
* at that offset, plus a trailing '\0', copy into the buffer the
* string as converted from the appropriate encoding to UTF-8, and
* return a pointer to the string.
*
* Throws an exception if the tvbuff ends before the string does.
*
* If scope is NULL, memory is allocated with g_malloc() and user must
* explicitly free it with g_free().
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
*/
guint8 *
tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
@ -2375,7 +2387,7 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* encoding value, and passed non-zero values
* other than TRUE to mean "little-endian".
*/
strbuf = tvb_get_string(scope, tvb, offset, length);
strbuf = tvb_get_ascii_string(scope, tvb, offset, length);
break;
case ENC_UTF_8:
@ -2385,7 +2397,7 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* XXX - should map code points > 10FFFF to REPLACEMENT
* CHARACTERs.
*/
strbuf = tvb_get_string(scope, tvb, offset, length);
strbuf = tvb_get_utf_8_string(scope, tvb, offset, length);
break;
case ENC_UTF_16:
@ -2500,20 +2512,54 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
}
/*
* Given a tvbuff and an offset, with the offset assumed to refer to
* a null-terminated string, find the length of that string (and throw
* an exception if the tvbuff ends before we find the null), allocate
* a buffer big enough to hold the string, copy the string into it,
* and return a pointer to the string. Also return the length of the
* string (including the terminating null) through a pointer.
*
* If scope is NULL, memory is allocated with g_malloc() and user must
* explicitly free it with g_free().
* If scope is not NULL, memory is allocated with the corresponding pool
* lifetime.
* Get an ASCII string; this should not be used in new code.
*/
guint8 *
tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp)
tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
const gint length)
{
return tvb_get_ascii_string(scope, tvb, offset, length);
}
/*
* These routines are like the above routines, except that they handle
* null-terminated strings. They find the length of that string (and
* throw an exception if the tvbuff ends before we find the null), and
* also return through a pointer the length of the string, in bytes,
* including the terminating null (the terminating null being 2 bytes
* for UCS-2 and UTF-16, 4 bytes for UCS-4, and 1 byte for other
* encodings).
*/
/*
 * Given a tvbuff and an offset referring to a null-terminated string,
 * treat the bytes as ASCII and return a pointer to a UTF-8 string.
 * Octets with the high-order bit set are replaced by the Unicode
 * REPLACEMENT CHARACTER (UNREPL).
 *
 * If lengthp is not NULL, the size of the string in the tvbuff, in
 * bytes and including the terminating NUL, is stored through it.
 */
static guint8 *
tvb_get_ascii_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint *lengthp)
{
	guint          size, i;
	wmem_strbuf_t *str;

	/*
	 * Determine the size before allocating anything.  tvb_strsize() is
	 * the call that fails if the tvbuff ends before a NUL is found; if
	 * the buffer were allocated first, that failure would leak it when
	 * scope is NULL (memory then comes from g_malloc() and nothing
	 * would free it).
	 */
	size = tvb_strsize(tvb, offset);

	str = wmem_strbuf_new(scope, "");

	/* size counts the terminating NUL, so the loop copies it as well. */
	for (i = 0; i < size; i++) {
		guint8 ch = tvb_get_guint8(tvb, offset);

		if (ch < 0x80)
			wmem_strbuf_append_c(str, ch);
		else
			wmem_strbuf_append_unichar(str, UNREPL);
		offset++;
	}
	/* No need to append '\0' - we processed the NUL in the loop above. */

	if (lengthp)
		*lengthp = size;

	/* XXX, discarding constiness, should we have some function which "take-over" strbuf->str
	   (like when strbuf is no longer needed) */
	return (guint8 *) wmem_strbuf_get_str(str);
}
static guint8 *
tvb_get_utf_8_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp)
{
guint size;
guint8 *strptr;
@ -2574,15 +2620,6 @@ tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp)
return strptr;
}
/*
* Version of tvb_get_stringz() that handles the Basic Multilingual Plane
* (plane 0) of Unicode, with each code point encoded in 16 bits.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Returns an allocated UTF-8 string and updates lengthp pointer with
* length of string (in bytes), including the terminating (2-byte) NUL.
*/
static gchar *
tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
@ -2600,14 +2637,6 @@ tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
* Version of tvb_get_stringz() that handles UTF-16.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Returns an allocated UTF-8 string and updates lengthp pointer with
* length of string (in bytes), including the terminating (2-byte) NUL.
*/
static gchar *
tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
@ -2625,14 +2654,6 @@ tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
* Version of tvb_get_stringz() that handles UCS-4.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Returns an allocated UTF-8 string and updates lengthp pointer with
* length of string (in bytes), including the terminating (4-byte) NUL.
*/
static gchar *
tvb_get_ucs_4_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
@ -2676,19 +2697,18 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
* was a gboolean for the byte order, not an
* encoding value, and passed non-zero values
* other than TRUE to mean "little-endian".
*
* XXX - should map all octets with the 8th bit
* not set to a "substitute" UTF-8 character.
*/
strptr = tvb_get_stringz(scope, tvb, offset, lengthp);
strptr = tvb_get_ascii_stringz(scope, tvb, offset, lengthp);
break;
case ENC_UTF_8:
/*
* XXX - should map all invalid UTF-8 sequences
* to a "substitute" UTF-8 character.
* XXX - should map code points > 10FFFF to REPLACEMENT
* CHARACTERs.
*/
strptr = tvb_get_stringz(scope, tvb, offset, lengthp);
strptr = tvb_get_utf_8_stringz(scope, tvb, offset, lengthp);
break;
case ENC_UTF_16:
@ -2797,6 +2817,16 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
return strptr;
}
/*
 * Get a null-terminated ASCII string by forwarding to
 * tvb_get_ascii_stringz(); this should not be used in new code.
 */
guint8 *
tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
    gint *lengthp)
{
	guint8 *result;

	result = tvb_get_ascii_stringz(scope, tvb, offset, lengthp);
	return result;
}
/* Looks for a stringz (NUL-terminated string) in tvbuff and copies
* no more than bufsize number of bytes, including terminating NUL, to buffer.
* Returns length of string (not including terminating NUL), or -1 if the string was

View File

@ -485,11 +485,13 @@ extern gchar *tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset,
*
* Throws an exception if the tvbuff ends before the string does.
*
* tvb_get_string() handles 7bit ASCII strings, 8bit characters are
* converted into the Unicode Replacement Character.
* tvb_get_string() handles 7-bit ASCII strings; characters
* with the 8th bit set are converted to the
* Unicode REPLACEMENT CHARACTER.
*
* tvb_get_string_enc() takes a string encoding as well, and converts to UTF-8
* from the encoding.
* from the encoding, possibly mapping some characters
* to the REPLACEMENT CHARACTER.
*
* If scope is set to NULL it is the user's responsibility to g_free()
* the memory allocated by tvb_memdup(). Otherwise memory is
@ -522,10 +524,13 @@ WS_DLL_PUBLIC gchar *tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope,
* and return a pointer to the string. Also return the length of the
* string (including the terminating null) through a pointer.
*
* tvb_get_stringz() returns a string
* tvb_get_stringz() handles 7-bit ASCII strings; characters
* with the 8th bit set are converted to the
* Unicode REPLACEMENT CHARACTER.
*
* tvb_get_stringz_enc() takes a string encoding as well, and converts to
* UTF-8 from the encoding.
* tvb_get_stringz_enc() takes a string encoding as well, and converts to UTF-8
* from the encoding, possibly mapping some characters
* to the REPLACEMENT CHARACTER.
*
* tvb_get_const_stringz() returns a constant (unmodifiable) string that does
* not need to be freed, instead it will automatically be