From 8d234a0d8c6c974a374e36a58cd7b3d699866464 Mon Sep 17 00:00:00 2001 From: Guy Harris Date: Sun, 23 Feb 2014 14:16:24 -0800 Subject: [PATCH] More tvbuff API deprecation, comment expansion, and documentation updates. Do with tvb_get_stringz() what was done with tvb_get_string(). Redo the comments for the string get routines to try to give more detail in a fashion that's a bit less hard to read. Warn, in comments, of the problems with using tvb_get_string()/tvb_get_stringz() (i.e., if your strings are non-ASCII, all bytes with the 8th bit set are going to be replaced by the Unicode REPLACEMENT CHARACTER, and displayed as such). Warn, in a comment, of the problems with tvb_get_const_stringz() (i.e., it gives you raw bytes, rather than guaranteed-to-be-valid UTF-8). Update documentation and release notes appropriately. Change-Id: Ibd3efb92a203861f507ce71bc8d04d19d9d38a93 Reviewed-on: https://code.wireshark.org/review/327 Reviewed-by: Guy Harris --- doc/README.dissector | 54 ++++++++++------ docbook/release-notes.asciidoc | 11 ++-- epan/tvbuff.c | 10 --- epan/tvbuff.h | 113 +++++++++++++++++++++++---------- tools/checkAPIs.pl | 1 + 5 files changed, 123 insertions(+), 66 deletions(-) diff --git a/doc/README.dissector b/doc/README.dissector index 321f87288d..5f5eb789a5 100644 --- a/doc/README.dissector +++ b/doc/README.dissector @@ -256,9 +256,38 @@ data from the specified tvbuff, starting at the specified offset, and containing the specified length worth of characters. Reads data in the specified encoding and produces UTF-8 in the buffer. See below for a list of input encoding values. -guint8 *tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp); +The buffer is allocated by g_malloc() if scope is set to NULL (in that +case memory must be explicitly freed), or with the allocator lifetime +if scope is not NULL.
+ guint8 *tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding); + +Returns a null-terminated buffer allocated from the specified scope, +containing data from the specified tvbuff, starting at the specified +offset, and containing all characters from the tvbuff up to and +including a terminating null character in the tvbuff. Reads data in the +specified encoding and produces UTF-8 in the buffer. See below for a +list of input encoding values. "*lengthp" will be set to the length of +the string, including the terminating null. + +The buffer is allocated by g_malloc() if scope is set to NULL (in that +case memory must be explicitly freed), or with the allocator lifetime +if scope is not NULL. + const guint8 *tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp); + +Returns a null-terminated const buffer containing data from the +specified tvbuff, starting at the specified offset, and containing all +bytes from the tvbuff up to and including a terminating null character +in the tvbuff. "*lengthp" will be set to the length of the string, +including the terminating null. + +You do not need to free() this buffer; it will happen automatically once +the next packet is dissected. This function is slightly more efficient +than the others because it does not allocate memory and copy the string, +but it does not do any mapping to UTF-8 or checks for valid octet +sequences. + gint tvb_get_nstringz(tvbuff_t *tvb, const gint offset, const guint bufsize, guint8* buffer); gint tvb_get_nstringz0(tvbuff_t *tvb, const gint offset, const guint bufsize, guint8* buffer); @@ -268,26 +297,15 @@ tvbuff up to and including a terminating null character in the tvbuff. "*lengthp" will be set to the length of the string, including the terminating null.
-tvb_get_stringz() returns a buffer allocated by g_malloc() if scope is set -to NULL (in that case memory must be explicitely freed), or with the -allocator lifetime if scope is not NULL. - -tvb_get_stringz_enc() is a version of tvb_get_stringz() that takes a -string encoding as an argument. See below for a list of encoding values -for strings. - -tvb_get_const_stringz() returns a pointer to the (const) string in the tvbuff. -You do not need to free() this buffer, it will happen automatically once the -next packet is dissected. This function is slightly more efficient than the -others because it does not allocate memory and copy the string. - gchar *tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint bit_offset, gint no_of_chars); -tvb_get_ts_23_038_7bits_string() returns a string of a given number of characters -and encoded according to 3GPP TS 23.038 7 bits alphabet. The buffer allocated by -g_malloc() if scope is set to NULL (in that case memory must be explicitely freed), -or with the allocator lifetime if scope is not NULL. +tvb_get_ts_23_038_7bits_string() returns a string of a given number of +characters and encoded according to 3GPP TS 23.038 7 bits alphabet. + +The buffer is allocated by g_malloc() if scope is set to NULL (in that +case memory must be explicitly freed), or with the allocator lifetime +if scope is not NULL. Byte Array Accessors: diff --git a/docbook/release-notes.asciidoc b/docbook/release-notes.asciidoc index 1c0dbbd322..fd09d9903c 100644 --- a/docbook/release-notes.asciidoc +++ b/docbook/release-notes.asciidoc @@ -162,13 +162,14 @@ STANAG 5066 Data Transfer Sublayer The libwireshark API has undergone some major changes: -* A more flexible, modular memory manger (wmem) has been added. It was available - experimentally in 1.10 but is now mature and has mostly replaced the old emem - API (which is deprecated). +* A more flexible, modular memory manager (wmem) has been added.
It was + available experimentally in 1.10 but is now mature and has mostly + replaced the old emem API (which is deprecated). * A new API for expert information has been added, replacing the old one. * The tvbuff API has been cleaned up: tvb_length has been renamed to - tvb_captured_length for clarity, and tvb_get_string has been deprecated in - favour of tvb_get_string_enc. + tvb_captured_length for clarity, and tvb_get_string and tvb_get_stringz + have been deprecated in favour of tvb_get_string_enc and + tvb_get_stringz_enc. == Getting Wireshark diff --git a/epan/tvbuff.c b/epan/tvbuff.c index 945ab95f9c..6458382497 100644 --- a/epan/tvbuff.c +++ b/epan/tvbuff.c @@ -2770,16 +2770,6 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g return strptr; } -/* - * Get an ASCII string; this should not be used in new code. - */ -guint8 * -tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, - gint *lengthp) -{ - return tvb_get_ascii_stringz(scope, tvb, offset, lengthp); -} - /* Looks for a stringz (NUL-terminated string) in tvbuff and copies * no more than bufsize number of bytes, including terminating NUL, to buffer. * Returns length of string (not including terminating NUL), or -1 if the string was diff --git a/epan/tvbuff.h b/epan/tvbuff.h index 9e15612105..d1eb66216e 100644 --- a/epan/tvbuff.h +++ b/epan/tvbuff.h @@ -474,18 +474,22 @@ extern gchar *tvb_format_stringzpad(tvbuff_t *tvb, const gint offset, extern gchar *tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset, const gint size); - /** - * Given a tvbuff, an offset, and a length, allocate a buffer big enough - * to hold a string of length characters plus a trailing '\0'. Copy length - * characters, starting at offset, from the tvbuff into the buffer and return - * a pointer to the buffer. 
+ * Given an allocator scope, a tvbuff, a byte offset, a byte length, and + * a string encoding, with the specified offset and length referring to + * a string in the specified encoding: + * + * allocate a buffer using the specified scope; + * + * convert the string from the specified encoding to UTF-8, possibly + * mapping some characters or invalid octet sequences to the Unicode + * REPLACEMENT CHARACTER, and put the resulting UTF-8 string, plus a + * trailing '\0', into that buffer; + * + * and return a pointer to the buffer. * * Throws an exception if the tvbuff ends before the string does. * - * Takes a string encoding as well, and converts to UTF-8 from the encoding, - * possibly mapping some characters to the Unicode REPLACEMENT CHARACTER. - * * If scope is set to NULL it is the user's responsibility to wmem_free() * the memory allocated. Otherwise memory is automatically freed when the scope * lifetime is reached. @@ -493,16 +497,31 @@ extern gchar *tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset, WS_DLL_PUBLIC guint8 *tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length, const guint encoding); -/* DEPRECATED, do not use in new code, call tvb_get_string_enc directly! */ +/* + * DEPRECATED, do not use in new code, call tvb_get_string_enc directly with + * the appropriate encoding! Do not assume that ENC_ASCII will work + * with arbitrary string encodings; it will map all bytes with the 8th + * bit set to the Unicode REPLACEMENT CHARACTER, so it won't show non-ASCII + * characters as anything other than an ugly blob.
+ */ #define tvb_get_string(SCOPE, TVB, OFFSET, LENGTH) \ tvb_get_string_enc(SCOPE, TVB, OFFSET, LENGTH, ENC_ASCII) /** - * Given a tvbuff, a bit offset, and a number of characters, allocate - * a buffer big enough to hold a non-null-terminated string of no_of_chars - * encoded according to 3GPP TS 23.038 7bits encoding at that offset, - * plus a trailing zero, copy the string into it, and return a pointer - * to the string. + * Given an allocator scope, a tvbuff, a bit offset, and a length in + * 7-bit characters (not octets!), with the specified offset and + * length referring to a string in the 3GPP TS 23.038 7bits encoding: + * + * allocate a buffer using the specified scope; + * + * convert the string from the specified encoding to UTF-8, possibly + * mapping some characters or invalid octet sequences to the Unicode + * REPLACEMENT CHARACTER, and put the resulting UTF-8 string, plus a + * trailing '\0', into that buffer; + * + * and return a pointer to the buffer. + * + * Throws an exception if the tvbuff ends before the string does. * * If scope is set to NULL it is the user's responsibility to g_free() * the memory allocated by tvb_memdup(). 
Otherwise memory is @@ -511,6 +530,45 @@ WS_DLL_PUBLIC guint8 *tvb_get_string_enc(wmem_allocator_t *scope, WS_DLL_PUBLIC gchar *tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint bit_offset, gint no_of_chars); +/** + * Given an allocator scope, a tvbuff, a byte offset, a pointer to a + * gint, and a string encoding, with the specified offset referring to + * a null-terminated string in the specified encoding: + * + * find the length of that string (and throw an exception if the tvbuff + * ends before we find the null); + * + * allocate a buffer using the specified scope; + * + * convert the string from the specified encoding to UTF-8, possibly + * mapping some characters or invalid octet sequences to the Unicode + * REPLACEMENT CHARACTER, and put the resulting UTF-8 string, plus a + * trailing '\0', into that buffer; + * + * if the pointer to the gint is non-null, set the gint to which it + * points to the length of the string; + * + * and return a pointer to the buffer. + * + * Throws an exception if the tvbuff ends before the string does. + * + * If scope is set to NULL it is the user's responsibility to wmem_free() + * the memory allocated. Otherwise memory is automatically freed when the scope + * lifetime is reached. + */ +WS_DLL_PUBLIC guint8 *tvb_get_stringz_enc(wmem_allocator_t *scope, + tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding); + +/* + * DEPRECATED, do not use in new code, call tvb_get_stringz_enc directly with + * the appropriate encoding! Do not assume that ENC_ASCII will work + * with arbitrary string encodings; it will map all bytes with the 8th + * bit set to the Unicode REPLACEMENT CHARACTER, so it won't show non-ASCII + * characters as anything other than an ugly blob.
+ */ +#define tvb_get_stringz(SCOPE, TVB, OFFSET, LENGTHP) \ + tvb_get_stringz_enc(SCOPE, TVB, OFFSET, LENGTHP, ENC_ASCII) + /** * Given a tvbuff and an offset, with the offset assumed to refer to * a null-terminated string, find the length of that string (and throw @@ -519,27 +577,16 @@ WS_DLL_PUBLIC gchar *tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope, * and return a pointer to the string. Also return the length of the * string (including the terminating null) through a pointer. * - * tvb_get_stringz() handles 7-bit ASCII strings, with characters - * with the 8th bit set are converted to the - * Unicode REPLACEMENT CHARACTER. + * This returns a constant (unmodifiable) string that does not need + * to be freed; instead, it will automatically be freed once the next + * packet is dissected. * - * tvb_get_stringz_enc() takes a string encoding as well, and converts to UTF-8 - * from the encoding, possibly mapping some characters - * to the REPLACEMENT CHARACTER. - * - * tvb_get_const_stringz() returns a constant (unmodifiable) string that does - * not need to be freed, instead it will automatically be - * freed once the next packet is dissected. It is slightly - * more efficient than the other routines. - * - * If scope is set to NULL it is the user's responsibility to g_free() - * the memory allocated by tvb_memdup(). Otherwise memory is - * automatically freed when the scope lifetime is reached. + * It is slightly more efficient than the other routines, but does *NOT* + * do any translation to UTF-8 - the string consists of the raw octets + * of the string, in whatever encoding they happen to be in, and, if + * the string is not valid in that encoding, with invalid octet sequences + * as they are in the packet. 
*/ -WS_DLL_PUBLIC guint8 *tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, - const gint offset, gint *lengthp); -WS_DLL_PUBLIC guint8 *tvb_get_stringz_enc(wmem_allocator_t *scope, - tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding); WS_DLL_PUBLIC const guint8 *tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp); diff --git a/tools/checkAPIs.pl b/tools/checkAPIs.pl index e21f75323d..3f8285e7d3 100755 --- a/tools/checkAPIs.pl +++ b/tools/checkAPIs.pl @@ -132,6 +132,7 @@ my %APIs = ( 'tvb_length_remaining', # replaced with tvb_captured_length_remaining 'tvb_ensure_length_remaining', # replaced with tvb_ensure_captured_length_remaining 'tvb_get_string', # replaced with tvb_get_string_enc + 'tvb_get_stringz', # replaced with tvb_get_stringz_enc # wmem calls should replace all emem calls (see doc/README.wmem) 'ep_alloc',