From 8d234a0d8c6c974a374e36a58cd7b3d699866464 Mon Sep 17 00:00:00 2001
From: Guy Harris <guy@alum.mit.edu>
Date: Sun, 23 Feb 2014 14:16:24 -0800
Subject: [PATCH] More tvbuff API deprecation, comment expansion, and
 documentation updates.

Do with tvb_get_stringz() what was done with tvb_get_string().

Redo the comments for the string get routines to try to give more detail
in a fashion that's a bit less hard to read.

Warn, in comments, of the problems with using
tvb_get_string()/tvb_get_stringz() (i.e., if your strings are non-ASCII,
all bytes with the 8th bit set are going to be replaced by the Unicode
REPLACEMENT CHARACTER, and displayed as such).

Warn, in a comment, of the problems with tvb_get_const_stringz() (i.e.,
it gives you raw bytes, rather than guaranteed-to-be-valid UTF-8).

Update documentation and release notes appropriately.

Change-Id: Ibd3efb92a203861f507ce71bc8d04d19d9d38a93
Reviewed-on: https://code.wireshark.org/review/327
Reviewed-by: Guy Harris <guy@alum.mit.edu>
---
 doc/README.dissector           |  54 ++++++++++------
 docbook/release-notes.asciidoc |  11 ++--
 epan/tvbuff.c                  |  10 ---
 epan/tvbuff.h                  | 113 +++++++++++++++++++++++----------
 tools/checkAPIs.pl             |   1 +
 5 files changed, 123 insertions(+), 66 deletions(-)

diff --git a/doc/README.dissector b/doc/README.dissector
index 321f87288d..5f5eb789a5 100644
--- a/doc/README.dissector
+++ b/doc/README.dissector
@@ -256,9 +256,38 @@ data from the specified tvbuff, starting at the specified offset, and containing
 the specified length worth of characters. Reads data in the specified encoding
 and produces UTF-8 in the buffer. See below for a list of input encoding values.
 
-guint8 *tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp);
+The buffer is allocated by g_malloc() if scope is set to NULL (in that
+case memory must be explicitly freed), or with the allocator lifetime
+if scope is not NULL.
+
 guint8 *tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding);
+
+Returns a null-terminated buffer allocated from the specified scope,
+containing data from the specified tvbuff, starting at the specified
+offset, and containing all characters from the tvbuff up to and
+including a terminating null character in the tvbuff.  Reads data in the
+specified encoding and produces UTF-8 in the buffer.  See below for a
+list of input encoding values.  "*lengthp" will be set to the length of
+the string, including the terminating null.
+
+The buffer is allocated by g_malloc() if scope is set to NULL (in that
+case memory must be explicitly freed), or with the allocator lifetime
+if scope is not NULL.
+
 const guint8 *tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp);
+
+Returns a null-terminated const buffer containing data from the
+specified tvbuff, starting at the specified offset, and containing all
+bytes from the tvbuff up to and including a terminating null character
+in the tvbuff.  "*lengthp" will be set to the length of the string,
+including the terminating null.
+
+You do not need to free() this buffer; it will happen automatically once
+the next packet is dissected.  This function is slightly more efficient
+than the others because it does not allocate memory and copy the string,
+but it does not do any mapping to UTF-8 or checks for valid octet
+sequences.
+
 gint tvb_get_nstringz(tvbuff_t *tvb, const gint offset, const guint bufsize, guint8* buffer);
 gint tvb_get_nstringz0(tvbuff_t *tvb, const gint offset, const guint bufsize, guint8* buffer);
 
@@ -268,26 +297,15 @@ tvbuff up to and including a terminating null character in the tvbuff.
 "*lengthp" will be set to the length of the string, including the terminating
 null.
 
-tvb_get_stringz() returns a buffer allocated by g_malloc() if scope is set
-to NULL (in that case memory must be explicitely freed), or with the
-allocator lifetime if scope is not NULL.
-
-tvb_get_stringz_enc() is a version of tvb_get_stringz() that takes a
-string encoding as an argument.  See below for a list of encoding values
-for strings.
-
-tvb_get_const_stringz() returns a pointer to the (const) string in the tvbuff.
-You do not need to free() this buffer, it will happen automatically once the
-next packet is dissected.  This function is slightly more efficient than the
-others because it does not allocate memory and copy the string.
-
 gchar *tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope,
     tvbuff_t *tvb, const gint bit_offset, gint no_of_chars);
 
-tvb_get_ts_23_038_7bits_string() returns a string of a given number of characters
-and encoded according to 3GPP TS 23.038 7 bits alphabet. The buffer allocated by
-g_malloc() if scope is set to NULL (in that case memory must be explicitely freed),
-or with the allocator lifetime if scope is not NULL.
+tvb_get_ts_23_038_7bits_string() returns a string of a given number of
+characters and encoded according to 3GPP TS 23.038 7 bits alphabet.
+
+The buffer is allocated by g_malloc() if scope is set to NULL (in that
+case memory must be explicitly freed), or with the allocator lifetime
+if scope is not NULL.
 
 Byte Array Accessors:
 
diff --git a/docbook/release-notes.asciidoc b/docbook/release-notes.asciidoc
index 1c0dbbd322..fd09d9903c 100644
--- a/docbook/release-notes.asciidoc
+++ b/docbook/release-notes.asciidoc
@@ -162,13 +162,14 @@ STANAG 5066 Data Transfer Sublayer
 
 The libwireshark API has undergone some major changes:
 
-* A more flexible, modular memory manger (wmem) has been added. It was available
-  experimentally in 1.10 but is now mature and has mostly replaced the old emem
-  API (which is deprecated).
+* A more flexible, modular memory manager (wmem) has been added. It was
+  available experimentally in 1.10 but is now mature and has mostly
+  replaced the old emem API (which is deprecated).
 * A new API for expert information has been added, replacing the old one.
 * The tvbuff API has been cleaned up: tvb_length has been renamed to
-  tvb_captured_length for clarity, and tvb_get_string has been deprecated in
-  favour of tvb_get_string_enc.
+  tvb_captured_length for clarity, and tvb_get_string and tvb_get_stringz
+  have been deprecated in favour of tvb_get_string_enc and
+  tvb_get_stringz_enc.
 
 == Getting Wireshark
 
diff --git a/epan/tvbuff.c b/epan/tvbuff.c
index 945ab95f9c..6458382497 100644
--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@@ -2770,16 +2770,6 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
 	return strptr;
 }
 
-/*
- * Get an ASCII string; this should not be used in new code.
- */
-guint8 *
-tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
-			     gint *lengthp)
-{
-	return tvb_get_ascii_stringz(scope, tvb, offset, lengthp);
-}
-
 /* Looks for a stringz (NUL-terminated string) in tvbuff and copies
  * no more than bufsize number of bytes, including terminating NUL, to buffer.
  * Returns length of string (not including terminating NUL), or -1 if the string was
diff --git a/epan/tvbuff.h b/epan/tvbuff.h
index 9e15612105..d1eb66216e 100644
--- a/epan/tvbuff.h
+++ b/epan/tvbuff.h
@@ -474,18 +474,22 @@ extern gchar *tvb_format_stringzpad(tvbuff_t *tvb, const gint offset,
 extern gchar *tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset,
     const gint size);
 
-
 /**
- * Given a tvbuff, an offset, and a length, allocate a buffer big enough
- * to hold a string of length characters plus a trailing '\0'. Copy length
- * characters, starting at offset, from the tvbuff into the buffer and return
- * a pointer to the buffer.
+ * Given an allocator scope, a tvbuff, a byte offset, a byte length, and
+ * a string encoding, with the specified offset and length referring to
+ * a string in the specified encoding:
+ *
+ *    allocate a buffer using the specified scope;
+ *
+ *    convert the string from the specified encoding to UTF-8, possibly
+ *    mapping some characters or invalid octet sequences to the Unicode
+ *    REPLACEMENT CHARACTER, and put the resulting UTF-8 string, plus a
+ *    trailing '\0', into that buffer;
+ *
+ *    and return a pointer to the buffer.
  *
  * Throws an exception if the tvbuff ends before the string does.
  *
- * Takes a string encoding as well, and converts to UTF-8 from the encoding,
- * possibly mapping some characters to the Unicode REPLACEMENT CHARACTER.
- *
  * If scope is set to NULL it is the user's responsibility to wmem_free()
  * the memory allocated. Otherwise memory is automatically freed when the scope
  * lifetime is reached.
@@ -493,16 +497,31 @@ extern gchar *tvb_format_stringzpad_wsp(tvbuff_t *tvb, const gint offset,
 WS_DLL_PUBLIC guint8 *tvb_get_string_enc(wmem_allocator_t *scope,
     tvbuff_t *tvb, const gint offset, const gint length, const guint encoding);
 
-/* DEPRECATED, do not use in new code, call tvb_get_string_enc directly! */
+/*
+ * DEPRECATED, do not use in new code, call tvb_get_string_enc directly with
+ * the appropriate encoding!  Do not assume that ENC_ASCII will work
+ * with arbitrary string encodings; it will map all bytes with the 8th
+ * bit set to the Unicode REPLACEMENT CHARACTER, so it won't show non-ASCII
+ * characters as anything other than an ugly blob.
+ */
 #define tvb_get_string(SCOPE, TVB, OFFSET, LENGTH) \
     tvb_get_string_enc(SCOPE, TVB, OFFSET, LENGTH, ENC_ASCII)
 
 /**
- * Given a tvbuff, a bit offset, and a number of characters, allocate
- * a buffer big enough to hold a non-null-terminated string of no_of_chars
- * encoded according to 3GPP TS 23.038 7bits encoding at that offset,
- * plus a trailing zero, copy the string into it, and return a pointer
- * to the string.
+ * Given an allocator scope, a tvbuff, a bit offset, and a length in
+ * 7-bit characters (not octets!), with the specified offset and
+ * length referring to a string in the 3GPP TS 23.038 7bits encoding:
+ *
+ *    allocate a buffer using the specified scope;
+ *
+ *    convert the string from the specified encoding to UTF-8, possibly
+ *    mapping some characters or invalid octet sequences to the Unicode
+ *    REPLACEMENT CHARACTER, and put the resulting UTF-8 string, plus a
+ *    trailing '\0', into that buffer;
+ *
+ *    and return a pointer to the buffer.
+ *
+ * Throws an exception if the tvbuff ends before the string does.
  *
  * If scope is set to NULL it is the user's responsibility to g_free()
  * the memory allocated by tvb_memdup(). Otherwise memory is
@@ -511,6 +530,45 @@ WS_DLL_PUBLIC guint8 *tvb_get_string_enc(wmem_allocator_t *scope,
 WS_DLL_PUBLIC gchar *tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope,
     tvbuff_t *tvb, const gint bit_offset, gint no_of_chars);
 
+/**
+ * Given an allocator scope, a tvbuff, a byte offset, a pointer to a
+ * gint, and a string encoding, with the specified offset referring to
+ * a null-terminated string in the specified encoding:
+ *
+ *    find the length of that string (and throw an exception if the tvbuff
+ *    ends before we find the null);
+ *
+ *    allocate a buffer using the specified scope;
+ *
+ *    convert the string from the specified encoding to UTF-8, possibly
+ *    mapping some characters or invalid octet sequences to the Unicode
+ *    REPLACEMENT CHARACTER, and put the resulting UTF-8 string, plus a
+ *    trailing '\0', into that buffer;
+ *
+ *    if the pointer to the gint is non-null, set the gint to which it
+ *    points to the length of the string;
+ *
+ *    and return a pointer to the buffer.
+ *
+ * Throws an exception if the tvbuff ends before the string does.
+ *
+ * If scope is set to NULL it is the user's responsibility to wmem_free()
+ * the memory allocated. Otherwise memory is automatically freed when the scope
+ * lifetime is reached.
+ */
+WS_DLL_PUBLIC guint8 *tvb_get_stringz_enc(wmem_allocator_t *scope,
+    tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding);
+
+/*
+ * DEPRECATED, do not use in new code, call tvb_get_stringz_enc directly with
+ * the appropriate encoding!  Do not assume that ENC_ASCII will work
+ * with arbitrary string encodings; it will map all bytes with the 8th
+ * bit set to the Unicode REPLACEMENT CHARACTER, so it won't show non-ASCII
+ * characters as anything other than an ugly blob.
+ */
+#define tvb_get_stringz(SCOPE, TVB, OFFSET, LENGTHP) \
+    tvb_get_stringz_enc(SCOPE, TVB, OFFSET, LENGTHP, ENC_ASCII)
+
 /**
  * Given a tvbuff and an offset, with the offset assumed to refer to
  * a null-terminated string, find the length of that string (and throw
@@ -519,27 +577,16 @@ WS_DLL_PUBLIC gchar *tvb_get_ts_23_038_7bits_string(wmem_allocator_t *scope,
  * and return a pointer to the string.  Also return the length of the
  * string (including the terminating null) through a pointer.
  *
- * tvb_get_stringz() handles 7-bit ASCII strings, with characters
- *                   with the 8th bit set are converted to the
- *                   Unicode REPLACEMENT CHARACTER.
+ * This returns a constant (unmodifiable) string that does not need
+ * to be freed; instead, it will automatically be freed once the next
+ * packet is dissected.
  *
- * tvb_get_stringz_enc() takes a string encoding as well, and converts to UTF-8
- *                   from the encoding, possibly mapping some characters
- *                   to the REPLACEMENT CHARACTER.
- *
- * tvb_get_const_stringz() returns a constant (unmodifiable) string that does
- *                   not need to be freed, instead it will automatically be
- *                   freed once the next packet is dissected.  It is slightly
- *                   more efficient than the other routines.
- *
- * If scope is set to NULL it is the user's responsibility to g_free()
- * the memory allocated by tvb_memdup(). Otherwise memory is
- * automatically freed when the scope lifetime is reached.
+ * It is slightly more efficient than the other routines, but does *NOT*
+ * do any translation to UTF-8 - the string consists of the raw octets
+ * of the string, in whatever encoding they happen to be in, and, if
+ * the string is not valid in that encoding, with invalid octet sequences
+ * as they are in the packet.
  */
-WS_DLL_PUBLIC guint8 *tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb,
-    const gint offset, gint *lengthp);
-WS_DLL_PUBLIC guint8 *tvb_get_stringz_enc(wmem_allocator_t *scope,
-    tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding);
 WS_DLL_PUBLIC const guint8 *tvb_get_const_stringz(tvbuff_t *tvb,
     const gint offset, gint *lengthp);
 
diff --git a/tools/checkAPIs.pl b/tools/checkAPIs.pl
index e21f75323d..3f8285e7d3 100755
--- a/tools/checkAPIs.pl
+++ b/tools/checkAPIs.pl
@@ -132,6 +132,7 @@ my %APIs = (
                 'tvb_length_remaining', # replaced with tvb_captured_length_remaining
                 'tvb_ensure_length_remaining', # replaced with tvb_ensure_captured_length_remaining
                 'tvb_get_string', # replaced with tvb_get_string_enc
+                'tvb_get_stringz', # replaced with tvb_get_stringz_enc
 
                 # wmem calls should replace all emem calls (see doc/README.wmem)
                 'ep_alloc',