Add some UTF-8 debug checks with a compile time flag

Some older dissectors that predate Unicode and parse text protocols are prone to generate invalid UTF-8 strings. This is a bug and can have safety implications. For example passing invalid UTF-8 to proto_tree_add_string() is a common bug. There are safeguards in format_text() but this should not be relied on as a general solution to the problem. For one, as the name implies, it is only used with representation of a field value, which is not the same as the value itself of an FT_STRING field. Issue #18317 shows another reason why. For now this compile flag only enables extra checks for string ftypes, which covers a subset of proto.h APIs including proto_tree_append_string(). Later is should be extended to other interfaces. This is also not expected to be disabled for release builds because there are still many dissectors that do not correctly handle strings. More work is needed to 1) identify them and 2) fix them. Ping #18317
2022-09-26 22:39:01 +01:00 · 2022-09-26 22:39:01 +01:00 · 6d06d4e46b
parent 7b53fd127e
commit 6d06d4e46b
7 changed files with 63 additions and 18 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -466,6 +466,7 @@ set_property(DIRECTORY
 		"G_DISABLE_SINGLE_INCLUDES"
 		$<$<OR:$<NOT:$<BOOL:${ENABLE_ASSERT}>>,$<CONFIG:Release>>:WS_DISABLE_ASSERT>
 		$<$<OR:$<NOT:$<BOOL:${ENABLE_DEBUG}>>,$<CONFIG:Release>>:WS_DISABLE_DEBUG>
+		$<$<BOOL:${ENABLE_DEBUG_MBS}>:WS_DEBUG_UTF_8>
 )

 if(WIN32)
--- a/CMakeOptions.txt
+++ b/CMakeOptions.txt
@ -54,6 +54,8 @@ option(BUILD_fuzzshark     "Build fuzzshark" OFF)
 option(ENABLE_WERROR     "Treat warnings as errors" ON)
 option(ENABLE_DEBUG      "Enable debug code" ON)
 option(ENABLE_ASSERT     "Enable assertions" ON)
+option(ENABLE_DEBUG_MBS  "Enable extra debug checks for detecting invalid multibyte (UTF-8) strings" ON)
+
 option(ENABLE_CCACHE     "Speed up compiling and linking using ccache if possible" OFF)
 option(DISABLE_FRAME_LARGER_THAN_WARNING "Disable warning if the size of a function frame is large" OFF)
 option(EXTCAP_ANDROIDDUMP_LIBPCAP    "Build androiddump using libpcap" OFF)
--- a/epan/ftypes/ftype-string.c
+++ b/epan/ftypes/ftype-string.c
@ -15,6 +15,23 @@
 #include <strutil.h>
 #include <wsutil/ws_assert.h>

+
+#ifdef WS_DEBUG_UTF_8
+static inline void
+string_validate_utf8(fvalue_t *fv)
+{
+	if (wmem_strbuf_sanitize_utf8(fv->value.strbuf)) {
+		ws_warning("String fvalues must use a valid UTF-8 encoding."
+				" This string has been sanitized to look like this: %s",
+				wmem_strbuf_get_str(fv->value.strbuf));
+	}
+}
+#define CHECK_UTF_8(fv) string_validate_utf8(fv)
+#else /* !WS_DEBUG_UTF_8 */
+#define CHECK_UTF_8(fv)  (void)(fv)
+#endif /* WS_DEBUG_UTF_8 */
+
+
 static void
 string_fvalue_new(fvalue_t *fv)
 {
@ -42,6 +59,7 @@ string_fvalue_set_strbuf(fvalue_t *fv, wmem_strbuf_t *value)
 	string_fvalue_free(fv);

 	fv->value.strbuf = value;
+	CHECK_UTF_8(fv);
 }

 static char *
@ -73,20 +91,19 @@ val_from_string(fvalue_t *fv, const char *s, size_t len, gchar **err_msg _U_)
 		fv->value.strbuf = wmem_strbuf_new_len(NULL, s, len);
 	else
 		fv->value.strbuf = wmem_strbuf_new(NULL, s);
+
+	CHECK_UTF_8(fv);
 	return TRUE;
 }

 static gboolean
-val_from_literal(fvalue_t *fv, const char *s, gboolean allow_partial_value _U_, gchar **err_msg _U_)
+val_from_literal(fvalue_t *fv, const char *s, gboolean allow_partial_value _U_, gchar **err_msg)
 {
 	/* Just turn it into a string */
 	/* XXX Should probably be a syntax error instead. It's more user-friendly to ask the
 	 * user to be explicit about the meaning of an unquoted literal than them trying to figure out
 	 * why a valid filter expression is giving wrong results. */
-	string_fvalue_free(fv);
-
-	fv->value.strbuf = wmem_strbuf_new(NULL, s);
-	return TRUE;
+	return val_from_string(fv, s, 0, err_msg);
 }

 static gboolean
--- a/epan/proto.c
+++ b/epan/proto.c
@ -5002,17 +5002,6 @@ proto_tree_add_string(proto_tree *tree, int hfindex, tvbuff_t *tvb, gint start,

 	pi = proto_tree_add_pi(tree, hfinfo, tvb, start, &length);
 	DISSECTOR_ASSERT(length >= 0);
-	/* XXX: We could have a preference or a define to turn off
-	 * validation (which is slightly slow) and trust subdissectors
-	 * to validate strings passed in.
-	 */
-	if (value) {
-		if (!g_utf8_validate(value, -1, NULL)) {
-			proto_tree_set_string(PNODE_FINFO(pi), get_utf_8_string(PNODE_POOL(tree), value, (int)(strlen(value))));
-			/* We could turn this assertion off, and just sanitize */
-			DISSECTOR_ASSERT_HINT(0, "Unsanitized UTF-8 string");
-		}
-	}
 	proto_tree_set_string(PNODE_FINFO(pi), value);

 	return pi;
@ -5062,6 +5051,7 @@ static void
 proto_tree_set_string(field_info *fi, const char* value)
 {
 	if (value) {
+		/* String must be valid UTF-8. It is sanitized otherwise (if enabled at compile time). */
 		fvalue_set_string(&fi->value, value);
 	} else {
 		/*
--- a/ui/version_info.c
+++ b/ui/version_info.c
@ -215,6 +215,10 @@ get_compiled_version_info(gather_feature_func gather_compile)
 	g_string_append(str, ", without assertions");
 #endif

+#ifdef WS_DEBUG_UTF_8
+	g_string_append(str, ", with UTF-8 validation");
+#endif
+
 	g_string_append(str, ".");
 	end_string(str);
 	free_features(&l);
--- a/wsutil/wmem/wmem_strbuf.c
+++ b/wsutil/wmem/wmem_strbuf.c
@ -331,6 +331,27 @@ wmem_strbuf_destroy(wmem_strbuf_t *strbuf)
    wmem_free(strbuf->allocator, strbuf);
 }

+bool
+wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf)
+{
+    if (g_utf8_validate(strbuf->str, -1, NULL)) {
+        return false;
+    }
+
+    /* Sanitize the contents to a temporary string. */
+    char *tmp = g_utf8_make_valid(strbuf->str, -1);
+
+    /* Reset the strbuf, keeping the backing memory allocation */
+    *strbuf->str = '\0';
+    strbuf->len = 0;
+
+    /* Copy the temporary string to the strbuf. */
+    wmem_strbuf_append(strbuf, tmp);
+    g_free(tmp);
+
+    return true;
+}
+
 /*
 * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
 *
--- a/wsutil/wmem/wmem_strbuf.h
+++ b/wsutil/wmem/wmem_strbuf.h
@ -12,8 +12,7 @@
 #ifndef __WMEM_STRBUF_H__
 #define __WMEM_STRBUF_H__

-#include <string.h>
-#include <glib.h>
+#include <wireshark.h>

 #include "wmem_core.h"

@ -138,6 +137,17 @@ WS_DLL_PUBLIC
 void
 wmem_strbuf_destroy(wmem_strbuf_t *strbuf);

+/** Check the UTF-8 encoded strbuf for validity and sanitize the contents if needed,
+ * by replacing encoding errors with unicode replacement character. This function is
+ * intended for debugging purposes and is not optimized for speed.
+ *
+ * @param strbuf the strbuf to validate
+ * @return true if the string was sanitized, false otherwise
+ */
+WS_DLL_PUBLIC
+bool
+wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf);
+
 /**   @}
 *  @} */