From 6d06d4e46bc9bf5c133ca4b039be329ec1fe5067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Valverde?= Date: Mon, 26 Sep 2022 22:39:01 +0100 Subject: [PATCH] Add some UTF-8 debug checks with a compile time flag Some older dissectors that predate Unicode and parse text protocols are prone to generate invalid UTF-8 strings. This is a bug and can have safety implications. For example passing invalid UTF-8 to proto_tree_add_string() is a common bug. There are safeguards in format_text() but this should not be relied on as a general solution to the problem. For one, as the name implies, it is only used with representation of a field value, which is not the same as the value itself of an FT_STRING field. Issue #18317 shows another reason why. For now this compile flag only enables extra checks for string ftypes, which covers a subset of proto.h APIs including proto_tree_append_string(). Later is should be extended to other interfaces. This is also not expected to be disabled for release builds because there are still many dissectors that do not correctly handle strings. More work is needed to 1) identify them and 2) fix them. Ping #18317 --- CMakeLists.txt | 1 + CMakeOptions.txt | 2 ++ epan/ftypes/ftype-string.c | 27 ++++++++++++++++++++++----- epan/proto.c | 12 +----------- ui/version_info.c | 4 ++++ wsutil/wmem/wmem_strbuf.c | 21 +++++++++++++++++++++ wsutil/wmem/wmem_strbuf.h | 14 ++++++++++++-- 7 files changed, 63 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a37156a55b..de511804dd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -466,6 +466,7 @@ set_property(DIRECTORY "G_DISABLE_SINGLE_INCLUDES" $<$>,$>:WS_DISABLE_ASSERT> $<$>,$>:WS_DISABLE_DEBUG> + $<$:WS_DEBUG_UTF_8> ) if(WIN32) diff --git a/CMakeOptions.txt b/CMakeOptions.txt index 407b4b9f98..a3c57322d8 100644 --- a/CMakeOptions.txt +++ b/CMakeOptions.txt @@ -54,6 +54,8 @@ option(BUILD_fuzzshark "Build fuzzshark" OFF) option(ENABLE_WERROR "Treat warnings as errors" ON) option(ENABLE_DEBUG "Enable debug code" ON) option(ENABLE_ASSERT "Enable assertions" ON) +option(ENABLE_DEBUG_MBS "Enable extra debug checks for detecting invalid multibyte (UTF-8) strings" ON) + option(ENABLE_CCACHE "Speed up compiling and linking using ccache if possible" OFF) option(DISABLE_FRAME_LARGER_THAN_WARNING "Disable warning if the size of a function frame is large" OFF) option(EXTCAP_ANDROIDDUMP_LIBPCAP "Build androiddump using libpcap" OFF) diff --git a/epan/ftypes/ftype-string.c b/epan/ftypes/ftype-string.c index c2789a2422..017a71fb94 100644 --- a/epan/ftypes/ftype-string.c +++ b/epan/ftypes/ftype-string.c @@ -15,6 +15,23 @@ #include #include + +#ifdef WS_DEBUG_UTF_8 +static inline void +string_validate_utf8(fvalue_t *fv) +{ + if (wmem_strbuf_sanitize_utf8(fv->value.strbuf)) { + ws_warning("String fvalues must use a valid UTF-8 encoding." + " This string has been sanitized to look like this: %s", + wmem_strbuf_get_str(fv->value.strbuf)); + } +} +#define CHECK_UTF_8(fv) string_validate_utf8(fv) +#else /* !WS_DEBUG_UTF_8 */ +#define CHECK_UTF_8(fv) (void)(fv) +#endif /* WS_DEBUG_UTF_8 */ + + static void string_fvalue_new(fvalue_t *fv) { @@ -42,6 +59,7 @@ string_fvalue_set_strbuf(fvalue_t *fv, wmem_strbuf_t *value) string_fvalue_free(fv); fv->value.strbuf = value; + CHECK_UTF_8(fv); } static char * @@ -73,20 +91,19 @@ val_from_string(fvalue_t *fv, const char *s, size_t len, gchar **err_msg _U_) fv->value.strbuf = wmem_strbuf_new_len(NULL, s, len); else fv->value.strbuf = wmem_strbuf_new(NULL, s); + + CHECK_UTF_8(fv); return TRUE; } static gboolean -val_from_literal(fvalue_t *fv, const char *s, gboolean allow_partial_value _U_, gchar **err_msg _U_) +val_from_literal(fvalue_t *fv, const char *s, gboolean allow_partial_value _U_, gchar **err_msg) { /* Just turn it into a string */ /* XXX Should probably be a syntax error instead. It's more user-friendly to ask the * user to be explicit about the meaning of an unquoted literal than them trying to figure out * why a valid filter expression is giving wrong results. */ - string_fvalue_free(fv); - - fv->value.strbuf = wmem_strbuf_new(NULL, s); - return TRUE; + return val_from_string(fv, s, 0, err_msg); } static gboolean diff --git a/epan/proto.c b/epan/proto.c index d2b9737510..c66aed6aca 100644 --- a/epan/proto.c +++ b/epan/proto.c @@ -5002,17 +5002,6 @@ proto_tree_add_string(proto_tree *tree, int hfindex, tvbuff_t *tvb, gint start, pi = proto_tree_add_pi(tree, hfinfo, tvb, start, &length); DISSECTOR_ASSERT(length >= 0); - /* XXX: We could have a preference or a define to turn off - * validation (which is slightly slow) and trust subdissectors - * to validate strings passed in. - */ - if (value) { - if (!g_utf8_validate(value, -1, NULL)) { - proto_tree_set_string(PNODE_FINFO(pi), get_utf_8_string(PNODE_POOL(tree), value, (int)(strlen(value)))); - /* We could turn this assertion off, and just sanitize */ - DISSECTOR_ASSERT_HINT(0, "Unsanitized UTF-8 string"); - } - } proto_tree_set_string(PNODE_FINFO(pi), value); return pi; @@ -5062,6 +5051,7 @@ static void proto_tree_set_string(field_info *fi, const char* value) { if (value) { + /* String must be valid UTF-8. It is sanitized otherwise (if enabled at compile time). */ fvalue_set_string(&fi->value, value); } else { /* diff --git a/ui/version_info.c b/ui/version_info.c index 80c33e6b5b..40b1950614 100644 --- a/ui/version_info.c +++ b/ui/version_info.c @@ -215,6 +215,10 @@ get_compiled_version_info(gather_feature_func gather_compile) g_string_append(str, ", without assertions"); #endif +#ifdef WS_DEBUG_UTF_8 + g_string_append(str, ", with UTF-8 validation"); +#endif + g_string_append(str, "."); end_string(str); free_features(&l); diff --git a/wsutil/wmem/wmem_strbuf.c b/wsutil/wmem/wmem_strbuf.c index 5164129bcb..adee649789 100644 --- a/wsutil/wmem/wmem_strbuf.c +++ b/wsutil/wmem/wmem_strbuf.c @@ -331,6 +331,27 @@ wmem_strbuf_destroy(wmem_strbuf_t *strbuf) wmem_free(strbuf->allocator, strbuf); } +bool +wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf) +{ + if (g_utf8_validate(strbuf->str, -1, NULL)) { + return false; + } + + /* Sanitize the contents to a temporary string. */ + char *tmp = g_utf8_make_valid(strbuf->str, -1); + + /* Reset the strbuf, keeping the backing memory allocation */ + *strbuf->str = '\0'; + strbuf->len = 0; + + /* Copy the temporary string to the strbuf. */ + wmem_strbuf_append(strbuf, tmp); + g_free(tmp); + + return true; +} + /* * Editor modelines - https://www.wireshark.org/tools/modelines.html * diff --git a/wsutil/wmem/wmem_strbuf.h b/wsutil/wmem/wmem_strbuf.h index b9b8a818d7..efcde4a479 100644 --- a/wsutil/wmem/wmem_strbuf.h +++ b/wsutil/wmem/wmem_strbuf.h @@ -12,8 +12,7 @@ #ifndef __WMEM_STRBUF_H__ #define __WMEM_STRBUF_H__ -#include -#include +#include #include "wmem_core.h" @@ -138,6 +137,17 @@ WS_DLL_PUBLIC void wmem_strbuf_destroy(wmem_strbuf_t *strbuf); +/** Check the UTF-8 encoded strbuf for validity and sanitize the contents if needed, + * by replacing encoding errors with unicode replacement character. This function is + * intended for debugging purposes and is not optimized for speed. + * + * @param strbuf the strbuf to validate + * @return true if the string was sanitized, false otherwise + */ +WS_DLL_PUBLIC +bool +wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf); + /** @} * @} */