Add some UTF-8 debug checks with a compile time flag

Some older dissectors that predate Unicode and parse text protocols
are prone to generate invalid UTF-8 strings. This is a bug and can have
safety implications.

For example passing invalid UTF-8 to proto_tree_add_string() is a
common bug. There are safeguards in format_text() but this should
not be relied on as a general solution to the problem.

For one, as the name implies, it is only used with representation of a
field value, which is not the same as the value itself of an FT_STRING field.
Issue #18317 shows another reason why.

For now this compile flag only enables extra checks for string ftypes,
which covers a subset of proto.h APIs including
proto_tree_append_string(). Later is should be extended to other
interfaces.

This is also not expected to be disabled for release builds because
there are still many dissectors that do not correctly handle strings.
More work is needed to 1) identify them and 2) fix them.

Ping #18317
This commit is contained in:
João Valverde 2022-09-26 22:39:01 +01:00
parent 7b53fd127e
commit 6d06d4e46b
7 changed files with 63 additions and 18 deletions

View File

@ -466,6 +466,7 @@ set_property(DIRECTORY
"G_DISABLE_SINGLE_INCLUDES"
$<$<OR:$<NOT:$<BOOL:${ENABLE_ASSERT}>>,$<CONFIG:Release>>:WS_DISABLE_ASSERT>
$<$<OR:$<NOT:$<BOOL:${ENABLE_DEBUG}>>,$<CONFIG:Release>>:WS_DISABLE_DEBUG>
$<$<BOOL:${ENABLE_DEBUG_MBS}>:WS_DEBUG_UTF_8>
)
if(WIN32)

View File

@ -54,6 +54,8 @@ option(BUILD_fuzzshark "Build fuzzshark" OFF)
option(ENABLE_WERROR "Treat warnings as errors" ON)
option(ENABLE_DEBUG "Enable debug code" ON)
option(ENABLE_ASSERT "Enable assertions" ON)
option(ENABLE_DEBUG_MBS "Enable extra debug checks for detecting invalid multibyte (UTF-8) strings" ON)
option(ENABLE_CCACHE "Speed up compiling and linking using ccache if possible" OFF)
option(DISABLE_FRAME_LARGER_THAN_WARNING "Disable warning if the size of a function frame is large" OFF)
option(EXTCAP_ANDROIDDUMP_LIBPCAP "Build androiddump using libpcap" OFF)

View File

@ -15,6 +15,23 @@
#include <strutil.h>
#include <wsutil/ws_assert.h>
#ifdef WS_DEBUG_UTF_8
static inline void
string_validate_utf8(fvalue_t *fv)
{
if (wmem_strbuf_sanitize_utf8(fv->value.strbuf)) {
ws_warning("String fvalues must use a valid UTF-8 encoding."
" This string has been sanitized to look like this: %s",
wmem_strbuf_get_str(fv->value.strbuf));
}
}
#define CHECK_UTF_8(fv) string_validate_utf8(fv)
#else /* !WS_DEBUG_UTF_8 */
#define CHECK_UTF_8(fv) (void)(fv)
#endif /* WS_DEBUG_UTF_8 */
static void
string_fvalue_new(fvalue_t *fv)
{
@ -42,6 +59,7 @@ string_fvalue_set_strbuf(fvalue_t *fv, wmem_strbuf_t *value)
string_fvalue_free(fv);
fv->value.strbuf = value;
CHECK_UTF_8(fv);
}
static char *
@ -73,20 +91,19 @@ val_from_string(fvalue_t *fv, const char *s, size_t len, gchar **err_msg _U_)
fv->value.strbuf = wmem_strbuf_new_len(NULL, s, len);
else
fv->value.strbuf = wmem_strbuf_new(NULL, s);
CHECK_UTF_8(fv);
return TRUE;
}
static gboolean
val_from_literal(fvalue_t *fv, const char *s, gboolean allow_partial_value _U_, gchar **err_msg _U_)
val_from_literal(fvalue_t *fv, const char *s, gboolean allow_partial_value _U_, gchar **err_msg)
{
/* Just turn it into a string */
/* XXX Should probably be a syntax error instead. It's more user-friendly to ask the
* user to be explicit about the meaning of an unquoted literal than them trying to figure out
* why a valid filter expression is giving wrong results. */
string_fvalue_free(fv);
fv->value.strbuf = wmem_strbuf_new(NULL, s);
return TRUE;
return val_from_string(fv, s, 0, err_msg);
}
static gboolean

View File

@ -5002,17 +5002,6 @@ proto_tree_add_string(proto_tree *tree, int hfindex, tvbuff_t *tvb, gint start,
pi = proto_tree_add_pi(tree, hfinfo, tvb, start, &length);
DISSECTOR_ASSERT(length >= 0);
/* XXX: We could have a preference or a define to turn off
* validation (which is slightly slow) and trust subdissectors
* to validate strings passed in.
*/
if (value) {
if (!g_utf8_validate(value, -1, NULL)) {
proto_tree_set_string(PNODE_FINFO(pi), get_utf_8_string(PNODE_POOL(tree), value, (int)(strlen(value))));
/* We could turn this assertion off, and just sanitize */
DISSECTOR_ASSERT_HINT(0, "Unsanitized UTF-8 string");
}
}
proto_tree_set_string(PNODE_FINFO(pi), value);
return pi;
@ -5062,6 +5051,7 @@ static void
proto_tree_set_string(field_info *fi, const char* value)
{
if (value) {
/* String must be valid UTF-8. It is sanitized otherwise (if enabled at compile time). */
fvalue_set_string(&fi->value, value);
} else {
/*

View File

@ -215,6 +215,10 @@ get_compiled_version_info(gather_feature_func gather_compile)
g_string_append(str, ", without assertions");
#endif
#ifdef WS_DEBUG_UTF_8
g_string_append(str, ", with UTF-8 validation");
#endif
g_string_append(str, ".");
end_string(str);
free_features(&l);

View File

@ -331,6 +331,27 @@ wmem_strbuf_destroy(wmem_strbuf_t *strbuf)
wmem_free(strbuf->allocator, strbuf);
}
bool
wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf)
{
if (g_utf8_validate(strbuf->str, -1, NULL)) {
return false;
}
/* Sanitize the contents to a temporary string. */
char *tmp = g_utf8_make_valid(strbuf->str, -1);
/* Reset the strbuf, keeping the backing memory allocation */
*strbuf->str = '\0';
strbuf->len = 0;
/* Copy the temporary string to the strbuf. */
wmem_strbuf_append(strbuf, tmp);
g_free(tmp);
return true;
}
/*
* Editor modelines - https://www.wireshark.org/tools/modelines.html
*

View File

@ -12,8 +12,7 @@
#ifndef __WMEM_STRBUF_H__
#define __WMEM_STRBUF_H__
#include <string.h>
#include <glib.h>
#include <wireshark.h>
#include "wmem_core.h"
@ -138,6 +137,17 @@ WS_DLL_PUBLIC
void
wmem_strbuf_destroy(wmem_strbuf_t *strbuf);
/** Check the UTF-8 encoded strbuf for validity and sanitize the contents if needed,
* by replacing encoding errors with unicode replacement character. This function is
* intended for debugging purposes and is not optimized for speed.
*
* @param strbuf the strbuf to validate
* @return true if the string was sanitized, false otherwise
*/
WS_DLL_PUBLIC
bool
wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf);
/** @}
* @} */