From 51320ae59b80f16a60300e8041c225791f5af409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Valverde?= Date: Mon, 3 Oct 2022 10:49:07 +0100 Subject: [PATCH] wsutil: Improve UTF-8 APIs for debugging In particular add an UTF-8 specific wslog API that should make it easier to interpret invalid encodings. --- epan/ftypes/ftype-string.c | 22 ++---------- wsutil/unicode-utils.h | 31 ++++++++++++++++ wsutil/wmem/wmem_strbuf.c | 31 ++++++++++++---- wsutil/wmem/wmem_strbuf.h | 17 ++++----- wsutil/ws_assert.h | 11 ++++++ wsutil/wslog.c | 72 ++++++++++++++++++++++++++++++++++++++ wsutil/wslog.h | 17 +++++++++ 7 files changed, 167 insertions(+), 34 deletions(-) diff --git a/epan/ftypes/ftype-string.c b/epan/ftypes/ftype-string.c index 649748ce8c..71da46b8f6 100644 --- a/epan/ftypes/ftype-string.c +++ b/epan/ftypes/ftype-string.c @@ -14,23 +14,7 @@ #include #include - - -#ifdef WS_DEBUG_UTF_8 -static inline void -string_validate_utf8(fvalue_t *fv) -{ - if (wmem_strbuf_sanitize_utf8(fv->value.strbuf)) { - ws_log_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, __FILE__, -1, __func__, - "String fvalues must use a valid UTF-8 encoding." - " This string has been sanitized to look like this: %s", - wmem_strbuf_get_str(fv->value.strbuf)); - } -} -#define CHECK_UTF_8(fv) string_validate_utf8(fv) -#else /* !WS_DEBUG_UTF_8 */ -#define CHECK_UTF_8(fv) (void)(fv) -#endif /* WS_DEBUG_UTF_8 */ +#include static void @@ -60,7 +44,7 @@ string_fvalue_set_strbuf(fvalue_t *fv, wmem_strbuf_t *value) string_fvalue_free(fv); fv->value.strbuf = value; - CHECK_UTF_8(fv); + WS_UTF_8_SANITIZE_STRBUF(fv->value.strbuf); } static char * @@ -93,7 +77,7 @@ val_from_string(fvalue_t *fv, const char *s, size_t len, gchar **err_msg _U_) else fv->value.strbuf = wmem_strbuf_new(NULL, s); - CHECK_UTF_8(fv); + WS_UTF_8_SANITIZE_STRBUF(fv->value.strbuf); return TRUE; } diff --git a/wsutil/unicode-utils.h b/wsutil/unicode-utils.h index bdbc48c380..2ffb90ad32 100644 --- a/wsutil/unicode-utils.h +++ b/wsutil/unicode-utils.h @@ -28,6 +28,37 @@ extern "C" { #endif +#ifdef WS_DEBUG_UTF_8 +#define DEBUG_UTF_8_ENABLED true +#else +#define DEBUG_UTF_8_ENABLED false +#endif + +#define _CHECK_UTF_8(level, str, len) \ + do { \ + const char *__uni_endptr; \ + if (DEBUG_UTF_8_ENABLED && \ + !g_utf8_validate(str, len, &__uni_endptr)) { \ + ws_log_utf8(str, len, __uni_endptr); \ + } \ + } while (0) + +#define WS_UTF_8_CHECK(str, len) \ + _CHECK_UTF_8(LOG_LEVEL_DEBUG, str, len) + +#define WS_UTF_8_DEBUG_HERE(str, len) \ + _CHECK_UTF_8(LOG_LEVEL_ECHO, str, len) + +#define WS_UTF_8_SANITIZE_STRBUF(buf) \ + do { \ + const char *__uni_endptr; \ + if (!wmem_strbuf_utf8_validate(buf, &__uni_endptr)) { \ + ws_log_utf8(buf->str, buf->len, __uni_endptr); \ + wmem_strbuf_utf8_make_valid(buf); \ + } \ + } while (0) + + WS_DLL_PUBLIC int ws_utf8_char_len(guint8 ch); diff --git a/wsutil/wmem/wmem_strbuf.c b/wsutil/wmem/wmem_strbuf.c index 00edf26264..bb1ea41c83 100644 --- a/wsutil/wmem/wmem_strbuf.c +++ b/wsutil/wmem/wmem_strbuf.c @@ -252,6 +252,23 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c) } } +static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; + +void +wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t ch) +{ + wmem_strbuf_grow(strbuf, 4); + + if (!strbuf->max_size || WMEM_STRBUF_ROOM(strbuf) >= 4) { + strbuf->str[strbuf->len++] = '\\'; + strbuf->str[strbuf->len++] = 'x'; + strbuf->str[strbuf->len++] = hex[(ch >> 4) & 0xF]; + strbuf->str[strbuf->len++] = hex[(ch >> 0) & 0xF]; + strbuf->str[strbuf->len] = '\0'; + } +} + void wmem_strbuf_truncate(wmem_strbuf_t *strbuf, const size_t len) { @@ -332,14 +349,16 @@ wmem_strbuf_destroy(wmem_strbuf_t *strbuf) } bool -wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf) +wmem_strbuf_utf8_validate(wmem_strbuf_t *strbuf, const char **endptr) { - if (g_utf8_validate(strbuf->str, -1, NULL)) { - return false; - } + return g_utf8_validate(strbuf->str, strbuf->len, endptr); +} +void +wmem_strbuf_utf8_make_valid(wmem_strbuf_t *strbuf) +{ /* Sanitize the contents to a temporary string. */ - char *tmp = g_utf8_make_valid(strbuf->str, -1); + char *tmp = g_utf8_make_valid(strbuf->str, strbuf->len); /* Reset the strbuf, keeping the backing memory allocation */ *strbuf->str = '\0'; @@ -348,8 +367,6 @@ wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf) /* Copy the temporary string to the strbuf. */ wmem_strbuf_append(strbuf, tmp); g_free(tmp); - - return true; } /* diff --git a/wsutil/wmem/wmem_strbuf.h b/wsutil/wmem/wmem_strbuf.h index c691eeb160..e9acc215c7 100644 --- a/wsutil/wmem/wmem_strbuf.h +++ b/wsutil/wmem/wmem_strbuf.h @@ -104,6 +104,10 @@ WS_DLL_PUBLIC void wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c); +WS_DLL_PUBLIC +void +wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t); + WS_DLL_PUBLIC void wmem_strbuf_truncate(wmem_strbuf_t *strbuf, const size_t len); @@ -137,16 +141,13 @@ WS_DLL_PUBLIC void wmem_strbuf_destroy(wmem_strbuf_t *strbuf); -/** Check the UTF-8 encoded strbuf for validity and sanitize the contents if needed, - * by replacing encoding errors with unicode replacement character. This function is - * intended for debugging purposes and is not optimized for speed. - * - * @param strbuf the strbuf to validate - * @return true if the string was sanitized, false otherwise - */ WS_DLL_PUBLIC bool -wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf); +wmem_strbuf_utf8_validate(wmem_strbuf_t *strbuf, const char **endptr); + +WS_DLL_PUBLIC +void +wmem_strbuf_utf8_make_valid(wmem_strbuf_t *strbuf); /** @} * @} */ diff --git a/wsutil/ws_assert.h b/wsutil/ws_assert.h index f65293cdd2..4d6531c735 100644 --- a/wsutil/ws_assert.h +++ b/wsutil/ws_assert.h @@ -58,6 +58,17 @@ extern "C" { #define ws_assert_streq(s1, s2) \ ws_assert((s1) && (s2) && strcmp((s1), (s2)) == 0) +#define ws_assert_utf8(str, len) \ + do { \ + const char *__assert_endptr; \ + if (_ASSERT_ENABLED && \ + !g_utf8_validate(str, len, &__assert_endptr)) { \ + ws_log_utf8_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_ERROR, \ + __FILE__, __LINE__, __func__, \ + str, len, __assert_endptr); \ + } \ + } while (0) + /* * We don't want to disable ws_assert_not_reached() with WS_DISABLE_ASSERT. * That would blast compiler warnings everywhere for no benefit, not diff --git a/wsutil/wslog.c b/wsutil/wslog.c index 139c5a6706..3f0a53ca88 100644 --- a/wsutil/wslog.c +++ b/wsutil/wslog.c @@ -1124,6 +1124,78 @@ void ws_log_write_always_full(const char *domain, enum ws_log_level level, } +static char * +make_utf8_display(const char *src, size_t src_length, size_t good_length) +{ + wmem_strbuf_t *buf; + char ch; + size_t offset = 0; + + buf = wmem_strbuf_new(NULL, NULL); + + for (size_t pos = 0; pos < src_length; pos++) { + ch = src[pos]; + if (pos < good_length) { + if (g_ascii_isalnum(ch) || ch == ' ') { + wmem_strbuf_append_c(buf, ch); + offset += 1; + } + else { + wmem_strbuf_append_hex(buf, ch); + offset += 4; + } + } + else { + wmem_strbuf_append_hex(buf, ch); + } + } + wmem_strbuf_append_c(buf, '\n'); + for (size_t pos = 0; pos < offset; pos++) { + wmem_strbuf_append_c(buf, ' '); + } + wmem_strbuf_append(buf, "^^^^"); + for (size_t pos = good_length + 1; pos < src_length; pos++) { + wmem_strbuf_append(buf, "~~~~"); + } + return wmem_strbuf_finalize(buf); +} + + +void ws_log_utf8_full(const char *domain, enum ws_log_level level, + const char *file, long line, const char *func, + const char *string, ssize_t _length, const char *endptr) +{ + if (!ws_log_msg_is_active(domain, level)) + return; + + char *display; + size_t length; + size_t good_length; + + if (_length < 0) + length = strlen(string); + else + length = _length; + + if (endptr == NULL || endptr < string) { + /* Find the pointer to the first invalid byte. */ + if (g_utf8_validate(string, length, &endptr)) { + /* Valid string - should not happen. */ + return; + } + } + good_length = endptr - string; + + display = make_utf8_display(string, length, good_length); + + ws_log_write_always_full(domain, level, file, line, func, + "Invalid UTF-8 at address %p offset %zu (length = %zu):\n%s", + string, good_length, length, display); + + g_free(display); +} + + void ws_log_buffer_full(const char *domain, enum ws_log_level level, const char *file, long line, const char *func, const uint8_t *ptr, size_t size, size_t max_bytes_len, diff --git a/wsutil/wslog.h b/wsutil/wslog.h index e194604e13..9d7208fa75 100644 --- a/wsutil/wslog.h +++ b/wsutil/wslog.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef WS_LOG_DOMAIN #define _LOG_DOMAIN WS_LOG_DOMAIN @@ -368,6 +369,22 @@ void ws_log_fatal_full(const char *domain, enum ws_log_level level, _LOG_FULL(true, LOG_LEVEL_ECHO, __VA_ARGS__) +WS_DLL_PUBLIC +void ws_log_utf8_full(const char *domain, enum ws_log_level level, + const char *file, long line, const char *func, + const char *string, ssize_t length, const char *endptr); + + +#define ws_log_utf8(str, len, endptr) \ + do { \ + if (_LOG_DEBUG_ENABLED) { \ + ws_log_utf8_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, \ + __FILE__, __LINE__, __func__, \ + str, len, endptr); \ + } \ + } while (0) + + /** This function is called to log a buffer (bytes array). * * Accepts an optional 'msg' argument to provide a description.