wsutil: Improve UTF-8 APIs for debugging

In particular, add a UTF-8-specific wslog API that should make it easier to interpret invalid encodings.
2022-10-03 10:49:07 +01:00 · 2022-10-03 10:49:07 +01:00 · 51320ae59b
parent f9aba04431
commit 51320ae59b
7 changed files with 167 additions and 34 deletions
--- a/epan/ftypes/ftype-string.c
+++ b/epan/ftypes/ftype-string.c
@ -14,23 +14,7 @@

 #include <strutil.h>
 #include <wsutil/ws_assert.h>
-
-
-#ifdef WS_DEBUG_UTF_8
-static inline void
-string_validate_utf8(fvalue_t *fv)
-{
-	if (wmem_strbuf_sanitize_utf8(fv->value.strbuf)) {
-		ws_log_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, __FILE__, -1, __func__,
-				"String fvalues must use a valid UTF-8 encoding."
-				" This string has been sanitized to look like this: %s",
-				wmem_strbuf_get_str(fv->value.strbuf));
-	}
-}
-#define CHECK_UTF_8(fv) string_validate_utf8(fv)
-#else /* !WS_DEBUG_UTF_8 */
-#define CHECK_UTF_8(fv)  (void)(fv)
-#endif /* WS_DEBUG_UTF_8 */
+#include <wsutil/unicode-utils.h>


 static void
@ -60,7 +44,7 @@ string_fvalue_set_strbuf(fvalue_t *fv, wmem_strbuf_t *value)
 	string_fvalue_free(fv);

 	fv->value.strbuf = value;
-	CHECK_UTF_8(fv);
+	WS_UTF_8_SANITIZE_STRBUF(fv->value.strbuf);
 }

 static char *
@ -93,7 +77,7 @@ val_from_string(fvalue_t *fv, const char *s, size_t len, gchar **err_msg _U_)
 	else
 		fv->value.strbuf = wmem_strbuf_new(NULL, s);

-	CHECK_UTF_8(fv);
+	WS_UTF_8_SANITIZE_STRBUF(fv->value.strbuf);
 	return TRUE;
 }

--- a/wsutil/unicode-utils.h
+++ b/wsutil/unicode-utils.h
@ -28,6 +28,37 @@
 extern "C" {
 #endif

+/*
+ * Boolean constant mirroring the WS_DEBUG_UTF_8 build switch, so the
+ * checks below can be written as ordinary `if` conditions.
+ */
+#ifndef WS_DEBUG_UTF_8
+#define DEBUG_UTF_8_ENABLED false
+#else
+#define DEBUG_UTF_8_ENABLED true
+#endif
+
+/*
+ * Validate `str` with g_utf8_validate(str, len, ...) and, if the encoding
+ * is invalid, report it in the UTF-8 log domain at the requested `level`.
+ * The test is short-circuited by DEBUG_UTF_8_ENABLED, so nothing is
+ * validated unless WS_DEBUG_UTF_8 is defined.
+ *
+ * The report is made with ws_log_utf8_full() so that `level` is honoured;
+ * ws_log_utf8() always logs at LOG_LEVEL_DEBUG, which made the `level`
+ * argument (and therefore WS_UTF_8_DEBUG_HERE) ineffective.
+ *
+ * Note: `str` and `len` are evaluated twice when the string is invalid.
+ */
+#define _CHECK_UTF_8(level, str, len) \
+  do {                                                                \
+    const char *__uni_endptr;                                         \
+    if (DEBUG_UTF_8_ENABLED &&                                        \
+                    !g_utf8_validate((str), (len), &__uni_endptr)) {  \
+      ws_log_utf8_full(LOG_DOMAIN_UTF_8, (level),                     \
+                        __FILE__, __LINE__, __func__,                 \
+                        (str), (len), __uni_endptr);                  \
+    }                                                                 \
+  } while (0)
+
+/* Run _CHECK_UTF_8 on (str, len), requesting LOG_LEVEL_DEBUG. */
+#define WS_UTF_8_CHECK(str, len) _CHECK_UTF_8(LOG_LEVEL_DEBUG, str, len)
+
+/* Run _CHECK_UTF_8 on (str, len), requesting LOG_LEVEL_ECHO. */
+#define WS_UTF_8_DEBUG_HERE(str, len) _CHECK_UTF_8(LOG_LEVEL_ECHO, str, len)
+
+/*
+ * Debugging aid for string buffers: if `buf` does not contain valid UTF-8,
+ * log the offending bytes with ws_log_utf8() and then rewrite the buffer
+ * with wmem_strbuf_utf8_make_valid().
+ *
+ * Gated on DEBUG_UTF_8_ENABLED, like _CHECK_UTF_8, so that builds without
+ * WS_DEBUG_UTF_8 neither pay for validating every string nor have their
+ * buffers modified.
+ *
+ * Note: `buf` is evaluated more than once; pass a side-effect-free
+ * expression.
+ */
+#define WS_UTF_8_SANITIZE_STRBUF(buf) \
+  do {                                                          \
+    const char *__uni_endptr;                                   \
+    if (DEBUG_UTF_8_ENABLED &&                                  \
+            !wmem_strbuf_utf8_validate((buf), &__uni_endptr)) { \
+      ws_log_utf8((buf)->str, (buf)->len, __uni_endptr);        \
+      wmem_strbuf_utf8_make_valid(buf);                         \
+    }                                                           \
+  } while (0)
+
+
 WS_DLL_PUBLIC
 int ws_utf8_char_len(guint8 ch);

--- a/wsutil/wmem/wmem_strbuf.c
+++ b/wsutil/wmem/wmem_strbuf.c
@ -252,6 +252,23 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
    }
 }

+/* Upper-case hexadecimal digits, indexed by nibble value. */
+static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
+                              '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+
+/*
+ * Append the byte `ch` to the strbuf as the four characters "\xNN", where
+ * NN are upper-case hex digits. If the strbuf has a maximum size and fewer
+ * than four bytes of room remain after growing, nothing is appended.
+ */
+void
+wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t ch)
+{
+    char *dst;
+
+    wmem_strbuf_grow(strbuf, 4);
+
+    /* A size-limited buffer that cannot hold the whole escape is left alone. */
+    if (strbuf->max_size && WMEM_STRBUF_ROOM(strbuf) < 4) {
+        return;
+    }
+
+    /* Take the write position only after growing; the storage may move. */
+    dst = &strbuf->str[strbuf->len];
+    dst[0] = '\\';
+    dst[1] = 'x';
+    dst[2] = hex[(ch >> 4) & 0xF];
+    dst[3] = hex[ch & 0xF];
+    strbuf->len += 4;
+    strbuf->str[strbuf->len] = '\0';
+}
+
 void
 wmem_strbuf_truncate(wmem_strbuf_t *strbuf, const size_t len)
 {
@ -332,14 +349,16 @@ wmem_strbuf_destroy(wmem_strbuf_t *strbuf)
 }

 bool
-wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf)
+wmem_strbuf_utf8_validate(wmem_strbuf_t *strbuf, const char **endptr)
 {
-    if (g_utf8_validate(strbuf->str, -1, NULL)) {
-        return false;
-    }
+    return g_utf8_validate(strbuf->str, strbuf->len, endptr);
+}

+void
+wmem_strbuf_utf8_make_valid(wmem_strbuf_t *strbuf)
+{
    /* Sanitize the contents to a temporary string. */
-    char *tmp = g_utf8_make_valid(strbuf->str, -1);
+    char *tmp = g_utf8_make_valid(strbuf->str, strbuf->len);

    /* Reset the strbuf, keeping the backing memory allocation */
    *strbuf->str = '\0';
@ -348,8 +367,6 @@ wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf)
    /* Copy the temporary string to the strbuf. */
    wmem_strbuf_append(strbuf, tmp);
    g_free(tmp);
-
-    return true;
 }

 /*
--- a/wsutil/wmem/wmem_strbuf.h
+++ b/wsutil/wmem/wmem_strbuf.h
@ -104,6 +104,10 @@ WS_DLL_PUBLIC
 void
 wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c);

+/** Append a byte to the strbuf as a four-character "\xNN" escape, using
+ * upper-case hexadecimal digits. Nothing is appended if the strbuf has a
+ * maximum size and fewer than four bytes of room are available.
+ *
+ * @param strbuf the strbuf to append to
+ * @param ch the byte value to append in escaped form
+ */
+WS_DLL_PUBLIC
+void
+wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t ch);
+
 WS_DLL_PUBLIC
 void
 wmem_strbuf_truncate(wmem_strbuf_t *strbuf, const size_t len);
@ -137,16 +141,13 @@ WS_DLL_PUBLIC
 void
 wmem_strbuf_destroy(wmem_strbuf_t *strbuf);

-/** Check the UTF-8 encoded strbuf for validity and sanitize the contents if needed,
- * by replacing encoding errors with unicode replacement character. This function is
- * intended for debugging purposes and is not optimized for speed.
- *
- * @param strbuf the strbuf to validate
- * @return true if the string was sanitized, false otherwise
- */
+/** Check whether the contents of the strbuf are valid UTF-8. This is a thin
+ * wrapper around g_utf8_validate() applied to the strbuf's data and length.
+ *
+ * @param strbuf the strbuf to validate
+ * @param endptr forwarded to g_utf8_validate() as its end-pointer output
+ * @return the result of g_utf8_validate()
+ */
 WS_DLL_PUBLIC
 bool
-wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf);
+wmem_strbuf_utf8_validate(wmem_strbuf_t *strbuf, const char **endptr);
+
+/** Replace the contents of the strbuf with the output of g_utf8_make_valid()
+ * run over its current data. The strbuf is reset and the sanitized copy is
+ * appended to it; the temporary copy is freed before returning.
+ *
+ * @param strbuf the strbuf to sanitize in place
+ */
+WS_DLL_PUBLIC
+void
+wmem_strbuf_utf8_make_valid(wmem_strbuf_t *strbuf);

 /**   @}
 *  @} */
--- a/wsutil/ws_assert.h
+++ b/wsutil/ws_assert.h
@ -58,6 +58,17 @@ extern "C" {
 #define ws_assert_streq(s1, s2) \
        ws_assert((s1) && (s2) && strcmp((s1), (s2)) == 0)

+/*
+ * Assert that (str, len) is valid UTF-8 according to g_utf8_validate().
+ * On failure the string is reported through ws_log_utf8_full() in the
+ * UTF-8 log domain at LOG_LEVEL_ERROR. Nothing is validated when
+ * _ASSERT_ENABLED is false.
+ */
+#define ws_assert_utf8(str, len) \
+        do {                                                            \
+            if (_ASSERT_ENABLED) {                                      \
+                const char *__assert_endptr;                            \
+                if (!g_utf8_validate(str, len, &__assert_endptr)) {     \
+                    ws_log_utf8_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_ERROR, \
+                                    __FILE__, __LINE__, __func__,       \
+                                    str, len, __assert_endptr);         \
+                }                                                       \
+            }                                                           \
+        } while (0)
+
 /*
 * We don't want to disable ws_assert_not_reached() with WS_DISABLE_ASSERT.
 * That would blast compiler warnings everywhere for no benefit, not
--- a/wsutil/wslog.c
+++ b/wsutil/wslog.c
@ -1124,6 +1124,78 @@ void ws_log_write_always_full(const char *domain, enum ws_log_level level,
 }


+/*
+ * Build a two-line rendering of `src` for the invalid-UTF-8 log message.
+ *
+ * Line 1: every byte of `src`. Within the first `good_length` bytes, ASCII
+ *         alphanumerics and spaces are shown as-is; every other byte, and
+ *         every byte from `good_length` onwards, is shown as a "\xNN" escape.
+ * Line 2: spaces up to the column of the byte at `good_length`, "^^^^"
+ *         under that byte, and "~~~~" under each byte after it.
+ *
+ * Returns the finalized strbuf contents; the caller releases them.
+ */
+static char *
+make_utf8_display(const char *src, size_t src_length, size_t good_length)
+{
+    wmem_strbuf_t *out = wmem_strbuf_new(NULL, NULL);
+    size_t marker_col = 0;
+    size_t i;
+
+    for (i = 0; i < src_length; i++) {
+        char byte = src[i];
+
+        /* Bytes at or past the error position are always escaped and do
+         * not move the marker column. */
+        if (i >= good_length) {
+            wmem_strbuf_append_hex(out, byte);
+            continue;
+        }
+
+        if (g_ascii_isalnum(byte) || byte == ' ') {
+            wmem_strbuf_append_c(out, byte);
+            marker_col += 1;
+        }
+        else {
+            wmem_strbuf_append_hex(out, byte);
+            marker_col += 4;
+        }
+    }
+
+    wmem_strbuf_append_c(out, '\n');
+
+    /* Indent the marker line so the carets sit under the first bad byte. */
+    for (i = 0; i < marker_col; i++) {
+        wmem_strbuf_append_c(out, ' ');
+    }
+    wmem_strbuf_append(out, "^^^^");
+
+    /* One four-character filler per remaining escaped byte. */
+    for (i = good_length + 1; i < src_length; i++) {
+        wmem_strbuf_append(out, "~~~~");
+    }
+
+    return wmem_strbuf_finalize(out);
+}
+
+
+/*
+ * Log a message describing an invalid UTF-8 string: its address, the offset
+ * of the first invalid byte, its length, and an escaped rendering with the
+ * bad byte marked (see make_utf8_display()).
+ *
+ * `_length` is the length of `string` in bytes, or negative if `string` is
+ * NUL-terminated. `endptr` should point at the first invalid byte, as
+ * returned by g_utf8_validate(); if it is NULL or does not point inside
+ * `string`, the position is recomputed here. Nothing is logged when the
+ * domain/level is inactive or the string turns out to be valid.
+ */
+void ws_log_utf8_full(const char *domain, enum ws_log_level level,
+                    const char *file, long line, const char *func,
+                    const char *string, ssize_t _length, const char *endptr)
+{
+    if (!ws_log_msg_is_active(domain, level))
+        return;
+
+    char *display;
+    size_t length;
+    size_t good_length;
+
+    if (_length < 0)
+        length = strlen(string);
+    else
+        length = _length;
+
+    /* Do not trust an end pointer that lies outside [string, string+length];
+     * it would yield an offset larger than the string itself. */
+    if (endptr == NULL || endptr < string || endptr > string + length) {
+        /* Find the pointer to the first invalid byte. */
+        if (g_utf8_validate(string, length, &endptr)) {
+            /* Valid string - should not happen. */
+            return;
+        }
+    }
+    good_length = endptr - string;
+
+    display = make_utf8_display(string, length, good_length);
+
+    /* %p requires a pointer to void. */
+    ws_log_write_always_full(domain, level, file, line, func,
+            "Invalid UTF-8 at address %p offset %zu (length = %zu):\n%s",
+            (const void *)string, good_length, length, display);
+
+    g_free(display);
+}
+
+
 void ws_log_buffer_full(const char *domain, enum ws_log_level level,
                    const char *file, long line, const char *func,
                    const uint8_t *ptr, size_t size,  size_t max_bytes_len,
--- a/wsutil/wslog.h
+++ b/wsutil/wslog.h
@ -22,6 +22,7 @@
 #include <ws_symbol_export.h>
 #include <ws_attributes.h>
 #include <ws_log_defs.h>
+#include <ws_posix_compat.h>

 #ifdef WS_LOG_DOMAIN
 #define _LOG_DOMAIN WS_LOG_DOMAIN
@ -368,6 +369,22 @@ void ws_log_fatal_full(const char *domain, enum ws_log_level level,
        _LOG_FULL(true, LOG_LEVEL_ECHO, __VA_ARGS__)


+/** Log a report about an invalid UTF-8 string.
+ *
+ * The message gives the string's address, the offset of the first invalid
+ * byte and the string length, followed by an escaped rendering of the bytes
+ * with the invalid position marked. Nothing is logged if the domain/level
+ * is not active, or if the string is found to be valid.
+ *
+ * @param domain the log domain
+ * @param level the log level
+ * @param file source file name of the caller
+ * @param line source line of the caller
+ * @param func function name of the caller
+ * @param string the string to report
+ * @param length length of the string in bytes; if negative, strlen() is used
+ * @param endptr pointer to the first invalid byte; if NULL or before the
+ *        start of the string, it is computed with g_utf8_validate()
+ */
+WS_DLL_PUBLIC
+void ws_log_utf8_full(const char *domain, enum ws_log_level level,
+                    const char *file, long line, const char *func,
+                    const char *string, ssize_t length, const char *endptr);
+
+
+/*
+ * Convenience wrapper around ws_log_utf8_full() that reports (str, len,
+ * endptr) in the UTF-8 log domain at LOG_LEVEL_DEBUG, tagged with the
+ * caller's file, line and function. Does nothing when _LOG_DEBUG_ENABLED
+ * is false.
+ */
+#define ws_log_utf8(str, len, endptr) \
+    do {                                                        \
+        if (!(_LOG_DEBUG_ENABLED))                              \
+            break;                                              \
+        ws_log_utf8_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG,     \
+                            __FILE__, __LINE__, __func__,       \
+                            str, len, endptr);                  \
+    } while (0)
+
+
 /** This function is called to log a buffer (bytes array).
 *
 * Accepts an optional 'msg' argument to provide a description.