wsutil: Improve UTF-8 APIs for debugging

In particular add an UTF-8 specific wslog API that should
make it easier to interpret invalid encodings.
This commit is contained in:
João Valverde 2022-10-03 10:49:07 +01:00
parent f9aba04431
commit 51320ae59b
7 changed files with 167 additions and 34 deletions

View File

@ -14,23 +14,7 @@
#include <strutil.h>
#include <wsutil/ws_assert.h>
#ifdef WS_DEBUG_UTF_8
static inline void
string_validate_utf8(fvalue_t *fv)
{
if (wmem_strbuf_sanitize_utf8(fv->value.strbuf)) {
ws_log_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, __FILE__, -1, __func__,
"String fvalues must use a valid UTF-8 encoding."
" This string has been sanitized to look like this: %s",
wmem_strbuf_get_str(fv->value.strbuf));
}
}
#define CHECK_UTF_8(fv) string_validate_utf8(fv)
#else /* !WS_DEBUG_UTF_8 */
#define CHECK_UTF_8(fv) (void)(fv)
#endif /* WS_DEBUG_UTF_8 */
#include <wsutil/unicode-utils.h>
static void
@ -60,7 +44,7 @@ string_fvalue_set_strbuf(fvalue_t *fv, wmem_strbuf_t *value)
string_fvalue_free(fv);
fv->value.strbuf = value;
CHECK_UTF_8(fv);
WS_UTF_8_SANITIZE_STRBUF(fv->value.strbuf);
}
static char *
@ -93,7 +77,7 @@ val_from_string(fvalue_t *fv, const char *s, size_t len, gchar **err_msg _U_)
else
fv->value.strbuf = wmem_strbuf_new(NULL, s);
CHECK_UTF_8(fv);
WS_UTF_8_SANITIZE_STRBUF(fv->value.strbuf);
return TRUE;
}

View File

@ -28,6 +28,37 @@
extern "C" {
#endif
#ifdef WS_DEBUG_UTF_8
#define DEBUG_UTF_8_ENABLED true
#else
#define DEBUG_UTF_8_ENABLED false
#endif
#define _CHECK_UTF_8(level, str, len) \
do { \
const char *__uni_endptr; \
if (DEBUG_UTF_8_ENABLED && \
!g_utf8_validate(str, len, &__uni_endptr)) { \
ws_log_utf8(str, len, __uni_endptr); \
} \
} while (0)
#define WS_UTF_8_CHECK(str, len) \
_CHECK_UTF_8(LOG_LEVEL_DEBUG, str, len)
#define WS_UTF_8_DEBUG_HERE(str, len) \
_CHECK_UTF_8(LOG_LEVEL_ECHO, str, len)
#define WS_UTF_8_SANITIZE_STRBUF(buf) \
do { \
const char *__uni_endptr; \
if (!wmem_strbuf_utf8_validate(buf, &__uni_endptr)) { \
ws_log_utf8(buf->str, buf->len, __uni_endptr); \
wmem_strbuf_utf8_make_valid(buf); \
} \
} while (0)
WS_DLL_PUBLIC
int ws_utf8_char_len(guint8 ch);

View File

@ -252,6 +252,23 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
}
}
static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
void
wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t ch)
{
wmem_strbuf_grow(strbuf, 4);
if (!strbuf->max_size || WMEM_STRBUF_ROOM(strbuf) >= 4) {
strbuf->str[strbuf->len++] = '\\';
strbuf->str[strbuf->len++] = 'x';
strbuf->str[strbuf->len++] = hex[(ch >> 4) & 0xF];
strbuf->str[strbuf->len++] = hex[(ch >> 0) & 0xF];
strbuf->str[strbuf->len] = '\0';
}
}
void
wmem_strbuf_truncate(wmem_strbuf_t *strbuf, const size_t len)
{
@ -332,14 +349,16 @@ wmem_strbuf_destroy(wmem_strbuf_t *strbuf)
}
bool
wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf)
wmem_strbuf_utf8_validate(wmem_strbuf_t *strbuf, const char **endptr)
{
if (g_utf8_validate(strbuf->str, -1, NULL)) {
return false;
}
return g_utf8_validate(strbuf->str, strbuf->len, endptr);
}
void
wmem_strbuf_utf8_make_valid(wmem_strbuf_t *strbuf)
{
/* Sanitize the contents to a temporary string. */
char *tmp = g_utf8_make_valid(strbuf->str, -1);
char *tmp = g_utf8_make_valid(strbuf->str, strbuf->len);
/* Reset the strbuf, keeping the backing memory allocation */
*strbuf->str = '\0';
@ -348,8 +367,6 @@ wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf)
/* Copy the temporary string to the strbuf. */
wmem_strbuf_append(strbuf, tmp);
g_free(tmp);
return true;
}
/*

View File

@ -104,6 +104,10 @@ WS_DLL_PUBLIC
void
wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c);
WS_DLL_PUBLIC
void
wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t);
WS_DLL_PUBLIC
void
wmem_strbuf_truncate(wmem_strbuf_t *strbuf, const size_t len);
@ -137,16 +141,13 @@ WS_DLL_PUBLIC
void
wmem_strbuf_destroy(wmem_strbuf_t *strbuf);
/** Check the UTF-8 encoded strbuf for validity and sanitize the contents if needed,
* by replacing encoding errors with unicode replacement character. This function is
* intended for debugging purposes and is not optimized for speed.
*
* @param strbuf the strbuf to validate
* @return true if the string was sanitized, false otherwise
*/
WS_DLL_PUBLIC
bool
wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf);
wmem_strbuf_utf8_validate(wmem_strbuf_t *strbuf, const char **endptr);
WS_DLL_PUBLIC
void
wmem_strbuf_utf8_make_valid(wmem_strbuf_t *strbuf);
/** @}
* @} */

View File

@ -58,6 +58,17 @@ extern "C" {
#define ws_assert_streq(s1, s2) \
ws_assert((s1) && (s2) && strcmp((s1), (s2)) == 0)
#define ws_assert_utf8(str, len) \
do { \
const char *__assert_endptr; \
if (_ASSERT_ENABLED && \
!g_utf8_validate(str, len, &__assert_endptr)) { \
ws_log_utf8_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_ERROR, \
__FILE__, __LINE__, __func__, \
str, len, __assert_endptr); \
} \
} while (0)
/*
* We don't want to disable ws_assert_not_reached() with WS_DISABLE_ASSERT.
* That would blast compiler warnings everywhere for no benefit, not

View File

@ -1124,6 +1124,78 @@ void ws_log_write_always_full(const char *domain, enum ws_log_level level,
}
static char *
make_utf8_display(const char *src, size_t src_length, size_t good_length)
{
wmem_strbuf_t *buf;
char ch;
size_t offset = 0;
buf = wmem_strbuf_new(NULL, NULL);
for (size_t pos = 0; pos < src_length; pos++) {
ch = src[pos];
if (pos < good_length) {
if (g_ascii_isalnum(ch) || ch == ' ') {
wmem_strbuf_append_c(buf, ch);
offset += 1;
}
else {
wmem_strbuf_append_hex(buf, ch);
offset += 4;
}
}
else {
wmem_strbuf_append_hex(buf, ch);
}
}
wmem_strbuf_append_c(buf, '\n');
for (size_t pos = 0; pos < offset; pos++) {
wmem_strbuf_append_c(buf, ' ');
}
wmem_strbuf_append(buf, "^^^^");
for (size_t pos = good_length + 1; pos < src_length; pos++) {
wmem_strbuf_append(buf, "~~~~");
}
return wmem_strbuf_finalize(buf);
}
void ws_log_utf8_full(const char *domain, enum ws_log_level level,
const char *file, long line, const char *func,
const char *string, ssize_t _length, const char *endptr)
{
if (!ws_log_msg_is_active(domain, level))
return;
char *display;
size_t length;
size_t good_length;
if (_length < 0)
length = strlen(string);
else
length = _length;
if (endptr == NULL || endptr < string) {
/* Find the pointer to the first invalid byte. */
if (g_utf8_validate(string, length, &endptr)) {
/* Valid string - should not happen. */
return;
}
}
good_length = endptr - string;
display = make_utf8_display(string, length, good_length);
ws_log_write_always_full(domain, level, file, line, func,
"Invalid UTF-8 at address %p offset %zu (length = %zu):\n%s",
string, good_length, length, display);
g_free(display);
}
void ws_log_buffer_full(const char *domain, enum ws_log_level level,
const char *file, long line, const char *func,
const uint8_t *ptr, size_t size, size_t max_bytes_len,

View File

@ -22,6 +22,7 @@
#include <ws_symbol_export.h>
#include <ws_attributes.h>
#include <ws_log_defs.h>
#include <ws_posix_compat.h>
#ifdef WS_LOG_DOMAIN
#define _LOG_DOMAIN WS_LOG_DOMAIN
@ -368,6 +369,22 @@ void ws_log_fatal_full(const char *domain, enum ws_log_level level,
_LOG_FULL(true, LOG_LEVEL_ECHO, __VA_ARGS__)
WS_DLL_PUBLIC
void ws_log_utf8_full(const char *domain, enum ws_log_level level,
const char *file, long line, const char *func,
const char *string, ssize_t length, const char *endptr);
#define ws_log_utf8(str, len, endptr) \
do { \
if (_LOG_DEBUG_ENABLED) { \
ws_log_utf8_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, \
__FILE__, __LINE__, __func__, \
str, len, endptr); \
} \
} while (0)
/** This function is called to log a buffer (bytes array).
*
* Accepts an optional 'msg' argument to provide a description.