wsutil: Improve UTF-8 APIs for debugging
In particular add an UTF-8 specific wslog API that should make it easier to interpret invalid encodings.
This commit is contained in:
parent
f9aba04431
commit
51320ae59b
|
@ -14,23 +14,7 @@
|
|||
|
||||
#include <strutil.h>
|
||||
#include <wsutil/ws_assert.h>
|
||||
|
||||
|
||||
#ifdef WS_DEBUG_UTF_8
|
||||
static inline void
|
||||
string_validate_utf8(fvalue_t *fv)
|
||||
{
|
||||
if (wmem_strbuf_sanitize_utf8(fv->value.strbuf)) {
|
||||
ws_log_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, __FILE__, -1, __func__,
|
||||
"String fvalues must use a valid UTF-8 encoding."
|
||||
" This string has been sanitized to look like this: %s",
|
||||
wmem_strbuf_get_str(fv->value.strbuf));
|
||||
}
|
||||
}
|
||||
#define CHECK_UTF_8(fv) string_validate_utf8(fv)
|
||||
#else /* !WS_DEBUG_UTF_8 */
|
||||
#define CHECK_UTF_8(fv) (void)(fv)
|
||||
#endif /* WS_DEBUG_UTF_8 */
|
||||
#include <wsutil/unicode-utils.h>
|
||||
|
||||
|
||||
static void
|
||||
|
@ -60,7 +44,7 @@ string_fvalue_set_strbuf(fvalue_t *fv, wmem_strbuf_t *value)
|
|||
string_fvalue_free(fv);
|
||||
|
||||
fv->value.strbuf = value;
|
||||
CHECK_UTF_8(fv);
|
||||
WS_UTF_8_SANITIZE_STRBUF(fv->value.strbuf);
|
||||
}
|
||||
|
||||
static char *
|
||||
|
@ -93,7 +77,7 @@ val_from_string(fvalue_t *fv, const char *s, size_t len, gchar **err_msg _U_)
|
|||
else
|
||||
fv->value.strbuf = wmem_strbuf_new(NULL, s);
|
||||
|
||||
CHECK_UTF_8(fv);
|
||||
WS_UTF_8_SANITIZE_STRBUF(fv->value.strbuf);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,37 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef WS_DEBUG_UTF_8
|
||||
#define DEBUG_UTF_8_ENABLED true
|
||||
#else
|
||||
#define DEBUG_UTF_8_ENABLED false
|
||||
#endif
|
||||
|
||||
#define _CHECK_UTF_8(level, str, len) \
|
||||
do { \
|
||||
const char *__uni_endptr; \
|
||||
if (DEBUG_UTF_8_ENABLED && \
|
||||
!g_utf8_validate(str, len, &__uni_endptr)) { \
|
||||
ws_log_utf8(str, len, __uni_endptr); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define WS_UTF_8_CHECK(str, len) \
|
||||
_CHECK_UTF_8(LOG_LEVEL_DEBUG, str, len)
|
||||
|
||||
#define WS_UTF_8_DEBUG_HERE(str, len) \
|
||||
_CHECK_UTF_8(LOG_LEVEL_ECHO, str, len)
|
||||
|
||||
#define WS_UTF_8_SANITIZE_STRBUF(buf) \
|
||||
do { \
|
||||
const char *__uni_endptr; \
|
||||
if (!wmem_strbuf_utf8_validate(buf, &__uni_endptr)) { \
|
||||
ws_log_utf8(buf->str, buf->len, __uni_endptr); \
|
||||
wmem_strbuf_utf8_make_valid(buf); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
WS_DLL_PUBLIC
|
||||
int ws_utf8_char_len(guint8 ch);
|
||||
|
||||
|
|
|
@ -252,6 +252,23 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
|
|||
}
|
||||
}
|
||||
|
||||
static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
|
||||
|
||||
void
|
||||
wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t ch)
|
||||
{
|
||||
wmem_strbuf_grow(strbuf, 4);
|
||||
|
||||
if (!strbuf->max_size || WMEM_STRBUF_ROOM(strbuf) >= 4) {
|
||||
strbuf->str[strbuf->len++] = '\\';
|
||||
strbuf->str[strbuf->len++] = 'x';
|
||||
strbuf->str[strbuf->len++] = hex[(ch >> 4) & 0xF];
|
||||
strbuf->str[strbuf->len++] = hex[(ch >> 0) & 0xF];
|
||||
strbuf->str[strbuf->len] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
wmem_strbuf_truncate(wmem_strbuf_t *strbuf, const size_t len)
|
||||
{
|
||||
|
@ -332,14 +349,16 @@ wmem_strbuf_destroy(wmem_strbuf_t *strbuf)
|
|||
}
|
||||
|
||||
bool
|
||||
wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf)
|
||||
wmem_strbuf_utf8_validate(wmem_strbuf_t *strbuf, const char **endptr)
|
||||
{
|
||||
if (g_utf8_validate(strbuf->str, -1, NULL)) {
|
||||
return false;
|
||||
}
|
||||
return g_utf8_validate(strbuf->str, strbuf->len, endptr);
|
||||
}
|
||||
|
||||
void
|
||||
wmem_strbuf_utf8_make_valid(wmem_strbuf_t *strbuf)
|
||||
{
|
||||
/* Sanitize the contents to a temporary string. */
|
||||
char *tmp = g_utf8_make_valid(strbuf->str, -1);
|
||||
char *tmp = g_utf8_make_valid(strbuf->str, strbuf->len);
|
||||
|
||||
/* Reset the strbuf, keeping the backing memory allocation */
|
||||
*strbuf->str = '\0';
|
||||
|
@ -348,8 +367,6 @@ wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf)
|
|||
/* Copy the temporary string to the strbuf. */
|
||||
wmem_strbuf_append(strbuf, tmp);
|
||||
g_free(tmp);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -104,6 +104,10 @@ WS_DLL_PUBLIC
|
|||
void
|
||||
wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c);
|
||||
|
||||
WS_DLL_PUBLIC
|
||||
void
|
||||
wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t);
|
||||
|
||||
WS_DLL_PUBLIC
|
||||
void
|
||||
wmem_strbuf_truncate(wmem_strbuf_t *strbuf, const size_t len);
|
||||
|
@ -137,16 +141,13 @@ WS_DLL_PUBLIC
|
|||
void
|
||||
wmem_strbuf_destroy(wmem_strbuf_t *strbuf);
|
||||
|
||||
/** Check the UTF-8 encoded strbuf for validity and sanitize the contents if needed,
|
||||
* by replacing encoding errors with unicode replacement character. This function is
|
||||
* intended for debugging purposes and is not optimized for speed.
|
||||
*
|
||||
* @param strbuf the strbuf to validate
|
||||
* @return true if the string was sanitized, false otherwise
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
bool
|
||||
wmem_strbuf_sanitize_utf8(wmem_strbuf_t *strbuf);
|
||||
wmem_strbuf_utf8_validate(wmem_strbuf_t *strbuf, const char **endptr);
|
||||
|
||||
WS_DLL_PUBLIC
|
||||
void
|
||||
wmem_strbuf_utf8_make_valid(wmem_strbuf_t *strbuf);
|
||||
|
||||
/** @}
|
||||
* @} */
|
||||
|
|
|
@ -58,6 +58,17 @@ extern "C" {
|
|||
#define ws_assert_streq(s1, s2) \
|
||||
ws_assert((s1) && (s2) && strcmp((s1), (s2)) == 0)
|
||||
|
||||
#define ws_assert_utf8(str, len) \
|
||||
do { \
|
||||
const char *__assert_endptr; \
|
||||
if (_ASSERT_ENABLED && \
|
||||
!g_utf8_validate(str, len, &__assert_endptr)) { \
|
||||
ws_log_utf8_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_ERROR, \
|
||||
__FILE__, __LINE__, __func__, \
|
||||
str, len, __assert_endptr); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* We don't want to disable ws_assert_not_reached() with WS_DISABLE_ASSERT.
|
||||
* That would blast compiler warnings everywhere for no benefit, not
|
||||
|
|
|
@ -1124,6 +1124,78 @@ void ws_log_write_always_full(const char *domain, enum ws_log_level level,
|
|||
}
|
||||
|
||||
|
||||
static char *
|
||||
make_utf8_display(const char *src, size_t src_length, size_t good_length)
|
||||
{
|
||||
wmem_strbuf_t *buf;
|
||||
char ch;
|
||||
size_t offset = 0;
|
||||
|
||||
buf = wmem_strbuf_new(NULL, NULL);
|
||||
|
||||
for (size_t pos = 0; pos < src_length; pos++) {
|
||||
ch = src[pos];
|
||||
if (pos < good_length) {
|
||||
if (g_ascii_isalnum(ch) || ch == ' ') {
|
||||
wmem_strbuf_append_c(buf, ch);
|
||||
offset += 1;
|
||||
}
|
||||
else {
|
||||
wmem_strbuf_append_hex(buf, ch);
|
||||
offset += 4;
|
||||
}
|
||||
}
|
||||
else {
|
||||
wmem_strbuf_append_hex(buf, ch);
|
||||
}
|
||||
}
|
||||
wmem_strbuf_append_c(buf, '\n');
|
||||
for (size_t pos = 0; pos < offset; pos++) {
|
||||
wmem_strbuf_append_c(buf, ' ');
|
||||
}
|
||||
wmem_strbuf_append(buf, "^^^^");
|
||||
for (size_t pos = good_length + 1; pos < src_length; pos++) {
|
||||
wmem_strbuf_append(buf, "~~~~");
|
||||
}
|
||||
return wmem_strbuf_finalize(buf);
|
||||
}
|
||||
|
||||
|
||||
void ws_log_utf8_full(const char *domain, enum ws_log_level level,
|
||||
const char *file, long line, const char *func,
|
||||
const char *string, ssize_t _length, const char *endptr)
|
||||
{
|
||||
if (!ws_log_msg_is_active(domain, level))
|
||||
return;
|
||||
|
||||
char *display;
|
||||
size_t length;
|
||||
size_t good_length;
|
||||
|
||||
if (_length < 0)
|
||||
length = strlen(string);
|
||||
else
|
||||
length = _length;
|
||||
|
||||
if (endptr == NULL || endptr < string) {
|
||||
/* Find the pointer to the first invalid byte. */
|
||||
if (g_utf8_validate(string, length, &endptr)) {
|
||||
/* Valid string - should not happen. */
|
||||
return;
|
||||
}
|
||||
}
|
||||
good_length = endptr - string;
|
||||
|
||||
display = make_utf8_display(string, length, good_length);
|
||||
|
||||
ws_log_write_always_full(domain, level, file, line, func,
|
||||
"Invalid UTF-8 at address %p offset %zu (length = %zu):\n%s",
|
||||
string, good_length, length, display);
|
||||
|
||||
g_free(display);
|
||||
}
|
||||
|
||||
|
||||
void ws_log_buffer_full(const char *domain, enum ws_log_level level,
|
||||
const char *file, long line, const char *func,
|
||||
const uint8_t *ptr, size_t size, size_t max_bytes_len,
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <ws_symbol_export.h>
|
||||
#include <ws_attributes.h>
|
||||
#include <ws_log_defs.h>
|
||||
#include <ws_posix_compat.h>
|
||||
|
||||
#ifdef WS_LOG_DOMAIN
|
||||
#define _LOG_DOMAIN WS_LOG_DOMAIN
|
||||
|
@ -368,6 +369,22 @@ void ws_log_fatal_full(const char *domain, enum ws_log_level level,
|
|||
_LOG_FULL(true, LOG_LEVEL_ECHO, __VA_ARGS__)
|
||||
|
||||
|
||||
WS_DLL_PUBLIC
|
||||
void ws_log_utf8_full(const char *domain, enum ws_log_level level,
|
||||
const char *file, long line, const char *func,
|
||||
const char *string, ssize_t length, const char *endptr);
|
||||
|
||||
|
||||
#define ws_log_utf8(str, len, endptr) \
|
||||
do { \
|
||||
if (_LOG_DEBUG_ENABLED) { \
|
||||
ws_log_utf8_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, \
|
||||
__FILE__, __LINE__, __func__, \
|
||||
str, len, endptr); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
/** This function is called to log a buffer (bytes array).
|
||||
*
|
||||
* Accepts an optional 'msg' argument to provide a description.
|
||||
|
|
Loading…
Reference in New Issue