Move format_text() to libwsutil and add unit tests

This commit is contained in:
João Valverde 2022-09-27 19:26:37 +01:00 committed by A Wireshark GitLab Utility
parent fd97378da4
commit 15634c0b46
5 changed files with 633 additions and 557 deletions

View File

@ -24,8 +24,6 @@
#include <wchar.h>
#endif
/* Upper-case hexadecimal digit for each nibble value; not NUL-terminated. */
static const char hex[16] = {
    [0x0] = '0', [0x1] = '1', [0x2] = '2', [0x3] = '3',
    [0x4] = '4', [0x5] = '5', [0x6] = '6', [0x7] = '7',
    [0x8] = '8', [0x9] = '9', [0xA] = 'A', [0xB] = 'B',
    [0xC] = 'C', [0xD] = 'D', [0xE] = 'E', [0xF] = 'F'
};
/*
* Given a pointer into a data buffer, and to the end of the buffer,
@ -124,486 +122,6 @@ get_token_len(const guchar *linep, const guchar *lineend,
return token_len;
}
#define INITIAL_FMTBUF_SIZE 128

/*
 * Declare, and initialize, the variables used for an output buffer:
 * the buffer itself (fmtbuf), its allocated size (fmtbuf_len) and the
 * offset of the next byte to be written (column).
 *
 * A wmem allocator named "allocator" must be in scope. This macro expands
 * to declarations, so unlike the statement macros below it cannot be
 * wrapped in do { } while (0).
 */
#define FMTBUF_VARS \
    gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE); \
    guint fmtbuf_len = INITIAL_FMTBUF_SIZE; \
    guint column = 0

/*
 * Expand the buffer to be large enough to add nbytes bytes, plus a
 * terminating '\0'.
 *
 * The buffer is doubled at most once per invocation. Its size starts at
 * 128, so doubling adds at least another 128 bytes; a single request must
 * therefore stay well below INITIAL_FMTBUF_SIZE bytes.
 */
#define FMTBUF_EXPAND(nbytes) \
    do { \
        if (column+((nbytes)+1) >= fmtbuf_len) { \
            fmtbuf_len *= 2; \
            fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len); \
        } \
    } while (0)

/*
 * Put a byte into the buffer; space must have been ensured for it.
 */
#define FMTBUF_PUTCHAR(b) \
    do { \
        fmtbuf[column] = (b); \
        column++; \
    } while (0)

/*
 * Add the one-byte argument, as three octal digits, to the end of the
 * buffer. The leading backslash is NOT written by this macro; space for
 * three bytes must have been ensured.
 */
#define FMTBUF_PUTBYTE_OCTAL(b) \
    do { \
        FMTBUF_PUTCHAR((((b)>>6)&03) + '0'); \
        FMTBUF_PUTCHAR((((b)>>3)&07) + '0'); \
        FMTBUF_PUTCHAR((((b)>>0)&07) + '0'); \
    } while (0)

/*
 * Add the one-byte argument, as a hex escape sequence (backslash, 'x' and
 * two upper-case hex digits), to the end of the buffer; space for four
 * bytes must have been ensured.
 */
#define FMTBUF_PUTBYTE_HEX(b) \
    do { \
        FMTBUF_PUTCHAR('\\'); \
        FMTBUF_PUTCHAR('x'); \
        FMTBUF_PUTCHAR(hex[((b) >> 4) & 0xF]); \
        FMTBUF_PUTCHAR(hex[((b) >> 0) & 0xF]); \
    } while (0)

/*
 * Put the trailing '\0' at the end of the buffer. column is not advanced.
 */
#define FMTBUF_ENDSTR \
    fmtbuf[column] = '\0'

/* REPLACEMENT CHARACTER */
#define UNREPL 0xFFFD
/* PILE OF POO */
#define UNPOOP 0x1F4A9
static gchar *
format_text_internal(wmem_allocator_t *allocator,
const guchar *string, size_t len,
gboolean replace_space)
{
FMTBUF_VARS;
const guchar *stringend = string + len;
guchar c;
bool is_valid_utf8 = true;
while (string < stringend) {
/*
* Get the first byte of this character.
*/
c = *string++;
if (g_ascii_isprint(c)) {
/*
* Printable ASCII, so not part of a multi-byte UTF-8 sequence.
* Make sure there's enough room for one more byte, and add
* the character.
*/
FMTBUF_EXPAND(1);
FMTBUF_PUTCHAR(c);
} else if (replace_space && g_ascii_isspace(c)) {
/*
* ASCII, so not part of a multi-byte UTF-8 sequence, but
* not printable, but is a space character; show it as a
* blank.
*
* Make sure there's enough room for one more byte, and add
* the blank.
*/
FMTBUF_EXPAND(1);
FMTBUF_PUTCHAR(' ');
} else if (c < 128) {
/*
* ASCII, so not part of a multi-byte UTF-8 sequence, but not
* printable.
*
* That requires a minimum of 2 bytes, one for the backslash
* and one for a letter, so make sure we have enough room
* for that, plus a trailing '\0'.
*/
FMTBUF_EXPAND(2);
FMTBUF_PUTCHAR('\\');
switch (c) {
case '\a':
FMTBUF_PUTCHAR('a');
break;
case '\b':
FMTBUF_PUTCHAR('b'); /* BS */
break;
case '\f':
FMTBUF_PUTCHAR('f'); /* FF */
break;
case '\n':
FMTBUF_PUTCHAR('n'); /* NL */
break;
case '\r':
FMTBUF_PUTCHAR('r'); /* CR */
break;
case '\t':
FMTBUF_PUTCHAR('t'); /* tab */
break;
case '\v':
FMTBUF_PUTCHAR('v');
break;
default:
/*
* We've already put the backslash, but this
* will put 3 more characters for the octal
* number; make sure we have enough room for
* that, plus the trailing '\0'.
*/
FMTBUF_EXPAND(3);
FMTBUF_PUTBYTE_OCTAL(c);
break;
}
} else {
/*
* We've fetched the first byte of a multi-byte UTF-8
* sequence into c.
*/
int utf8_len;
guchar mask;
gunichar uc;
guchar first;
if ((c & 0xe0) == 0xc0) {
/* Starts a 2-byte UTF-8 sequence; 1 byte left */
utf8_len = 1;
mask = 0x1f;
} else if ((c & 0xf0) == 0xe0) {
/* Starts a 3-byte UTF-8 sequence; 2 bytes left */
utf8_len = 2;
mask = 0x0f;
} else if ((c & 0xf8) == 0xf0) {
/* Starts a 4-byte UTF-8 sequence; 3 bytes left */
utf8_len = 3;
mask = 0x07;
} else if ((c & 0xfc) == 0xf8) {
/* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
utf8_len = 4;
mask = 0x03;
} else if ((c & 0xfe) == 0xfc) {
/* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
utf8_len = 5;
mask = 0x01;
} else {
/* 0xfe or 0xff or a continuation byte - not valid */
utf8_len = -1;
}
if (utf8_len > 0) {
/* Try to construct the Unicode character */
uc = c & mask;
for (int i = 0; i < utf8_len; i++) {
if (string >= stringend) {
/*
* Ran out of octets, so the character is
* incomplete. Put in a REPLACEMENT CHARACTER
* instead, and then continue the loop, which
* will terminate.
*/
uc = UNREPL;
break;
}
c = *string;
if ((c & 0xc0) != 0x80) {
/*
* Not valid UTF-8 continuation character; put in
* a replacement character, and then re-process
* this octet as the beginning of a new character.
*/
uc = UNREPL;
break;
}
string++;
uc = (uc << 6) | (c & 0x3f);
}
/*
* If this isn't a valid Unicode character, put in
* a REPLACEMENT CHARACTER.
*/
if (!g_unichar_validate(uc))
uc = UNREPL;
} else {
/* 0xfe or 0xff; put it a REPLACEMENT CHARACTER */
uc = UNREPL;
}
if (uc == UNREPL) {
/* Flag this UTF-8 string as having been sanitized. */
is_valid_utf8 = false;
}
/*
* OK, is it a printable Unicode character?
*/
if (g_unichar_isprint(uc)) {
/*
* Yes - put it into the string as UTF-8.
* This means that if it was an overlong
* encoding, this will put out the right
* sized encoding.
*/
if (uc < 0x80) {
first = 0;
utf8_len = 1;
} else if (uc < 0x800) {
first = 0xc0;
utf8_len = 2;
} else if (uc < 0x10000) {
first = 0xe0;
utf8_len = 3;
} else if (uc < 0x200000) {
first = 0xf0;
utf8_len = 4;
} else if (uc < 0x4000000) {
/*
* This should never happen, as Unicode doesn't
* go that high.
*/
first = 0xf8;
utf8_len = 5;
} else {
/*
* This should never happen, as Unicode doesn't
* go that high.
*/
first = 0xfc;
utf8_len = 6;
}
FMTBUF_EXPAND(utf8_len);
for (int i = utf8_len - 1; i > 0; i--) {
fmtbuf[column + i] = (uc & 0x3f) | 0x80;
uc >>= 6;
}
fmtbuf[column] = uc | first;
column += utf8_len;
} else if (replace_space && g_unichar_isspace(uc)) {
/*
* Not printable, but is a space character; show it
* as a blank.
*
* Make sure there's enough room for one more byte,
* and add the blank.
*/
FMTBUF_EXPAND(1);
FMTBUF_PUTCHAR(' ');
} else if (c < 128) {
/*
* ASCII, but not printable.
* Yes, this could happen with an overlong encoding.
*
* That requires a minimum of 2 bytes, one for the
* backslash and one for a letter, so make sure we
* have enough room for that, plus a trailing '\0'.
*/
FMTBUF_EXPAND(2);
FMTBUF_PUTCHAR('\\');
switch (c) {
case '\a':
FMTBUF_PUTCHAR('a');
break;
case '\b':
FMTBUF_PUTCHAR('b'); /* BS */
break;
case '\f':
FMTBUF_PUTCHAR('f'); /* FF */
break;
case '\n':
FMTBUF_PUTCHAR('n'); /* NL */
break;
case '\r':
FMTBUF_PUTCHAR('r'); /* CR */
break;
case '\t':
FMTBUF_PUTCHAR('t'); /* tab */
break;
case '\v':
FMTBUF_PUTCHAR('v');
break;
default:
/*
* We've already put the backslash, but this
* will put 3 more characters for the octal
* number; make sure we have enough room for
* that, plus the trailing '\0'.
*/
FMTBUF_EXPAND(3);
FMTBUF_PUTBYTE_OCTAL(c);
break;
}
} else {
/*
* Unicode, but not printable, and not ASCII;
* put it out as \uxxxx or \Uxxxxxxxx.
*/
if (uc <= 0xFFFF) {
FMTBUF_EXPAND(6);
FMTBUF_PUTCHAR('\\');
FMTBUF_PUTCHAR('u');
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
} else {
FMTBUF_EXPAND(10);
FMTBUF_PUTCHAR('\\');
FMTBUF_PUTCHAR('U');
FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
}
}
}
}
FMTBUF_ENDSTR;
if (!is_valid_utf8) {
/* This function expects valid UTF-8 as input. The extra validation performed is a safeguard.
* In a brighter future it may be removed. Emit a warning and display the sanitized string. */
ws_log_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, __FILE__, -1, __func__,
"String argument contained UTF-8 errors: %s", fmtbuf);
}
return fmtbuf;
}
/*
 * Given a wmem scope, a not-necessarily-null-terminated string,
 * expected to be in UTF-8 but possibly containing invalid sequences
 * (as it may have come from packet data), and the length of the string,
 * generate a valid, '\0'-terminated UTF-8 string from it, allocated in
 * the specified wmem scope, that:
 *
 *   shows printable Unicode characters as themselves;
 *
 *   shows non-printable ASCII characters as C-style escapes (octal
 *   if not one of the standard ones such as LF -> '\n');
 *
 *   shows non-printable Unicode-but-not-ASCII characters as
 *   their universal character names (\uXXXX or \UXXXXXXXX);
 *
 *   replaces illegal UTF-8 sequences with U+FFFD REPLACEMENT CHARACTER.
 *   That covers 0xFE and 0xFF, a continuation byte where none is
 *   expected (or its absence where one is), a sequence cut short by the
 *   end of the input, and any decoded value that g_unichar_validate()
 *   rejects. Overlong encodings are NOT rejected: they are decoded and
 *   then handled as the code point they encode.
 *
 * and return a pointer to it.
 */
char *
format_text(wmem_allocator_t *allocator,
            const char *string, size_t len)
{
    /* The worker takes unsigned bytes; convert the pointer explicitly. */
    return format_text_internal(allocator, (const unsigned char *)string, len, FALSE);
}
/** Given a wmem scope and a null-terminated string, expected to be in
 * UTF-8 but possibly containing invalid sequences (as it may have come
 * from packet data), generate a valid UTF-8 string from it, allocated in
 * the specified wmem scope, that:
 *
 *   shows printable Unicode characters as themselves;
 *
 *   shows non-printable ASCII characters as C-style escapes (octal
 *   if not one of the standard ones such as LF -> '\n');
 *
 *   shows non-printable Unicode-but-not-ASCII characters as
 *   their universal character names;
 *
 *   replaces illegal UTF-8 sequences with U+FFFD REPLACEMENT CHARACTER;
 *
 * and return a pointer to it.
 *
 * The length is taken with strlen(), so string must not be NULL.
 */
char *
format_text_string(wmem_allocator_t* allocator, const char *string)
{
    /* The worker takes unsigned bytes; convert the pointer explicitly. */
    return format_text_internal(allocator, (const unsigned char *)string, strlen(string), FALSE);
}
/*
 * Given a wmem scope, a not-necessarily-null-terminated string and its
 * length, generate a string from it that shows non-printable
 * characters as C-style escapes except a whitespace character
 * (space, tab, carriage return, new line, vertical tab, or formfeed)
 * which will be replaced by a space, and return a pointer to it.
 *
 * Apart from the whitespace replacement this behaves like format_text().
 */
char *
format_text_wsp(wmem_allocator_t* allocator, const char *string, size_t len)
{
    /* The worker takes unsigned bytes; convert the pointer explicitly. */
    return format_text_internal(allocator, (const unsigned char *)string, len, TRUE);
}
/*
 * Produce a displayable copy of the len bytes at string in which every
 * byte is mapped to exactly one output character:
 *
 *   printable ASCII is kept as it is;
 *   other ASCII whitespace becomes a single space;
 *   anything else becomes the caller-supplied chr.
 *
 * The input is *not* interpreted as UTF-8.
 *
 * Intended for binary data that often, but not always, holds text, where
 * C-style escapes would make the result unreadable.
 */
char *
format_text_chr(wmem_allocator_t *allocator, const char *string, size_t len, char chr)
{
    wmem_strbuf_t *out = wmem_strbuf_sized_new(allocator, len + 1, 0);
    const char *const end = string + len;

    while (string < end) {
        const char ch = *string++;
        char shown;

        if (g_ascii_isprint(ch)) {
            shown = ch;
        } else if (g_ascii_isspace(ch)) {
            shown = ' ';
        } else {
            shown = chr;
        }
        wmem_strbuf_append_c(out, shown);
    }

    return wmem_strbuf_finalize(out);
}
static gboolean
is_byte_sep(guint8 c)
{

View File

@ -47,81 +47,6 @@ WS_DLL_PUBLIC
int get_token_len(const guchar *linep, const guchar *lineend,
const guchar **next_token);
/** Given a wmem scope, a not-necessarily-null-terminated string
 * expected to be in UTF-8, and the length of the string,
 * generate a valid, null-terminated UTF-8 string from it, allocated in
 * the specified wmem scope, that:
 *
 *   shows printable Unicode characters as themselves;
 *
 *   shows non-printable ASCII characters as C-style escapes (octal
 *   if not one of the standard ones such as LF -> '\n');
 *
 *   shows non-printable Unicode-but-not-ASCII characters as
 *   their universal character names;
 *
 *   replaces illegal UTF-8 sequences with U+FFFD (replacement character);
 *
 * and return a pointer to it.
 *
 * @param allocator The wmem scope
 * @param string A pointer to the input string
 * @param len The length of the input string, in bytes
 * @return A pointer to the formatted string
 *
 * @see tvb_format_text()
 */
WS_DLL_PUBLIC
char *format_text(wmem_allocator_t* allocator, const char *string, size_t len);
/** Same as format_text() but accepts a nul-terminated string, whose
 * length is determined with strlen().
 *
 * @param allocator The wmem scope
 * @param string A pointer to the nul-terminated input string
 * @return A pointer to the formatted string
 *
 * @see tvb_format_text()
 */
WS_DLL_PUBLIC
char *format_text_string(wmem_allocator_t* allocator, const char *string);
/**
 * Same as format_text() but replaces any whitespace characters
 * (space, tab, carriage return, new line, vertical tab, or formfeed)
 * with a space instead of showing them as escapes.
 *
 * @param allocator The wmem scope
 * @param line A pointer to the input string (need not be null-terminated)
 * @param len The length of the input string, in bytes
 * @return A pointer to the formatted string
 *
 */
WS_DLL_PUBLIC
char *format_text_wsp(wmem_allocator_t* allocator, const char *line, size_t len);
/**
 * Given a string, generate a string from it that shows non-printable
 * characters as the chr parameter passed, except a whitespace character
 * (space, tab, carriage return, new line, vertical tab, or formfeed)
 * which will be replaced by a space, and return a pointer to it.
 * Each input byte produces exactly one output character.
 *
 * This does *not* treat the input string as UTF-8.
 *
 * This is useful for displaying binary data that frequently but not always
 * contains text; otherwise the number of C escape codes makes it unreadable.
 *
 * @param allocator The wmem scope
 * @param string A pointer to the input string (need not be null-terminated)
 * @param len The length of the input string, in bytes
 * @param chr The character to use to replace non-printable characters
 * @return A pointer to the formatted string
 *
 */
WS_DLL_PUBLIC
char *format_text_chr(wmem_allocator_t *allocator,
const char *string, size_t len, char chr);
/** Turn a string of hex digits with optional separators (defined by
* is_byte_sep() into a byte array.
*

View File

@ -16,6 +16,10 @@
#include <wsutil/to_str.h>
/* Upper-case hexadecimal digit for each nibble value; not NUL-terminated. */
static const char hex[16] = {
    [0x0] = '0', [0x1] = '1', [0x2] = '2', [0x3] = '3',
    [0x4] = '4', [0x5] = '5', [0x6] = '6', [0x7] = '7',
    [0x8] = '8', [0x9] = '9', [0xA] = 'A', [0xB] = 'B',
    [0xC] = 'C', [0xD] = 'D', [0xE] = 'E', [0xF] = 'F'
};
gchar *
wmem_strconcat(wmem_allocator_t *allocator, const gchar *first, ...)
{
@ -560,6 +564,485 @@ ws_strdup_underline(wmem_allocator_t *allocator, long offset, size_t len)
return wmem_strbuf_finalize(buf);
}
#define INITIAL_FMTBUF_SIZE 128

/*
 * Declare, and initialize, the variables used for an output buffer:
 * the buffer itself (fmtbuf), its allocated size (fmtbuf_len) and the
 * offset of the next byte to be written (column).
 *
 * A wmem allocator named "allocator" must be in scope. This macro expands
 * to declarations, so unlike the statement macros below it cannot be
 * wrapped in do { } while (0).
 */
#define FMTBUF_VARS \
    gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE); \
    guint fmtbuf_len = INITIAL_FMTBUF_SIZE; \
    guint column = 0

/*
 * Expand the buffer to be large enough to add nbytes bytes, plus a
 * terminating '\0'.
 *
 * The buffer is doubled at most once per invocation. Its size starts at
 * 128, so doubling adds at least another 128 bytes; a single request must
 * therefore stay well below INITIAL_FMTBUF_SIZE bytes.
 */
#define FMTBUF_EXPAND(nbytes) \
    do { \
        if (column+((nbytes)+1) >= fmtbuf_len) { \
            fmtbuf_len *= 2; \
            fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len); \
        } \
    } while (0)

/*
 * Put a byte into the buffer; space must have been ensured for it.
 */
#define FMTBUF_PUTCHAR(b) \
    do { \
        fmtbuf[column] = (b); \
        column++; \
    } while (0)

/*
 * Add the one-byte argument, as three octal digits, to the end of the
 * buffer. The leading backslash is NOT written by this macro; space for
 * three bytes must have been ensured.
 */
#define FMTBUF_PUTBYTE_OCTAL(b) \
    do { \
        FMTBUF_PUTCHAR((((b)>>6)&03) + '0'); \
        FMTBUF_PUTCHAR((((b)>>3)&07) + '0'); \
        FMTBUF_PUTCHAR((((b)>>0)&07) + '0'); \
    } while (0)

/*
 * Add the one-byte argument, as a hex escape sequence (backslash, 'x' and
 * two upper-case hex digits), to the end of the buffer; space for four
 * bytes must have been ensured.
 */
#define FMTBUF_PUTBYTE_HEX(b) \
    do { \
        FMTBUF_PUTCHAR('\\'); \
        FMTBUF_PUTCHAR('x'); \
        FMTBUF_PUTCHAR(hex[((b) >> 4) & 0xF]); \
        FMTBUF_PUTCHAR(hex[((b) >> 0) & 0xF]); \
    } while (0)

/*
 * Put the trailing '\0' at the end of the buffer. column is not advanced.
 */
#define FMTBUF_ENDSTR \
    fmtbuf[column] = '\0'

/* REPLACEMENT CHARACTER */
#define UNREPL 0xFFFD
/* PILE OF POO */
#define UNPOOP 0x1F4A9
static gchar *
format_text_internal(wmem_allocator_t *allocator,
const guchar *string, size_t len,
gboolean replace_space)
{
FMTBUF_VARS;
const guchar *stringend = string + len;
guchar c;
bool is_valid_utf8 = true;
while (string < stringend) {
/*
* Get the first byte of this character.
*/
c = *string++;
if (g_ascii_isprint(c)) {
/*
* Printable ASCII, so not part of a multi-byte UTF-8 sequence.
* Make sure there's enough room for one more byte, and add
* the character.
*/
FMTBUF_EXPAND(1);
FMTBUF_PUTCHAR(c);
} else if (replace_space && g_ascii_isspace(c)) {
/*
* ASCII, so not part of a multi-byte UTF-8 sequence, but
* not printable, but is a space character; show it as a
* blank.
*
* Make sure there's enough room for one more byte, and add
* the blank.
*/
FMTBUF_EXPAND(1);
FMTBUF_PUTCHAR(' ');
} else if (c < 128) {
/*
* ASCII, so not part of a multi-byte UTF-8 sequence, but not
* printable.
*
* That requires a minimum of 2 bytes, one for the backslash
* and one for a letter, so make sure we have enough room
* for that, plus a trailing '\0'.
*/
FMTBUF_EXPAND(2);
FMTBUF_PUTCHAR('\\');
switch (c) {
case '\a':
FMTBUF_PUTCHAR('a');
break;
case '\b':
FMTBUF_PUTCHAR('b'); /* BS */
break;
case '\f':
FMTBUF_PUTCHAR('f'); /* FF */
break;
case '\n':
FMTBUF_PUTCHAR('n'); /* NL */
break;
case '\r':
FMTBUF_PUTCHAR('r'); /* CR */
break;
case '\t':
FMTBUF_PUTCHAR('t'); /* tab */
break;
case '\v':
FMTBUF_PUTCHAR('v');
break;
default:
/*
* We've already put the backslash, but this
* will put 3 more characters for the octal
* number; make sure we have enough room for
* that, plus the trailing '\0'.
*/
FMTBUF_EXPAND(3);
FMTBUF_PUTBYTE_OCTAL(c);
break;
}
} else {
/*
* We've fetched the first byte of a multi-byte UTF-8
* sequence into c.
*/
int utf8_len;
guchar mask;
gunichar uc;
guchar first;
if ((c & 0xe0) == 0xc0) {
/* Starts a 2-byte UTF-8 sequence; 1 byte left */
utf8_len = 1;
mask = 0x1f;
} else if ((c & 0xf0) == 0xe0) {
/* Starts a 3-byte UTF-8 sequence; 2 bytes left */
utf8_len = 2;
mask = 0x0f;
} else if ((c & 0xf8) == 0xf0) {
/* Starts a 4-byte UTF-8 sequence; 3 bytes left */
utf8_len = 3;
mask = 0x07;
} else if ((c & 0xfc) == 0xf8) {
/* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
utf8_len = 4;
mask = 0x03;
} else if ((c & 0xfe) == 0xfc) {
/* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
utf8_len = 5;
mask = 0x01;
} else {
/* 0xfe or 0xff or a continuation byte - not valid */
utf8_len = -1;
}
if (utf8_len > 0) {
/* Try to construct the Unicode character */
uc = c & mask;
for (int i = 0; i < utf8_len; i++) {
if (string >= stringend) {
/*
* Ran out of octets, so the character is
* incomplete. Put in a REPLACEMENT CHARACTER
* instead, and then continue the loop, which
* will terminate.
*/
uc = UNREPL;
break;
}
c = *string;
if ((c & 0xc0) != 0x80) {
/*
* Not valid UTF-8 continuation character; put in
* a replacement character, and then re-process
* this octet as the beginning of a new character.
*/
uc = UNREPL;
break;
}
string++;
uc = (uc << 6) | (c & 0x3f);
}
/*
* If this isn't a valid Unicode character, put in
* a REPLACEMENT CHARACTER.
*/
if (!g_unichar_validate(uc))
uc = UNREPL;
} else {
/* 0xfe or 0xff; put it a REPLACEMENT CHARACTER */
uc = UNREPL;
}
if (uc == UNREPL) {
/* Flag this UTF-8 string as having been sanitized. */
is_valid_utf8 = false;
}
/*
* OK, is it a printable Unicode character?
*/
if (g_unichar_isprint(uc)) {
/*
* Yes - put it into the string as UTF-8.
* This means that if it was an overlong
* encoding, this will put out the right
* sized encoding.
*/
if (uc < 0x80) {
first = 0;
utf8_len = 1;
} else if (uc < 0x800) {
first = 0xc0;
utf8_len = 2;
} else if (uc < 0x10000) {
first = 0xe0;
utf8_len = 3;
} else if (uc < 0x200000) {
first = 0xf0;
utf8_len = 4;
} else if (uc < 0x4000000) {
/*
* This should never happen, as Unicode doesn't
* go that high.
*/
first = 0xf8;
utf8_len = 5;
} else {
/*
* This should never happen, as Unicode doesn't
* go that high.
*/
first = 0xfc;
utf8_len = 6;
}
FMTBUF_EXPAND(utf8_len);
for (int i = utf8_len - 1; i > 0; i--) {
fmtbuf[column + i] = (uc & 0x3f) | 0x80;
uc >>= 6;
}
fmtbuf[column] = uc | first;
column += utf8_len;
} else if (replace_space && g_unichar_isspace(uc)) {
/*
* Not printable, but is a space character; show it
* as a blank.
*
* Make sure there's enough room for one more byte,
* and add the blank.
*/
FMTBUF_EXPAND(1);
FMTBUF_PUTCHAR(' ');
} else if (c < 128) {
/*
* ASCII, but not printable.
* Yes, this could happen with an overlong encoding.
*
* That requires a minimum of 2 bytes, one for the
* backslash and one for a letter, so make sure we
* have enough room for that, plus a trailing '\0'.
*/
FMTBUF_EXPAND(2);
FMTBUF_PUTCHAR('\\');
switch (c) {
case '\a':
FMTBUF_PUTCHAR('a');
break;
case '\b':
FMTBUF_PUTCHAR('b'); /* BS */
break;
case '\f':
FMTBUF_PUTCHAR('f'); /* FF */
break;
case '\n':
FMTBUF_PUTCHAR('n'); /* NL */
break;
case '\r':
FMTBUF_PUTCHAR('r'); /* CR */
break;
case '\t':
FMTBUF_PUTCHAR('t'); /* tab */
break;
case '\v':
FMTBUF_PUTCHAR('v');
break;
default:
/*
* We've already put the backslash, but this
* will put 3 more characters for the octal
* number; make sure we have enough room for
* that, plus the trailing '\0'.
*/
FMTBUF_EXPAND(3);
FMTBUF_PUTBYTE_OCTAL(c);
break;
}
} else {
/*
* Unicode, but not printable, and not ASCII;
* put it out as \uxxxx or \Uxxxxxxxx.
*/
if (uc <= 0xFFFF) {
FMTBUF_EXPAND(6);
FMTBUF_PUTCHAR('\\');
FMTBUF_PUTCHAR('u');
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
} else {
FMTBUF_EXPAND(10);
FMTBUF_PUTCHAR('\\');
FMTBUF_PUTCHAR('U');
FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
}
}
}
}
FMTBUF_ENDSTR;
if (!is_valid_utf8) {
/* This function expects valid UTF-8 as input. The extra validation performed is a safeguard.
* In a brighter future it may be removed. Emit a warning and display the sanitized string. */
ws_log_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, __FILE__, -1, __func__,
"String argument contained UTF-8 errors: %s", fmtbuf);
}
return fmtbuf;
}
/*
 * Given a wmem scope, a not-necessarily-null-terminated string,
 * expected to be in UTF-8 but possibly containing invalid sequences
 * (as it may have come from packet data), and the length of the string,
 * generate a valid, '\0'-terminated UTF-8 string from it, allocated in
 * the specified wmem scope, that:
 *
 *   shows printable Unicode characters as themselves;
 *
 *   shows non-printable ASCII characters as C-style escapes (octal
 *   if not one of the standard ones such as LF -> '\n');
 *
 *   shows non-printable Unicode-but-not-ASCII characters as
 *   their universal character names (\uXXXX or \UXXXXXXXX);
 *
 *   replaces illegal UTF-8 sequences with U+FFFD REPLACEMENT CHARACTER.
 *   That covers 0xFE and 0xFF, a continuation byte where none is
 *   expected (or its absence where one is), a sequence cut short by the
 *   end of the input, and any decoded value that g_unichar_validate()
 *   rejects. Overlong encodings are NOT rejected: they are decoded and
 *   then handled as the code point they encode.
 *
 * and return a pointer to it.
 */
char *
format_text(wmem_allocator_t *allocator,
            const char *string, size_t len)
{
    /* The worker takes unsigned bytes; convert the pointer explicitly. */
    return format_text_internal(allocator, (const unsigned char *)string, len, FALSE);
}
/** Given a wmem scope and a null-terminated string, expected to be in
 * UTF-8 but possibly containing invalid sequences (as it may have come
 * from packet data), generate a valid UTF-8 string from it, allocated in
 * the specified wmem scope, that:
 *
 *   shows printable Unicode characters as themselves;
 *
 *   shows non-printable ASCII characters as C-style escapes (octal
 *   if not one of the standard ones such as LF -> '\n');
 *
 *   shows non-printable Unicode-but-not-ASCII characters as
 *   their universal character names;
 *
 *   replaces illegal UTF-8 sequences with U+FFFD REPLACEMENT CHARACTER;
 *
 * and return a pointer to it.
 *
 * The length is taken with strlen(), so string must not be NULL.
 */
char *
format_text_string(wmem_allocator_t* allocator, const char *string)
{
    /* The worker takes unsigned bytes; convert the pointer explicitly. */
    return format_text_internal(allocator, (const unsigned char *)string, strlen(string), FALSE);
}
/*
 * Given a wmem scope, a not-necessarily-null-terminated string and its
 * length, generate a string from it that shows non-printable
 * characters as C-style escapes except a whitespace character
 * (space, tab, carriage return, new line, vertical tab, or formfeed)
 * which will be replaced by a space, and return a pointer to it.
 *
 * Apart from the whitespace replacement this behaves like format_text().
 */
char *
format_text_wsp(wmem_allocator_t* allocator, const char *string, size_t len)
{
    /* The worker takes unsigned bytes; convert the pointer explicitly. */
    return format_text_internal(allocator, (const unsigned char *)string, len, TRUE);
}
/*
 * Produce a displayable copy of the len bytes at string in which every
 * byte is mapped to exactly one output character:
 *
 *   printable ASCII is kept as it is;
 *   other ASCII whitespace becomes a single space;
 *   anything else becomes the caller-supplied chr.
 *
 * The input is *not* interpreted as UTF-8.
 *
 * Intended for binary data that often, but not always, holds text, where
 * C-style escapes would make the result unreadable.
 */
char *
format_text_chr(wmem_allocator_t *allocator, const char *string, size_t len, char chr)
{
    wmem_strbuf_t *out = wmem_strbuf_sized_new(allocator, len + 1, 0);
    const char *const end = string + len;

    while (string < end) {
        const char ch = *string++;
        char shown;

        if (g_ascii_isprint(ch)) {
            shown = ch;
        } else if (g_ascii_isspace(ch)) {
            shown = ' ';
        } else {
            shown = chr;
        }
        wmem_strbuf_append_c(out, shown);
    }

    return wmem_strbuf_finalize(out);
}
/*
* Editor modelines - https://www.wireshark.org/tools/modelines.html
*

View File

@ -213,6 +213,80 @@ const char *ws_strerrorname_r(int errnum, char *buf, size_t buf_size);
WS_DLL_PUBLIC
char *ws_strdup_underline(wmem_allocator_t *allocator, long offset, size_t len);
/** Given a wmem scope, a not-necessarily-null-terminated string
 * expected to be in UTF-8, and the length of the string,
 * generate a valid, null-terminated UTF-8 string from it, allocated in
 * the specified wmem scope, that:
 *
 *   shows printable Unicode characters as themselves;
 *
 *   shows non-printable ASCII characters as C-style escapes (octal
 *   if not one of the standard ones such as LF -> '\n');
 *
 *   shows non-printable Unicode-but-not-ASCII characters as
 *   their universal character names;
 *
 *   replaces illegal UTF-8 sequences with U+FFFD (replacement character);
 *
 * and return a pointer to it.
 *
 * @param allocator The wmem scope
 * @param string A pointer to the input string
 * @param len The length of the input string, in bytes
 * @return A pointer to the formatted string
 *
 * @see tvb_format_text()
 */
WS_DLL_PUBLIC
char *format_text(wmem_allocator_t* allocator, const char *string, size_t len);
/** Same as format_text() but accepts a nul-terminated string, whose
 * length is determined with strlen().
 *
 * @param allocator The wmem scope
 * @param string A pointer to the nul-terminated input string
 * @return A pointer to the formatted string
 *
 * @see tvb_format_text()
 */
WS_DLL_PUBLIC
char *format_text_string(wmem_allocator_t* allocator, const char *string);
/**
 * Same as format_text() but replaces any whitespace characters
 * (space, tab, carriage return, new line, vertical tab, or formfeed)
 * with a space instead of showing them as escapes.
 *
 * @param allocator The wmem scope
 * @param line A pointer to the input string (need not be null-terminated)
 * @param len The length of the input string, in bytes
 * @return A pointer to the formatted string
 *
 */
WS_DLL_PUBLIC
char *format_text_wsp(wmem_allocator_t* allocator, const char *line, size_t len);
/**
 * Given a string, generate a string from it that shows non-printable
 * characters as the chr parameter passed, except a whitespace character
 * (space, tab, carriage return, new line, vertical tab, or formfeed)
 * which will be replaced by a space, and return a pointer to it.
 * Each input byte produces exactly one output character.
 *
 * This does *not* treat the input string as UTF-8.
 *
 * This is useful for displaying binary data that frequently but not always
 * contains text; otherwise the number of C escape codes makes it unreadable.
 *
 * @param allocator The wmem scope
 * @param string A pointer to the input string (need not be null-terminated)
 * @param len The length of the input string, in bytes
 * @param chr The character to use to replace non-printable characters
 * @return A pointer to the formatted string
 *
 */
WS_DLL_PUBLIC
char *format_text_chr(wmem_allocator_t *allocator,
const char *string, size_t len, char chr);
/* Yield the singular form s when the count d is exactly 1, else the plural form p. */
#define plurality(d,s,p) ((d) != 1 ? (p) : (s))

View File

@ -11,6 +11,7 @@
#include <stdio.h>
#include <glib.h>
#include <wsutil/utf8_entities.h>
#include <wsutil/time_util.h>
#include "inet_addr.h"
@ -185,6 +186,74 @@ static void test_str_ascii(void)
wmem_destroy_allocator(allocator);
}
/*
 * Check format_text_string() against a table of inputs and the exact
 * output each one must produce.
 */
static void test_format_text(void)
{
    static const struct {
        const char *input;
        const char *expected;
    } cases[] = {
        /* ASCII */
        { "abcdef", "abcdef" },
        /* ASCII with special escape characters. */
        { "abc\td\fe\nf", "abc\\td\\fe\\nf" },
        /* ASCII with non-printable characters. */
        { "abc \004 def", "abc \\004 def" },
        /* UTF-8 */
        { u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο",
          u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο" },
        /* UTF-8 with non-ASCII non-printable characters. */
        { u8"String with BOM \ufeff", u8"String with BOM \\uFEFF" },
    };

    for (size_t idx = 0; idx < sizeof cases / sizeof cases[0]; idx++) {
        const char *have = cases[idx].input;
        const char *want = cases[idx].expected;
        /* A NULL allocator is passed, and the result is released with g_free(). */
        char *res = format_text_string(NULL, have);

        g_assert_cmpstr(res, ==, want);
        g_free(res);
    }
}
/*
 * Sample resource usage at the start of a measured region.
 * Expects doubles named start_utime and start_stime to be in scope.
 */
#define RESOURCE_USAGE_START get_resource_usage(&start_utime, &start_stime)

/*
 * Sample resource usage at the end of a measured region and store the
 * differences from the start samples, scaled by 1000, in utime_ms and
 * stime_ms (presumably seconds to milliseconds; confirm against
 * get_resource_usage()).
 *
 * Expects doubles named start_utime, start_stime, end_utime, end_stime,
 * utime_ms and stime_ms to be in scope. Wrapped in do { } while (0) so
 * that it acts as a single statement, even in an unbraced if/else.
 */
#define RESOURCE_USAGE_END \
    do { \
        get_resource_usage(&end_utime, &end_stime); \
        utime_ms = (end_utime - start_utime) * 1000.0; \
        stime_ms = (end_stime - start_stime) * 1000.0; \
    } while (0)
/*
 * Performance test: format the same mixed ASCII / control / UTF-8 line
 * LOOP_COUNT times and report the user + system time consumed.
 */
static void test_format_text_perf(void)
{
#define LOOP_COUNT (1 * 1000 * 1000)
    const char *text = "The quick brown fox\tjumps over the lazy \001dog"UTF8_HORIZONTAL_ELLIPSIS"\n";
    /* These names are the ones the RESOURCE_USAGE_* macros refer to. */
    double start_utime, start_stime, end_utime, end_stime, utime_ms, stime_ms;
    int remaining = LOOP_COUNT;

    RESOURCE_USAGE_START;
    while (remaining-- > 0) {
        char *formatted = format_text_string(NULL, text);

        g_free(formatted);
    }
    RESOURCE_USAGE_END;

    g_test_minimized_result(utime_ms + stime_ms,
                            "format_text_string(): u %.3f ms s %.3f ms", utime_ms, stime_ms);
}
#include "to_str.h"
static void test_word_to_hex(void)
@ -743,6 +812,8 @@ int main(int argc, char **argv)
{
int ret;
ws_log_init("test_wsutil", NULL);
g_test_init(&argc, &argv, NULL);
g_test_add_func("/inet_addr/inet_pton4", test_inet_pton4_test1);
@ -755,6 +826,11 @@ int main(int argc, char **argv)
g_test_add_func("/str_util/strconcat", test_strconcat);
g_test_add_func("/str_util/strsplit", test_strsplit);
g_test_add_func("/str_util/str_ascii", test_str_ascii);
g_test_add_func("/str_util/format_text", test_format_text);
if (g_test_perf()) {
g_test_add_func("/str_util/format_text_perf", test_format_text_perf);
}
g_test_add_func("/to_str/word_to_hex", test_word_to_hex);
g_test_add_func("/to_str/bytes_to_str", test_bytes_to_str);