forked from osmocom/wireshark
Move format_text() to libwsutil and add unit tests
This commit is contained in:
parent
fd97378da4
commit
15634c0b46
482
epan/strutil.c
482
epan/strutil.c
|
@ -24,8 +24,6 @@
|
|||
#include <wchar.h>
|
||||
#endif
|
||||
|
||||
static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
|
||||
|
||||
/*
|
||||
* Given a pointer into a data buffer, and to the end of the buffer,
|
||||
|
@ -124,486 +122,6 @@ get_token_len(const guchar *linep, const guchar *lineend,
|
|||
return token_len;
|
||||
}
|
||||
|
||||
|
||||
#define INITIAL_FMTBUF_SIZE 128

/*
 * Declare, and initialize, the variables used for an output buffer.
 *
 * This expands to declarations (fmtbuf, fmtbuf_len, column), so it has to
 * be used where declarations are allowed, and it expects a variable named
 * "allocator" to be in scope.
 */
#define FMTBUF_VARS \
    gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE); \
    guint fmtbuf_len = INITIAL_FMTBUF_SIZE; \
    guint column = 0

/*
 * Expand the buffer to be large enough to add nbytes bytes, plus a
 * terminating '\0'.
 *
 * If there isn't room for those bytes and the terminating '\0', the
 * buffer's size is doubled.  The size of the buffer starts at 128, so
 * doubling its size adds at least another 128 bytes, which is more than
 * enough for any single escape sequence plus a terminating '\0'.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * e.g. as the body of an unbraced if/else.
 */
#define FMTBUF_EXPAND(nbytes) \
    do { \
        if (column + ((nbytes) + 1) >= fmtbuf_len) { \
            fmtbuf_len *= 2; \
            fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len); \
        } \
    } while (0)

/*
 * Put a byte into the buffer; space must have been ensured for it.
 */
#define FMTBUF_PUTCHAR(b) \
    do { \
        fmtbuf[column] = (b); \
        column++; \
    } while (0)

/*
 * Add the one-byte argument, as the three digits of an octal escape
 * sequence, to the end of the buffer.  The leading backslash is NOT
 * added here; the caller must have put it already.
 */
#define FMTBUF_PUTBYTE_OCTAL(b) \
    do { \
        FMTBUF_PUTCHAR((((b)>>6)&03) + '0'); \
        FMTBUF_PUTCHAR((((b)>>3)&07) + '0'); \
        FMTBUF_PUTCHAR((((b)>>0)&07) + '0'); \
    } while (0)

/*
 * Add the one-byte argument, as a hex escape sequence (backslash, 'x'
 * and two upper-case hex digits), to the end of the buffer.
 */
#define FMTBUF_PUTBYTE_HEX(b) \
    do { \
        FMTBUF_PUTCHAR('\\'); \
        FMTBUF_PUTCHAR('x'); \
        FMTBUF_PUTCHAR(hex[((b) >> 4) & 0xF]); \
        FMTBUF_PUTCHAR(hex[((b) >> 0) & 0xF]); \
    } while (0)

/*
 * Put the trailing '\0' at the end of the buffer.
 */
#define FMTBUF_ENDSTR \
    do { \
        fmtbuf[column] = '\0'; \
    } while (0)

/* REPLACEMENT CHARACTER */
#define UNREPL 0xFFFD

/* PILE OF POO */
#define UNPOOP 0x1F4A9
|
||||
|
||||
static gchar *
|
||||
format_text_internal(wmem_allocator_t *allocator,
|
||||
const guchar *string, size_t len,
|
||||
gboolean replace_space)
|
||||
{
|
||||
FMTBUF_VARS;
|
||||
const guchar *stringend = string + len;
|
||||
guchar c;
|
||||
bool is_valid_utf8 = true;
|
||||
|
||||
while (string < stringend) {
|
||||
/*
|
||||
* Get the first byte of this character.
|
||||
*/
|
||||
c = *string++;
|
||||
if (g_ascii_isprint(c)) {
|
||||
/*
|
||||
* Printable ASCII, so not part of a multi-byte UTF-8 sequence.
|
||||
* Make sure there's enough room for one more byte, and add
|
||||
* the character.
|
||||
*/
|
||||
FMTBUF_EXPAND(1);
|
||||
FMTBUF_PUTCHAR(c);
|
||||
} else if (replace_space && g_ascii_isspace(c)) {
|
||||
/*
|
||||
* ASCII, so not part of a multi-byte UTF-8 sequence, but
|
||||
* not printable, but is a space character; show it as a
|
||||
* blank.
|
||||
*
|
||||
* Make sure there's enough room for one more byte, and add
|
||||
* the blank.
|
||||
*/
|
||||
FMTBUF_EXPAND(1);
|
||||
FMTBUF_PUTCHAR(' ');
|
||||
} else if (c < 128) {
|
||||
/*
|
||||
* ASCII, so not part of a multi-byte UTF-8 sequence, but not
|
||||
* printable.
|
||||
*
|
||||
* That requires a minimum of 2 bytes, one for the backslash
|
||||
* and one for a letter, so make sure we have enough room
|
||||
* for that, plus a trailing '\0'.
|
||||
*/
|
||||
FMTBUF_EXPAND(2);
|
||||
FMTBUF_PUTCHAR('\\');
|
||||
switch (c) {
|
||||
|
||||
case '\a':
|
||||
FMTBUF_PUTCHAR('a');
|
||||
break;
|
||||
|
||||
case '\b':
|
||||
FMTBUF_PUTCHAR('b'); /* BS */
|
||||
break;
|
||||
|
||||
case '\f':
|
||||
FMTBUF_PUTCHAR('f'); /* FF */
|
||||
break;
|
||||
|
||||
case '\n':
|
||||
FMTBUF_PUTCHAR('n'); /* NL */
|
||||
break;
|
||||
|
||||
case '\r':
|
||||
FMTBUF_PUTCHAR('r'); /* CR */
|
||||
break;
|
||||
|
||||
case '\t':
|
||||
FMTBUF_PUTCHAR('t'); /* tab */
|
||||
break;
|
||||
|
||||
case '\v':
|
||||
FMTBUF_PUTCHAR('v');
|
||||
break;
|
||||
|
||||
default:
|
||||
/*
|
||||
* We've already put the backslash, but this
|
||||
* will put 3 more characters for the octal
|
||||
* number; make sure we have enough room for
|
||||
* that, plus the trailing '\0'.
|
||||
*/
|
||||
FMTBUF_EXPAND(3);
|
||||
FMTBUF_PUTBYTE_OCTAL(c);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* We've fetched the first byte of a multi-byte UTF-8
|
||||
* sequence into c.
|
||||
*/
|
||||
int utf8_len;
|
||||
guchar mask;
|
||||
gunichar uc;
|
||||
guchar first;
|
||||
|
||||
if ((c & 0xe0) == 0xc0) {
|
||||
/* Starts a 2-byte UTF-8 sequence; 1 byte left */
|
||||
utf8_len = 1;
|
||||
mask = 0x1f;
|
||||
} else if ((c & 0xf0) == 0xe0) {
|
||||
/* Starts a 3-byte UTF-8 sequence; 2 bytes left */
|
||||
utf8_len = 2;
|
||||
mask = 0x0f;
|
||||
} else if ((c & 0xf8) == 0xf0) {
|
||||
/* Starts a 4-byte UTF-8 sequence; 3 bytes left */
|
||||
utf8_len = 3;
|
||||
mask = 0x07;
|
||||
} else if ((c & 0xfc) == 0xf8) {
|
||||
/* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
|
||||
utf8_len = 4;
|
||||
mask = 0x03;
|
||||
} else if ((c & 0xfe) == 0xfc) {
|
||||
/* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
|
||||
utf8_len = 5;
|
||||
mask = 0x01;
|
||||
} else {
|
||||
/* 0xfe or 0xff or a continuation byte - not valid */
|
||||
utf8_len = -1;
|
||||
}
|
||||
if (utf8_len > 0) {
|
||||
/* Try to construct the Unicode character */
|
||||
uc = c & mask;
|
||||
for (int i = 0; i < utf8_len; i++) {
|
||||
if (string >= stringend) {
|
||||
/*
|
||||
* Ran out of octets, so the character is
|
||||
* incomplete. Put in a REPLACEMENT CHARACTER
|
||||
* instead, and then continue the loop, which
|
||||
* will terminate.
|
||||
*/
|
||||
uc = UNREPL;
|
||||
break;
|
||||
}
|
||||
c = *string;
|
||||
if ((c & 0xc0) != 0x80) {
|
||||
/*
|
||||
* Not valid UTF-8 continuation character; put in
|
||||
* a replacement character, and then re-process
|
||||
* this octet as the beginning of a new character.
|
||||
*/
|
||||
uc = UNREPL;
|
||||
break;
|
||||
}
|
||||
string++;
|
||||
uc = (uc << 6) | (c & 0x3f);
|
||||
}
|
||||
|
||||
/*
|
||||
* If this isn't a valid Unicode character, put in
|
||||
* a REPLACEMENT CHARACTER.
|
||||
*/
|
||||
if (!g_unichar_validate(uc))
|
||||
uc = UNREPL;
|
||||
} else {
|
||||
/* 0xfe or 0xff; put it a REPLACEMENT CHARACTER */
|
||||
uc = UNREPL;
|
||||
}
|
||||
|
||||
if (uc == UNREPL) {
|
||||
/* Flag this UTF-8 string as having been sanitized. */
|
||||
is_valid_utf8 = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, is it a printable Unicode character?
|
||||
*/
|
||||
if (g_unichar_isprint(uc)) {
|
||||
/*
|
||||
* Yes - put it into the string as UTF-8.
|
||||
* This means that if it was an overlong
|
||||
* encoding, this will put out the right
|
||||
* sized encoding.
|
||||
*/
|
||||
if (uc < 0x80) {
|
||||
first = 0;
|
||||
utf8_len = 1;
|
||||
} else if (uc < 0x800) {
|
||||
first = 0xc0;
|
||||
utf8_len = 2;
|
||||
} else if (uc < 0x10000) {
|
||||
first = 0xe0;
|
||||
utf8_len = 3;
|
||||
} else if (uc < 0x200000) {
|
||||
first = 0xf0;
|
||||
utf8_len = 4;
|
||||
} else if (uc < 0x4000000) {
|
||||
/*
|
||||
* This should never happen, as Unicode doesn't
|
||||
* go that high.
|
||||
*/
|
||||
first = 0xf8;
|
||||
utf8_len = 5;
|
||||
} else {
|
||||
/*
|
||||
* This should never happen, as Unicode doesn't
|
||||
* go that high.
|
||||
*/
|
||||
first = 0xfc;
|
||||
utf8_len = 6;
|
||||
}
|
||||
FMTBUF_EXPAND(utf8_len);
|
||||
for (int i = utf8_len - 1; i > 0; i--) {
|
||||
fmtbuf[column + i] = (uc & 0x3f) | 0x80;
|
||||
uc >>= 6;
|
||||
}
|
||||
fmtbuf[column] = uc | first;
|
||||
column += utf8_len;
|
||||
} else if (replace_space && g_unichar_isspace(uc)) {
|
||||
/*
|
||||
* Not printable, but is a space character; show it
|
||||
* as a blank.
|
||||
*
|
||||
* Make sure there's enough room for one more byte,
|
||||
* and add the blank.
|
||||
*/
|
||||
FMTBUF_EXPAND(1);
|
||||
FMTBUF_PUTCHAR(' ');
|
||||
} else if (c < 128) {
|
||||
/*
|
||||
* ASCII, but not printable.
|
||||
* Yes, this could happen with an overlong encoding.
|
||||
*
|
||||
* That requires a minimum of 2 bytes, one for the
|
||||
* backslash and one for a letter, so make sure we
|
||||
* have enough room for that, plus a trailing '\0'.
|
||||
*/
|
||||
FMTBUF_EXPAND(2);
|
||||
FMTBUF_PUTCHAR('\\');
|
||||
switch (c) {
|
||||
|
||||
case '\a':
|
||||
FMTBUF_PUTCHAR('a');
|
||||
break;
|
||||
|
||||
case '\b':
|
||||
FMTBUF_PUTCHAR('b'); /* BS */
|
||||
break;
|
||||
|
||||
case '\f':
|
||||
FMTBUF_PUTCHAR('f'); /* FF */
|
||||
break;
|
||||
|
||||
case '\n':
|
||||
FMTBUF_PUTCHAR('n'); /* NL */
|
||||
break;
|
||||
|
||||
case '\r':
|
||||
FMTBUF_PUTCHAR('r'); /* CR */
|
||||
break;
|
||||
|
||||
case '\t':
|
||||
FMTBUF_PUTCHAR('t'); /* tab */
|
||||
break;
|
||||
|
||||
case '\v':
|
||||
FMTBUF_PUTCHAR('v');
|
||||
break;
|
||||
|
||||
default:
|
||||
/*
|
||||
* We've already put the backslash, but this
|
||||
* will put 3 more characters for the octal
|
||||
* number; make sure we have enough room for
|
||||
* that, plus the trailing '\0'.
|
||||
*/
|
||||
FMTBUF_EXPAND(3);
|
||||
FMTBUF_PUTBYTE_OCTAL(c);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Unicode, but not printable, and not ASCII;
|
||||
* put it out as \uxxxx or \Uxxxxxxxx.
|
||||
*/
|
||||
if (uc <= 0xFFFF) {
|
||||
FMTBUF_EXPAND(6);
|
||||
FMTBUF_PUTCHAR('\\');
|
||||
FMTBUF_PUTCHAR('u');
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
|
||||
} else {
|
||||
FMTBUF_EXPAND(10);
|
||||
FMTBUF_PUTCHAR('\\');
|
||||
FMTBUF_PUTCHAR('U');
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FMTBUF_ENDSTR;
|
||||
|
||||
if (!is_valid_utf8) {
|
||||
/* This function expects valid UTF-8 as input. The extra validation performed is a safeguard.
|
||||
* In a brighter future it may be removed. Emit a warning and display the sanitized string. */
|
||||
ws_log_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, __FILE__, -1, __func__,
|
||||
"String argument contained UTF-8 errors: %s", fmtbuf);
|
||||
}
|
||||
return fmtbuf;
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a wmem scope, a not-necessarily-null-terminated string,
|
||||
* expected to be in UTF-8 but possibly containing invalid sequences
|
||||
* (as it may have come from packet data), and the length of the string,
|
||||
* generate a valid UTF-8 string from it, allocated in the specified
|
||||
* wmem scope, that:
|
||||
*
|
||||
* shows printable Unicode characters as themselves;
|
||||
*
|
||||
* shows non-printable ASCII characters as C-style escapes (octal
|
||||
* if not one of the standard ones such as LF -> '\n');
|
||||
*
|
||||
* shows non-printable Unicode-but-not-ASCII characters as
|
||||
* their universal character names;
|
||||
*
|
||||
* shows illegal UTF-8 sequences as a sequence of bytes represented
|
||||
* as C-style hex escapes (XXX: Does not actually do this. Some illegal
|
||||
* sequences, such as overlong encodings, the sequences reserved for
|
||||
* UTF-16 surrogate halves (paired or unpaired), and values outside
|
||||
* Unicode (i.e., the old sequences for code points above U+10FFFF)
|
||||
* will be decoded in a permissive way. Other illegal sequences,
|
||||
* such 0xFE and 0xFF and the presence of a continuation byte where
|
||||
* not expected (or vice versa its absence), are replaced with
|
||||
* REPLACEMENT CHARACTER.)
|
||||
*
|
||||
* and return a pointer to it.
|
||||
*/
|
||||
char *
|
||||
format_text(wmem_allocator_t *allocator,
|
||||
const char *string, size_t len)
|
||||
{
|
||||
return format_text_internal(allocator, string, len, FALSE);
|
||||
}
|
||||
|
||||
/** Given a wmem scope and a null-terminated string, expected to be in
|
||||
* UTF-8 but possibly containing invalid sequences (as it may have come
|
||||
* from packet data), and the length of the string, generate a valid
|
||||
* UTF-8 string from it, allocated in the specified wmem scope, that:
|
||||
*
|
||||
* shows printable Unicode characters as themselves;
|
||||
*
|
||||
* shows non-printable ASCII characters as C-style escapes (octal
|
||||
* if not one of the standard ones such as LF -> '\n');
|
||||
*
|
||||
* shows non-printable Unicode-but-not-ASCII characters as
|
||||
* their universal character names;
|
||||
*
|
||||
* shows illegal UTF-8 sequences as a sequence of bytes represented
|
||||
* as C-style hex escapes;
|
||||
*
|
||||
* and return a pointer to it.
|
||||
*/
|
||||
char *
|
||||
format_text_string(wmem_allocator_t* allocator, const char *string)
|
||||
{
|
||||
return format_text_internal(allocator, string, strlen(string), FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a string, generate a string from it that shows non-printable
|
||||
* characters as C-style escapes except a whitespace character
|
||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||
* which will be replaced by a space, and return a pointer to it.
|
||||
*/
|
||||
char *
|
||||
format_text_wsp(wmem_allocator_t* allocator, const char *string, size_t len)
|
||||
{
|
||||
return format_text_internal(allocator, string, len, TRUE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a string, generate a string from it that shows non-printable
|
||||
* characters as the chr parameter passed, except a whitespace character
|
||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||
* which will be replaced by a space, and return a pointer to it.
|
||||
*
|
||||
* This does *not* treat the input string as UTF-8.
|
||||
*
|
||||
* This is useful for displaying binary data that frequently but not always
|
||||
* contains text; otherwise the number of C escape codes makes it unreadable.
|
||||
*/
|
||||
char *
|
||||
format_text_chr(wmem_allocator_t *allocator, const char *string, size_t len, char chr)
|
||||
{
|
||||
wmem_strbuf_t *buf;
|
||||
|
||||
buf = wmem_strbuf_sized_new(allocator, len + 1, 0);
|
||||
for (const char *p = string; p < string + len; p++) {
|
||||
if (g_ascii_isprint(*p)) {
|
||||
wmem_strbuf_append_c(buf, *p);
|
||||
}
|
||||
else if (g_ascii_isspace(*p)) {
|
||||
wmem_strbuf_append_c(buf, ' ');
|
||||
}
|
||||
else {
|
||||
wmem_strbuf_append_c(buf, chr);
|
||||
}
|
||||
}
|
||||
return wmem_strbuf_finalize(buf);
|
||||
}
|
||||
|
||||
static gboolean
|
||||
is_byte_sep(guint8 c)
|
||||
{
|
||||
|
|
|
@ -47,81 +47,6 @@ WS_DLL_PUBLIC
|
|||
int get_token_len(const guchar *linep, const guchar *lineend,
|
||||
const guchar **next_token);
|
||||
|
||||
/** Given a wmem scope, a not-necessarily-null-terminated string,
|
||||
* expected to be in UTF-8, and the length of the string,
|
||||
* generate a valid UTF-8 string from it, allocated in the specified
|
||||
* wmem scope, that:
|
||||
*
|
||||
* shows printable Unicode characters as themselves;
|
||||
*
|
||||
* shows non-printable ASCII characters as C-style escapes (octal
|
||||
* if not one of the standard ones such as LF -> '\n');
|
||||
*
|
||||
* shows non-printable Unicode-but-not-ASCII characters as
|
||||
* their universal character names;
|
||||
*
|
||||
* replaces illegal UTF-8 sequences with U+FFFD (REPLACEMENT CHARACTER);
|
||||
*
|
||||
* and return a pointer to it.
|
||||
*
|
||||
* @param allocator The wmem scope
|
||||
* @param string A pointer to the input string
|
||||
* @param len The length of the input string
|
||||
* @return A pointer to the formatted string
|
||||
*
|
||||
* @see tvb_format_text()
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
char *format_text(wmem_allocator_t* allocator, const char *string, size_t len);
|
||||
|
||||
/** Same as format_text() but accepts a nul-terminated string.
|
||||
*
|
||||
* @param allocator The wmem scope
|
||||
* @param string A pointer to the input string
|
||||
* @return A pointer to the formatted string
|
||||
*
|
||||
* @see tvb_format_text()
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
char *format_text_string(wmem_allocator_t* allocator, const char *string);
|
||||
|
||||
/**
|
||||
* Same as format_text() but replaces any whitespace characters
|
||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||
* with a space.
|
||||
*
|
||||
* @param allocator The wmem scope
|
||||
* @param line A pointer to the input string
|
||||
* @param len The length of the input string
|
||||
* @return A pointer to the formatted string
|
||||
*
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
char *format_text_wsp(wmem_allocator_t* allocator, const char *line, size_t len);
|
||||
|
||||
/**
|
||||
* Given a string, generate a string from it that shows non-printable
|
||||
* characters as the chr parameter passed, except a whitespace character
|
||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||
* which will be replaced by a space, and return a pointer to it.
|
||||
*
|
||||
* This does *not* treat the input string as UTF-8.
|
||||
*
|
||||
* This is useful for displaying binary data that frequently but not always
|
||||
* contains text; otherwise the number of C escape codes makes it unreadable.
|
||||
*
|
||||
* @param allocator The wmem scope
|
||||
* @param string A pointer to the input string
|
||||
* @param len The length of the input string
|
||||
* @param chr The character to use to replace non-printable characters
|
||||
* @return A pointer to the formatted string
|
||||
*
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
char *format_text_chr(wmem_allocator_t *allocator,
|
||||
const char *string, size_t len, char chr);
|
||||
|
||||
|
||||
/** Turn a string of hex digits with optional separators (defined by
|
||||
* is_byte_sep() into a byte array.
|
||||
*
|
||||
|
|
|
@ -16,6 +16,10 @@
|
|||
|
||||
#include <wsutil/to_str.h>
|
||||
|
||||
|
||||
static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
|
||||
|
||||
gchar *
|
||||
wmem_strconcat(wmem_allocator_t *allocator, const gchar *first, ...)
|
||||
{
|
||||
|
@ -560,6 +564,485 @@ ws_strdup_underline(wmem_allocator_t *allocator, long offset, size_t len)
|
|||
return wmem_strbuf_finalize(buf);
|
||||
}
|
||||
|
||||
#define INITIAL_FMTBUF_SIZE 128

/*
 * Declare, and initialize, the variables used for an output buffer.
 *
 * This expands to declarations (fmtbuf, fmtbuf_len, column), so it has to
 * be used where declarations are allowed, and it expects a variable named
 * "allocator" to be in scope.
 */
#define FMTBUF_VARS \
    gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE); \
    guint fmtbuf_len = INITIAL_FMTBUF_SIZE; \
    guint column = 0

/*
 * Expand the buffer to be large enough to add nbytes bytes, plus a
 * terminating '\0'.
 *
 * If there isn't room for those bytes and the terminating '\0', the
 * buffer's size is doubled.  The size of the buffer starts at 128, so
 * doubling its size adds at least another 128 bytes, which is more than
 * enough for any single escape sequence plus a terminating '\0'.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * e.g. as the body of an unbraced if/else.
 */
#define FMTBUF_EXPAND(nbytes) \
    do { \
        if (column + ((nbytes) + 1) >= fmtbuf_len) { \
            fmtbuf_len *= 2; \
            fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len); \
        } \
    } while (0)

/*
 * Put a byte into the buffer; space must have been ensured for it.
 */
#define FMTBUF_PUTCHAR(b) \
    do { \
        fmtbuf[column] = (b); \
        column++; \
    } while (0)

/*
 * Add the one-byte argument, as the three digits of an octal escape
 * sequence, to the end of the buffer.  The leading backslash is NOT
 * added here; the caller must have put it already.
 */
#define FMTBUF_PUTBYTE_OCTAL(b) \
    do { \
        FMTBUF_PUTCHAR((((b)>>6)&03) + '0'); \
        FMTBUF_PUTCHAR((((b)>>3)&07) + '0'); \
        FMTBUF_PUTCHAR((((b)>>0)&07) + '0'); \
    } while (0)

/*
 * Add the one-byte argument, as a hex escape sequence (backslash, 'x'
 * and two upper-case hex digits), to the end of the buffer.
 */
#define FMTBUF_PUTBYTE_HEX(b) \
    do { \
        FMTBUF_PUTCHAR('\\'); \
        FMTBUF_PUTCHAR('x'); \
        FMTBUF_PUTCHAR(hex[((b) >> 4) & 0xF]); \
        FMTBUF_PUTCHAR(hex[((b) >> 0) & 0xF]); \
    } while (0)

/*
 * Put the trailing '\0' at the end of the buffer.
 */
#define FMTBUF_ENDSTR \
    do { \
        fmtbuf[column] = '\0'; \
    } while (0)

/* REPLACEMENT CHARACTER */
#define UNREPL 0xFFFD

/* PILE OF POO */
#define UNPOOP 0x1F4A9
|
||||
|
||||
static gchar *
|
||||
format_text_internal(wmem_allocator_t *allocator,
|
||||
const guchar *string, size_t len,
|
||||
gboolean replace_space)
|
||||
{
|
||||
FMTBUF_VARS;
|
||||
const guchar *stringend = string + len;
|
||||
guchar c;
|
||||
bool is_valid_utf8 = true;
|
||||
|
||||
while (string < stringend) {
|
||||
/*
|
||||
* Get the first byte of this character.
|
||||
*/
|
||||
c = *string++;
|
||||
if (g_ascii_isprint(c)) {
|
||||
/*
|
||||
* Printable ASCII, so not part of a multi-byte UTF-8 sequence.
|
||||
* Make sure there's enough room for one more byte, and add
|
||||
* the character.
|
||||
*/
|
||||
FMTBUF_EXPAND(1);
|
||||
FMTBUF_PUTCHAR(c);
|
||||
} else if (replace_space && g_ascii_isspace(c)) {
|
||||
/*
|
||||
* ASCII, so not part of a multi-byte UTF-8 sequence, but
|
||||
* not printable, but is a space character; show it as a
|
||||
* blank.
|
||||
*
|
||||
* Make sure there's enough room for one more byte, and add
|
||||
* the blank.
|
||||
*/
|
||||
FMTBUF_EXPAND(1);
|
||||
FMTBUF_PUTCHAR(' ');
|
||||
} else if (c < 128) {
|
||||
/*
|
||||
* ASCII, so not part of a multi-byte UTF-8 sequence, but not
|
||||
* printable.
|
||||
*
|
||||
* That requires a minimum of 2 bytes, one for the backslash
|
||||
* and one for a letter, so make sure we have enough room
|
||||
* for that, plus a trailing '\0'.
|
||||
*/
|
||||
FMTBUF_EXPAND(2);
|
||||
FMTBUF_PUTCHAR('\\');
|
||||
switch (c) {
|
||||
|
||||
case '\a':
|
||||
FMTBUF_PUTCHAR('a');
|
||||
break;
|
||||
|
||||
case '\b':
|
||||
FMTBUF_PUTCHAR('b'); /* BS */
|
||||
break;
|
||||
|
||||
case '\f':
|
||||
FMTBUF_PUTCHAR('f'); /* FF */
|
||||
break;
|
||||
|
||||
case '\n':
|
||||
FMTBUF_PUTCHAR('n'); /* NL */
|
||||
break;
|
||||
|
||||
case '\r':
|
||||
FMTBUF_PUTCHAR('r'); /* CR */
|
||||
break;
|
||||
|
||||
case '\t':
|
||||
FMTBUF_PUTCHAR('t'); /* tab */
|
||||
break;
|
||||
|
||||
case '\v':
|
||||
FMTBUF_PUTCHAR('v');
|
||||
break;
|
||||
|
||||
default:
|
||||
/*
|
||||
* We've already put the backslash, but this
|
||||
* will put 3 more characters for the octal
|
||||
* number; make sure we have enough room for
|
||||
* that, plus the trailing '\0'.
|
||||
*/
|
||||
FMTBUF_EXPAND(3);
|
||||
FMTBUF_PUTBYTE_OCTAL(c);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* We've fetched the first byte of a multi-byte UTF-8
|
||||
* sequence into c.
|
||||
*/
|
||||
int utf8_len;
|
||||
guchar mask;
|
||||
gunichar uc;
|
||||
guchar first;
|
||||
|
||||
if ((c & 0xe0) == 0xc0) {
|
||||
/* Starts a 2-byte UTF-8 sequence; 1 byte left */
|
||||
utf8_len = 1;
|
||||
mask = 0x1f;
|
||||
} else if ((c & 0xf0) == 0xe0) {
|
||||
/* Starts a 3-byte UTF-8 sequence; 2 bytes left */
|
||||
utf8_len = 2;
|
||||
mask = 0x0f;
|
||||
} else if ((c & 0xf8) == 0xf0) {
|
||||
/* Starts a 4-byte UTF-8 sequence; 3 bytes left */
|
||||
utf8_len = 3;
|
||||
mask = 0x07;
|
||||
} else if ((c & 0xfc) == 0xf8) {
|
||||
/* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
|
||||
utf8_len = 4;
|
||||
mask = 0x03;
|
||||
} else if ((c & 0xfe) == 0xfc) {
|
||||
/* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
|
||||
utf8_len = 5;
|
||||
mask = 0x01;
|
||||
} else {
|
||||
/* 0xfe or 0xff or a continuation byte - not valid */
|
||||
utf8_len = -1;
|
||||
}
|
||||
if (utf8_len > 0) {
|
||||
/* Try to construct the Unicode character */
|
||||
uc = c & mask;
|
||||
for (int i = 0; i < utf8_len; i++) {
|
||||
if (string >= stringend) {
|
||||
/*
|
||||
* Ran out of octets, so the character is
|
||||
* incomplete. Put in a REPLACEMENT CHARACTER
|
||||
* instead, and then continue the loop, which
|
||||
* will terminate.
|
||||
*/
|
||||
uc = UNREPL;
|
||||
break;
|
||||
}
|
||||
c = *string;
|
||||
if ((c & 0xc0) != 0x80) {
|
||||
/*
|
||||
* Not valid UTF-8 continuation character; put in
|
||||
* a replacement character, and then re-process
|
||||
* this octet as the beginning of a new character.
|
||||
*/
|
||||
uc = UNREPL;
|
||||
break;
|
||||
}
|
||||
string++;
|
||||
uc = (uc << 6) | (c & 0x3f);
|
||||
}
|
||||
|
||||
/*
|
||||
* If this isn't a valid Unicode character, put in
|
||||
* a REPLACEMENT CHARACTER.
|
||||
*/
|
||||
if (!g_unichar_validate(uc))
|
||||
uc = UNREPL;
|
||||
} else {
|
||||
/* 0xfe or 0xff; put it a REPLACEMENT CHARACTER */
|
||||
uc = UNREPL;
|
||||
}
|
||||
|
||||
if (uc == UNREPL) {
|
||||
/* Flag this UTF-8 string as having been sanitized. */
|
||||
is_valid_utf8 = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, is it a printable Unicode character?
|
||||
*/
|
||||
if (g_unichar_isprint(uc)) {
|
||||
/*
|
||||
* Yes - put it into the string as UTF-8.
|
||||
* This means that if it was an overlong
|
||||
* encoding, this will put out the right
|
||||
* sized encoding.
|
||||
*/
|
||||
if (uc < 0x80) {
|
||||
first = 0;
|
||||
utf8_len = 1;
|
||||
} else if (uc < 0x800) {
|
||||
first = 0xc0;
|
||||
utf8_len = 2;
|
||||
} else if (uc < 0x10000) {
|
||||
first = 0xe0;
|
||||
utf8_len = 3;
|
||||
} else if (uc < 0x200000) {
|
||||
first = 0xf0;
|
||||
utf8_len = 4;
|
||||
} else if (uc < 0x4000000) {
|
||||
/*
|
||||
* This should never happen, as Unicode doesn't
|
||||
* go that high.
|
||||
*/
|
||||
first = 0xf8;
|
||||
utf8_len = 5;
|
||||
} else {
|
||||
/*
|
||||
* This should never happen, as Unicode doesn't
|
||||
* go that high.
|
||||
*/
|
||||
first = 0xfc;
|
||||
utf8_len = 6;
|
||||
}
|
||||
FMTBUF_EXPAND(utf8_len);
|
||||
for (int i = utf8_len - 1; i > 0; i--) {
|
||||
fmtbuf[column + i] = (uc & 0x3f) | 0x80;
|
||||
uc >>= 6;
|
||||
}
|
||||
fmtbuf[column] = uc | first;
|
||||
column += utf8_len;
|
||||
} else if (replace_space && g_unichar_isspace(uc)) {
|
||||
/*
|
||||
* Not printable, but is a space character; show it
|
||||
* as a blank.
|
||||
*
|
||||
* Make sure there's enough room for one more byte,
|
||||
* and add the blank.
|
||||
*/
|
||||
FMTBUF_EXPAND(1);
|
||||
FMTBUF_PUTCHAR(' ');
|
||||
} else if (c < 128) {
|
||||
/*
|
||||
* ASCII, but not printable.
|
||||
* Yes, this could happen with an overlong encoding.
|
||||
*
|
||||
* That requires a minimum of 2 bytes, one for the
|
||||
* backslash and one for a letter, so make sure we
|
||||
* have enough room for that, plus a trailing '\0'.
|
||||
*/
|
||||
FMTBUF_EXPAND(2);
|
||||
FMTBUF_PUTCHAR('\\');
|
||||
switch (c) {
|
||||
|
||||
case '\a':
|
||||
FMTBUF_PUTCHAR('a');
|
||||
break;
|
||||
|
||||
case '\b':
|
||||
FMTBUF_PUTCHAR('b'); /* BS */
|
||||
break;
|
||||
|
||||
case '\f':
|
||||
FMTBUF_PUTCHAR('f'); /* FF */
|
||||
break;
|
||||
|
||||
case '\n':
|
||||
FMTBUF_PUTCHAR('n'); /* NL */
|
||||
break;
|
||||
|
||||
case '\r':
|
||||
FMTBUF_PUTCHAR('r'); /* CR */
|
||||
break;
|
||||
|
||||
case '\t':
|
||||
FMTBUF_PUTCHAR('t'); /* tab */
|
||||
break;
|
||||
|
||||
case '\v':
|
||||
FMTBUF_PUTCHAR('v');
|
||||
break;
|
||||
|
||||
default:
|
||||
/*
|
||||
* We've already put the backslash, but this
|
||||
* will put 3 more characters for the octal
|
||||
* number; make sure we have enough room for
|
||||
* that, plus the trailing '\0'.
|
||||
*/
|
||||
FMTBUF_EXPAND(3);
|
||||
FMTBUF_PUTBYTE_OCTAL(c);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* Unicode, but not printable, and not ASCII;
|
||||
* put it out as \uxxxx or \Uxxxxxxxx.
|
||||
*/
|
||||
if (uc <= 0xFFFF) {
|
||||
FMTBUF_EXPAND(6);
|
||||
FMTBUF_PUTCHAR('\\');
|
||||
FMTBUF_PUTCHAR('u');
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
|
||||
} else {
|
||||
FMTBUF_EXPAND(10);
|
||||
FMTBUF_PUTCHAR('\\');
|
||||
FMTBUF_PUTCHAR('U');
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
|
||||
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FMTBUF_ENDSTR;
|
||||
|
||||
if (!is_valid_utf8) {
|
||||
/* This function expects valid UTF-8 as input. The extra validation performed is a safeguard.
|
||||
* In a brighter future it may be removed. Emit a warning and display the sanitized string. */
|
||||
ws_log_full(LOG_DOMAIN_UTF_8, LOG_LEVEL_DEBUG, __FILE__, -1, __func__,
|
||||
"String argument contained UTF-8 errors: %s", fmtbuf);
|
||||
}
|
||||
return fmtbuf;
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a wmem scope, a not-necessarily-null-terminated string,
|
||||
* expected to be in UTF-8 but possibly containing invalid sequences
|
||||
* (as it may have come from packet data), and the length of the string,
|
||||
* generate a valid UTF-8 string from it, allocated in the specified
|
||||
* wmem scope, that:
|
||||
*
|
||||
* shows printable Unicode characters as themselves;
|
||||
*
|
||||
* shows non-printable ASCII characters as C-style escapes (octal
|
||||
* if not one of the standard ones such as LF -> '\n');
|
||||
*
|
||||
* shows non-printable Unicode-but-not-ASCII characters as
|
||||
* their universal character names;
|
||||
*
|
||||
* shows illegal UTF-8 sequences as a sequence of bytes represented
|
||||
* as C-style hex escapes (XXX: Does not actually do this. Some illegal
|
||||
* sequences, such as overlong encodings, the sequences reserved for
|
||||
* UTF-16 surrogate halves (paired or unpaired), and values outside
|
||||
* Unicode (i.e., the old sequences for code points above U+10FFFF)
|
||||
* will be decoded in a permissive way. Other illegal sequences,
|
||||
* such 0xFE and 0xFF and the presence of a continuation byte where
|
||||
* not expected (or vice versa its absence), are replaced with
|
||||
* REPLACEMENT CHARACTER.)
|
||||
*
|
||||
* and return a pointer to it.
|
||||
*/
|
||||
char *
|
||||
format_text(wmem_allocator_t *allocator,
|
||||
const char *string, size_t len)
|
||||
{
|
||||
return format_text_internal(allocator, string, len, FALSE);
|
||||
}
|
||||
|
||||
/** Given a wmem scope and a null-terminated string, expected to be in
|
||||
* UTF-8 but possibly containing invalid sequences (as it may have come
|
||||
* from packet data), and the length of the string, generate a valid
|
||||
* UTF-8 string from it, allocated in the specified wmem scope, that:
|
||||
*
|
||||
* shows printable Unicode characters as themselves;
|
||||
*
|
||||
* shows non-printable ASCII characters as C-style escapes (octal
|
||||
* if not one of the standard ones such as LF -> '\n');
|
||||
*
|
||||
* shows non-printable Unicode-but-not-ASCII characters as
|
||||
* their universal character names;
|
||||
*
|
||||
* shows illegal UTF-8 sequences as a sequence of bytes represented
|
||||
* as C-style hex escapes;
|
||||
*
|
||||
* and return a pointer to it.
|
||||
*/
|
||||
char *
|
||||
format_text_string(wmem_allocator_t* allocator, const char *string)
|
||||
{
|
||||
return format_text_internal(allocator, string, strlen(string), FALSE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a string, generate a string from it that shows non-printable
|
||||
* characters as C-style escapes except a whitespace character
|
||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||
* which will be replaced by a space, and return a pointer to it.
|
||||
*/
|
||||
char *
|
||||
format_text_wsp(wmem_allocator_t* allocator, const char *string, size_t len)
|
||||
{
|
||||
return format_text_internal(allocator, string, len, TRUE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a string, generate a string from it that shows non-printable
|
||||
* characters as the chr parameter passed, except a whitespace character
|
||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||
* which will be replaced by a space, and return a pointer to it.
|
||||
*
|
||||
* This does *not* treat the input string as UTF-8.
|
||||
*
|
||||
* This is useful for displaying binary data that frequently but not always
|
||||
* contains text; otherwise the number of C escape codes makes it unreadable.
|
||||
*/
|
||||
char *
|
||||
format_text_chr(wmem_allocator_t *allocator, const char *string, size_t len, char chr)
|
||||
{
|
||||
wmem_strbuf_t *buf;
|
||||
|
||||
buf = wmem_strbuf_sized_new(allocator, len + 1, 0);
|
||||
for (const char *p = string; p < string + len; p++) {
|
||||
if (g_ascii_isprint(*p)) {
|
||||
wmem_strbuf_append_c(buf, *p);
|
||||
}
|
||||
else if (g_ascii_isspace(*p)) {
|
||||
wmem_strbuf_append_c(buf, ' ');
|
||||
}
|
||||
else {
|
||||
wmem_strbuf_append_c(buf, chr);
|
||||
}
|
||||
}
|
||||
return wmem_strbuf_finalize(buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Editor modelines - https://www.wireshark.org/tools/modelines.html
|
||||
*
|
||||
|
|
|
@ -213,6 +213,80 @@ const char *ws_strerrorname_r(int errnum, char *buf, size_t buf_size);
|
|||
WS_DLL_PUBLIC
|
||||
char *ws_strdup_underline(wmem_allocator_t *allocator, long offset, size_t len);
|
||||
|
||||
/** Given a wmem scope, a not-necessarily-null-terminated string,
|
||||
* expected to be in UTF-8 and the length of the string,
|
||||
* generate a valid UTF-8 string from it, allocated in the specified
|
||||
* wmem scope, that:
|
||||
*
|
||||
* shows printable Unicode characters as themselves;
|
||||
*
|
||||
* shows non-printable ASCII characters as C-style escapes (octal
|
||||
* if not one of the standard ones such as LF -> '\n');
|
||||
*
|
||||
* shows non-printable Unicode-but-not-ASCII characters as
|
||||
* their universal character names;
|
||||
*
|
||||
* replaces illegal UTF-8 sequences with U+FFFD (replacement character);
|
||||
*
|
||||
* and return a pointer to it.
|
||||
*
|
||||
* @param allocator The wmem scope
|
||||
* @param string A pointer to the input string
|
||||
* @param len The length of the input string
|
||||
* @return A pointer to the formatted string
|
||||
*
|
||||
* @see tvb_format_text()
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
char *format_text(wmem_allocator_t* allocator, const char *string, size_t len);
|
||||
|
||||
/** Same as format_text() but accepts a nul-terminated string.
|
||||
*
|
||||
* @param allocator The wmem scope
|
||||
* @param string A pointer to the input string
|
||||
* @return A pointer to the formatted string
|
||||
*
|
||||
* @see tvb_format_text()
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
char *format_text_string(wmem_allocator_t* allocator, const char *string);
|
||||
|
||||
/**
|
||||
* Same as format_text() but replaces any whitespace characters
|
||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||
* with a space.
|
||||
*
|
||||
* @param allocator The wmem scope
|
||||
* @param line A pointer to the input string
|
||||
* @param len The length of the input string
|
||||
* @return A pointer to the formatted string
|
||||
*
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
char *format_text_wsp(wmem_allocator_t* allocator, const char *line, size_t len);
|
||||
|
||||
/**
|
||||
* Given a string, generate a string from it that shows non-printable
|
||||
* characters as the chr parameter passed, except a whitespace character
|
||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||
* which will be replaced by a space, and return a pointer to it.
|
||||
*
|
||||
* This does *not* treat the input string as UTF-8.
|
||||
*
|
||||
* This is useful for displaying binary data that frequently but not always
|
||||
* contains text; otherwise the number of C escape codes makes it unreadable.
|
||||
*
|
||||
* @param allocator The wmem scope
|
||||
* @param string A pointer to the input string
|
||||
* @param len The length of the input string
|
||||
* @param chr The character to use to replace non-printable characters
|
||||
* @return A pointer to the formatted string
|
||||
*
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
char *format_text_chr(wmem_allocator_t *allocator,
|
||||
const char *string, size_t len, char chr);
|
||||
|
||||
/* To pass one of two strings, singular or plural */
|
||||
/* Selects s only when d compares equal to 1; any other d selects p. */
#define plurality(d,s,p) ((d) != 1 ? (p) : (s))
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include <stdio.h>
|
||||
#include <glib.h>
|
||||
#include <wsutil/utf8_entities.h>
|
||||
#include <wsutil/time_util.h>
|
||||
|
||||
#include "inet_addr.h"
|
||||
|
||||
|
@ -185,6 +186,74 @@ static void test_str_ascii(void)
|
|||
wmem_destroy_allocator(allocator);
|
||||
}
|
||||
|
||||
static void test_format_text(void)
|
||||
{
|
||||
const char *have, *want;
|
||||
char *res;
|
||||
|
||||
/* ASCII */
|
||||
have = "abcdef";
|
||||
want = "abcdef";
|
||||
res = format_text_string(NULL, have);
|
||||
g_assert_cmpstr(res, ==, want);
|
||||
g_free(res);
|
||||
|
||||
/* ASCII with special escape characters. */
|
||||
have = "abc\td\fe\nf";
|
||||
want = "abc\\td\\fe\\nf";
|
||||
res = format_text_string(NULL, have);
|
||||
g_assert_cmpstr(res, ==, want);
|
||||
g_free(res);
|
||||
|
||||
/* ASCII with non-printable characters. */
|
||||
have = "abc \004 def";
|
||||
want = "abc \\004 def";
|
||||
res = format_text_string(NULL, have);
|
||||
g_assert_cmpstr(res, ==, want);
|
||||
g_free(res);
|
||||
|
||||
/* UTF-8 */
|
||||
have = u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο";
|
||||
want = u8"Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο";
|
||||
res = format_text_string(NULL, have);
|
||||
g_assert_cmpstr(res, ==, want);
|
||||
g_free(res);
|
||||
|
||||
/* UTF-8 with non-ASCII non-printable characters. */
|
||||
have = u8"String with BOM \ufeff";
|
||||
want = u8"String with BOM \\uFEFF";
|
||||
res = format_text_string(NULL, have);
|
||||
g_assert_cmpstr(res, ==, want);
|
||||
g_free(res);
|
||||
|
||||
}
|
||||
|
||||
/*
 * Calls get_resource_usage() to fill in start_utime and start_stime.
 * Both variables must be declared (as double) where the macro is used.
 */
#define RESOURCE_USAGE_START get_resource_usage(&start_utime, &start_stime)

/*
 * Calls get_resource_usage() to fill in end_utime and end_stime, then
 * stores the start-to-end differences, multiplied by 1000.0, in utime_ms
 * and stime_ms (presumably converting seconds to milliseconds - confirm
 * against get_resource_usage()).  All six variables must be in scope.
 *
 * The statements are wrapped in do { } while (0) so that the macro
 * expands to a single statement; without it, only the first statement
 * would be controlled by an unbraced if/else or loop.
 */
#define RESOURCE_USAGE_END \
    do { \
        get_resource_usage(&end_utime, &end_stime); \
        utime_ms = (end_utime - start_utime) * 1000.0; \
        stime_ms = (end_stime - start_stime) * 1000.0; \
    } while (0)
|
||||
|
||||
static void test_format_text_perf(void)
|
||||
{
|
||||
#define LOOP_COUNT (1 * 1000 * 1000)
|
||||
char *str;
|
||||
int i;
|
||||
double start_utime, start_stime, end_utime, end_stime, utime_ms, stime_ms;
|
||||
|
||||
const char *text = "The quick brown fox\tjumps over the lazy \001dog"UTF8_HORIZONTAL_ELLIPSIS"\n";
|
||||
|
||||
RESOURCE_USAGE_START;
|
||||
for (i = 0; i < LOOP_COUNT; i++) {
|
||||
str = format_text_string(NULL, text);
|
||||
g_free(str);
|
||||
}
|
||||
RESOURCE_USAGE_END;
|
||||
g_test_minimized_result(utime_ms + stime_ms,
|
||||
"format_text_string(): u %.3f ms s %.3f ms", utime_ms, stime_ms);
|
||||
}
|
||||
|
||||
#include "to_str.h"
|
||||
|
||||
static void test_word_to_hex(void)
|
||||
|
@ -743,6 +812,8 @@ int main(int argc, char **argv)
|
|||
{
|
||||
int ret;
|
||||
|
||||
ws_log_init("test_wsutil", NULL);
|
||||
|
||||
g_test_init(&argc, &argv, NULL);
|
||||
|
||||
g_test_add_func("/inet_addr/inet_pton4", test_inet_pton4_test1);
|
||||
|
@ -755,6 +826,11 @@ int main(int argc, char **argv)
|
|||
g_test_add_func("/str_util/strconcat", test_strconcat);
|
||||
g_test_add_func("/str_util/strsplit", test_strsplit);
|
||||
g_test_add_func("/str_util/str_ascii", test_str_ascii);
|
||||
g_test_add_func("/str_util/format_text", test_format_text);
|
||||
|
||||
if (g_test_perf()) {
|
||||
g_test_add_func("/str_util/format_text_perf", test_format_text_perf);
|
||||
}
|
||||
|
||||
g_test_add_func("/to_str/word_to_hex", test_word_to_hex);
|
||||
g_test_add_func("/to_str/bytes_to_str", test_bytes_to_str);
|
||||
|
|
Loading…
Reference in New Issue