Don't format printable non-ASCII Unicode characters as escape sequences.
Note that even strings fetched with ENC_ASCII may contain such characters: bytes with the 8th bit set get mapped to REPLACEMENT CHARACTER. This means we can format STR_UNICODE fields with format_text(); do so.

Bug: 1372
Change-Id: Ia32c3a92d220ac5174ecd25f33e2d1f85cfb8cb8
Reviewed-on: https://code.wireshark.org/review/34080
Reviewed-by: Guy Harris <guy@alum.mit.edu>
This commit is contained in:
parent
dd5f2bd054
commit
edd5eaa57e
|
@ -1016,8 +1016,7 @@ hfinfo_format_text(wmem_allocator_t *scope, const header_field_info *hfinfo,
|
||||||
return format_text_wsp(string, strlen(string));
|
return format_text_wsp(string, strlen(string));
|
||||||
*/
|
*/
|
||||||
case STR_UNICODE:
|
case STR_UNICODE:
|
||||||
/* XXX, format_unicode_text() */
|
return format_text(scope, string, strlen(string));
|
||||||
return wmem_strdup(scope, string);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return format_text(scope, string, strlen(string));
|
return format_text(scope, string, strlen(string));
|
||||||
|
|
742
epan/strutil.c
742
epan/strutil.c
|
@ -128,100 +128,364 @@ get_token_len(const guchar *linep, const guchar *lineend,
|
||||||
#define INITIAL_FMTBUF_SIZE 128
|
#define INITIAL_FMTBUF_SIZE 128
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Given a string, generate a string from it that shows non-printable
|
* Declare, and initialize, the variables used for an output buffer.
|
||||||
* characters as C-style escapes, and return a pointer to it.
|
*/
|
||||||
|
#define FMTBUF_VARS \
|
||||||
|
gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE); \
|
||||||
|
guint fmtbuf_len = INITIAL_FMTBUF_SIZE; \
|
||||||
|
guint column = 0
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Expand the buffer to be large enough to add nbytes bytes, plus a
|
||||||
|
* terminating '\0'.
|
||||||
|
*/
|
||||||
|
#define FMTBUF_EXPAND(nbytes) \
|
||||||
|
/* \
|
||||||
|
* Is there enough room for those bytes and also enough room for \
|
||||||
|
* a terminating '\0'? \
|
||||||
|
*/ \
|
||||||
|
if (column+(nbytes+1) >= fmtbuf_len) { \
|
||||||
|
/* \
|
||||||
|
* Double the buffer's size if it's not big enough. \
|
||||||
|
* The size of the buffer starts at 128, so doubling its size \
|
||||||
|
* adds at least another 128 bytes, which is more than enough \
|
||||||
|
* for one more character plus a terminating '\0'. \
|
||||||
|
*/ \
|
||||||
|
fmtbuf_len *= 2; \
|
||||||
|
fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Put a byte into the buffer; space must have been ensured for it.
|
||||||
|
*/
|
||||||
|
#define FMTBUF_PUTCHAR(b) \
|
||||||
|
fmtbuf[column] = (b); \
|
||||||
|
column++
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add the one-byte argument, as an octal escape sequence, to the end
|
||||||
|
* of the buffer.
|
||||||
|
*/
|
||||||
|
#define FMTBUF_PUTBYTE_OCTAL(b) \
|
||||||
|
FMTBUF_PUTCHAR((((b)>>6)&03) + '0'); \
|
||||||
|
FMTBUF_PUTCHAR((((b)>>3)&07) + '0'); \
|
||||||
|
FMTBUF_PUTCHAR((((b)>>0)&07) + '0')
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Add the one-byte argument, as a hex escape sequence, to the end
|
||||||
|
* of the buffer.
|
||||||
|
*/
|
||||||
|
#define FMTBUF_PUTBYTE_HEX(b) \
|
||||||
|
FMTBUF_PUTCHAR('\\'); \
|
||||||
|
FMTBUF_PUTCHAR('x'); \
|
||||||
|
FMTBUF_PUTCHAR(hex[((b) >> 4) & 0xF]); \
|
||||||
|
FMTBUF_PUTCHAR(hex[((b) >> 0) & 0xF])
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Put the trailing '\0' at the end of the buffer.
|
||||||
|
*/
|
||||||
|
#define FMTBUF_ENDSTR \
|
||||||
|
fmtbuf[column] = '\0'
|
||||||
|
|
||||||
|
/* REPLACEMENT CHARACTER */
|
||||||
|
#define UNREPL 0xFFFD
|
||||||
|
|
||||||
|
#define UNPOOP 0x1F4A9
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Given a string, expected to be in UTF-8 but possibly containing
|
||||||
|
* invalid sequences (as it may have come from packet data), generate
|
||||||
|
* a valid UTF-8 string from it, allocated with the specified wmem
|
||||||
|
* allocator, that:
|
||||||
|
*
|
||||||
|
* shows printable Unicode characters as themselves;
|
||||||
|
*
|
||||||
|
* shows non-printable ASCII characters as C-style escapes (octal
|
||||||
|
* if not one of the standard ones such as LF -> '\n');
|
||||||
|
*
|
||||||
|
* shows non-printable Unicode-but-not-ASCII characters as
|
||||||
|
* their universal character names;
|
||||||
|
*
|
||||||
|
* shows illegal UTF-8 sequences as a sequence of bytes represented
|
||||||
|
* as C-style hex escapes;
|
||||||
|
*
|
||||||
|
* and return a pointer to it.
|
||||||
*/
|
*/
|
||||||
gchar *
|
gchar *
|
||||||
format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
|
format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
|
||||||
{
|
{
|
||||||
gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE);
|
FMTBUF_VARS;
|
||||||
int fmtbuf_len = INITIAL_FMTBUF_SIZE;
|
|
||||||
int column;
|
|
||||||
const guchar *stringend = string + len;
|
const guchar *stringend = string + len;
|
||||||
guchar c;
|
guchar c;
|
||||||
int i;
|
|
||||||
|
|
||||||
column = 0;
|
|
||||||
while (string < stringend) {
|
while (string < stringend) {
|
||||||
/*
|
/*
|
||||||
* Is there enough room for this character, if it expands to
|
* Get the first byte of this character.
|
||||||
* a backslash plus 3 octal digits (which is the most it can
|
|
||||||
* expand to), and also enough room for a terminating '\0'?
|
|
||||||
*/
|
*/
|
||||||
if (column+3+1 >= fmtbuf_len) {
|
|
||||||
/*
|
|
||||||
* Double the buffer's size if it's not big enough.
|
|
||||||
* The size of the buffer starts at 128, so doubling its size
|
|
||||||
* adds at least another 128 bytes, which is more than enough
|
|
||||||
* for one more character plus a terminating '\0'.
|
|
||||||
*/
|
|
||||||
fmtbuf_len *= 2;
|
|
||||||
fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len);
|
|
||||||
}
|
|
||||||
c = *string++;
|
c = *string++;
|
||||||
|
|
||||||
if (g_ascii_isprint(c)) {
|
if (g_ascii_isprint(c)) {
|
||||||
fmtbuf[column] = c;
|
/*
|
||||||
column++;
|
* Printable ASCII, so not part of a multi-byte UTF-8 sequence.
|
||||||
} else {
|
* Make sure there's enough room for one more byte, and add
|
||||||
fmtbuf[column] = '\\';
|
* the character.
|
||||||
column++;
|
*/
|
||||||
|
FMTBUF_EXPAND(1);
|
||||||
|
FMTBUF_PUTCHAR(c);
|
||||||
|
} else if (c < 128) {
|
||||||
|
/*
|
||||||
|
* ASCII, so not part of a multi-byte UTF-8 sequence, but not
|
||||||
|
* printable.
|
||||||
|
*
|
||||||
|
* That requires a minimum of 2 bytes, one for the backslash
|
||||||
|
* and one for a letter, so make sure we have enough room
|
||||||
|
* for that, plus a trailing '\0'.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(2);
|
||||||
|
FMTBUF_PUTCHAR('\\');
|
||||||
switch (c) {
|
switch (c) {
|
||||||
|
|
||||||
case '\a':
|
case '\a':
|
||||||
fmtbuf[column] = 'a';
|
FMTBUF_PUTCHAR('a');
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\b':
|
case '\b':
|
||||||
fmtbuf[column] = 'b'; /* BS */
|
FMTBUF_PUTCHAR('b'); /* BS */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\f':
|
case '\f':
|
||||||
fmtbuf[column] = 'f'; /* FF */
|
FMTBUF_PUTCHAR('f'); /* FF */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\n':
|
case '\n':
|
||||||
fmtbuf[column] = 'n'; /* NL */
|
FMTBUF_PUTCHAR('n'); /* NL */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\r':
|
case '\r':
|
||||||
fmtbuf[column] = 'r'; /* CR */
|
FMTBUF_PUTCHAR('r'); /* CR */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\t':
|
case '\t':
|
||||||
fmtbuf[column] = 't'; /* tab */
|
FMTBUF_PUTCHAR('t'); /* tab */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\v':
|
case '\v':
|
||||||
fmtbuf[column] = 'v';
|
FMTBUF_PUTCHAR('v');
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
i = (c>>6)&03;
|
/*
|
||||||
fmtbuf[column] = i + '0';
|
* We've already put the backslash, but this
|
||||||
column++;
|
* will put 3 more characters for the octal
|
||||||
i = (c>>3)&07;
|
* number; make sure we have enough room for
|
||||||
fmtbuf[column] = i + '0';
|
* that, plus the trailing '\0'.
|
||||||
column++;
|
*/
|
||||||
i = (c>>0)&07;
|
FMTBUF_EXPAND(3);
|
||||||
fmtbuf[column] = i + '0';
|
FMTBUF_PUTBYTE_OCTAL(c);
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* We've fetched the first byte of a multi-byte UTF-8
|
||||||
|
* sequence into c.
|
||||||
|
*/
|
||||||
|
int utf8_len;
|
||||||
|
guchar mask;
|
||||||
|
gunichar uc;
|
||||||
|
guchar first;
|
||||||
|
|
||||||
|
if ((c & 0xe8) == 0xc0) {
|
||||||
|
/* Starts a 2-byte UTF-8 sequence; 1 byte left */
|
||||||
|
utf8_len = 1;
|
||||||
|
mask = 0x1f;
|
||||||
|
} else if ((c & 0xf0) == 0xe0) {
|
||||||
|
/* Starts a 3-byte UTF-8 sequence; 2 bytes left */
|
||||||
|
utf8_len = 2;
|
||||||
|
mask = 0x0f;
|
||||||
|
} else if ((c & 0xf8) == 0xf0) {
|
||||||
|
/* Starts a 4-byte UTF-8 sequence; 3 bytes left */
|
||||||
|
utf8_len = 3;
|
||||||
|
mask = 0x07;
|
||||||
|
} else if ((c & 0xfc) == 0xf8) {
|
||||||
|
/* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
|
||||||
|
utf8_len = 4;
|
||||||
|
mask = 0x03;
|
||||||
|
} else if ((c & 0xfe) == 0xfc) {
|
||||||
|
/* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
|
||||||
|
utf8_len = 5;
|
||||||
|
mask = 0x01;
|
||||||
|
} else {
|
||||||
|
/* 0xfe or 0xff - not valid */
|
||||||
|
utf8_len = -1;
|
||||||
|
}
|
||||||
|
if (utf8_len > 0) {
|
||||||
|
/* Try to construct the Unicode character */
|
||||||
|
uc = c & mask;
|
||||||
|
for (int i = 0; i < utf8_len; i++) {
|
||||||
|
if (string >= stringend) {
|
||||||
|
/*
|
||||||
|
* Ran out of octets, so the character is
|
||||||
|
* incomplete. Put in a REPLACEMENT CHARACTER
|
||||||
|
* instead, and then continue the loop, which
|
||||||
|
* will terminate.
|
||||||
|
*/
|
||||||
|
uc = UNREPL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
c = *string;
|
||||||
|
if ((c & 0xc0) != 0x80) {
|
||||||
|
/*
|
||||||
|
* Not valid UTF-8 continuation character; put in
|
||||||
|
* a replacement character, and then re-process
|
||||||
|
* this octet as the beginning of a new character.
|
||||||
|
*/
|
||||||
|
uc = UNREPL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
string++;
|
||||||
|
uc = (uc << 6) | (c & 0x3f);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If this isn't a valid Unicode character, put in
|
||||||
|
* a REPLACEMENT CHARACTER.
|
||||||
|
*/
|
||||||
|
if (!g_unichar_validate(uc))
|
||||||
|
uc = UNREPL;
|
||||||
|
} else {
|
||||||
|
/* 0xfe or 0xff; put it a REPLACEMENT CHARACTER */
|
||||||
|
uc = UNREPL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* OK, is it a printable Unicode character?
|
||||||
|
*/
|
||||||
|
if (g_unichar_isprint(uc)) {
|
||||||
|
/*
|
||||||
|
* Yes - put it into the string as UTF-8.
|
||||||
|
* This means that if it was an overlong
|
||||||
|
* encoding, this will put out the right
|
||||||
|
* sized encoding.
|
||||||
|
*/
|
||||||
|
if (uc < 0x80) {
|
||||||
|
first = 0;
|
||||||
|
utf8_len = 1;
|
||||||
|
} else if (uc < 0x800) {
|
||||||
|
first = 0xc0;
|
||||||
|
utf8_len = 2;
|
||||||
|
} else if (uc < 0x10000) {
|
||||||
|
first = 0xe0;
|
||||||
|
utf8_len = 3;
|
||||||
|
} else if (uc < 0x200000) {
|
||||||
|
first = 0xf0;
|
||||||
|
utf8_len = 4;
|
||||||
|
} else if (uc < 0x4000000) {
|
||||||
|
/*
|
||||||
|
* This should never happen, as Unicode doesn't
|
||||||
|
* go that high.
|
||||||
|
*/
|
||||||
|
first = 0xf8;
|
||||||
|
utf8_len = 5;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* This should never happen, as Unicode doesn't
|
||||||
|
* go that high.
|
||||||
|
*/
|
||||||
|
first = 0xfc;
|
||||||
|
utf8_len = 6;
|
||||||
|
}
|
||||||
|
FMTBUF_EXPAND(utf8_len);
|
||||||
|
for (int i = utf8_len - 1; i > 0; i--) {
|
||||||
|
fmtbuf[column + i] = (uc & 0x3f) | 0x80;
|
||||||
|
uc >>= 6;
|
||||||
|
}
|
||||||
|
fmtbuf[column] = uc | first;
|
||||||
|
column += utf8_len;
|
||||||
|
} else if (c < 128) {
|
||||||
|
/*
|
||||||
|
* ASCII, but not printable.
|
||||||
|
* Yes, this could happen with an overlong encoding.
|
||||||
|
*
|
||||||
|
* That requires a minimum of 2 bytes, one for the
|
||||||
|
* backslash and one for a letter, so make sure we
|
||||||
|
* have enough room for that, plus a trailing '\0'.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(2);
|
||||||
|
FMTBUF_PUTCHAR('\\');
|
||||||
|
switch (c) {
|
||||||
|
|
||||||
|
case '\a':
|
||||||
|
FMTBUF_PUTCHAR('a');
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\b':
|
||||||
|
FMTBUF_PUTCHAR('b'); /* BS */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\f':
|
||||||
|
FMTBUF_PUTCHAR('f'); /* FF */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\n':
|
||||||
|
FMTBUF_PUTCHAR('n'); /* NL */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\r':
|
||||||
|
FMTBUF_PUTCHAR('r'); /* CR */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\t':
|
||||||
|
FMTBUF_PUTCHAR('t'); /* tab */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\v':
|
||||||
|
FMTBUF_PUTCHAR('v');
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
/*
|
||||||
|
* We've already put the backslash, but this
|
||||||
|
* will put 3 more characters for the octal
|
||||||
|
* number; make sure we have enough room for
|
||||||
|
* that, plus the trailing '\0'.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(3);
|
||||||
|
FMTBUF_PUTBYTE_OCTAL(c);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Unicode, but not printable, and not ASCII;
|
||||||
|
* put it out as \uxxxx or \Uxxxxxxxx.
|
||||||
|
*/
|
||||||
|
if (uc <= 0xFFFF) {
|
||||||
|
FMTBUF_EXPAND(6);
|
||||||
|
FMTBUF_PUTCHAR('\\');
|
||||||
|
FMTBUF_PUTCHAR('u');
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
|
||||||
|
} else {
|
||||||
|
FMTBUF_EXPAND(10);
|
||||||
|
FMTBUF_PUTCHAR('\\');
|
||||||
|
FMTBUF_PUTCHAR('U');
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fmtbuf[column] = '\0';
|
|
||||||
|
FMTBUF_ENDSTR;
|
||||||
return fmtbuf;
|
return fmtbuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Given a string, generate a string from it that shows non-printable
|
* Given a string, generate a string from it that shows non-printable
|
||||||
* characters as C-style escapes except a whitespace character
|
* characters as C-style escapes except a whitespace character
|
||||||
|
@ -231,93 +495,296 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
|
||||||
gchar *
|
gchar *
|
||||||
format_text_wsp(wmem_allocator_t* allocator, const guchar *string, size_t len)
|
format_text_wsp(wmem_allocator_t* allocator, const guchar *string, size_t len)
|
||||||
{
|
{
|
||||||
gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE);
|
FMTBUF_VARS;
|
||||||
int fmtbuf_len = INITIAL_FMTBUF_SIZE;
|
|
||||||
int column;
|
|
||||||
const guchar *stringend = string + len;
|
const guchar *stringend = string + len;
|
||||||
guchar c;
|
guchar c;
|
||||||
int i;
|
|
||||||
|
|
||||||
column = 0;
|
|
||||||
while (string < stringend) {
|
while (string < stringend) {
|
||||||
/*
|
/*
|
||||||
* Is there enough room for this character, if it expands to
|
* Get the first byte of this character.
|
||||||
* a backslash plus 3 octal digits (which is the most it can
|
|
||||||
* expand to), and also enough room for a terminating '\0'?
|
|
||||||
*/
|
*/
|
||||||
if (column+3+1 >= fmtbuf_len) {
|
|
||||||
/*
|
|
||||||
* Double the buffer's size if it's not big enough.
|
|
||||||
* The size of the buffer starts at 128, so doubling its size
|
|
||||||
* adds at least another 128 bytes, which is more than enough
|
|
||||||
* for one more character plus a terminating '\0'.
|
|
||||||
*/
|
|
||||||
fmtbuf_len *= 2;
|
|
||||||
fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len);
|
|
||||||
}
|
|
||||||
c = *string++;
|
c = *string++;
|
||||||
|
|
||||||
if (g_ascii_isprint(c)) {
|
if (g_ascii_isprint(c)) {
|
||||||
fmtbuf[column] = c;
|
/*
|
||||||
column++;
|
* Printable ASCII, so not part of a multi-byte UTF-8 sequence.
|
||||||
|
* Make sure there's enough room for one more byte, and add
|
||||||
|
* the character.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(1);
|
||||||
|
FMTBUF_PUTCHAR(c);
|
||||||
} else if (g_ascii_isspace(c)) {
|
} else if (g_ascii_isspace(c)) {
|
||||||
fmtbuf[column] = ' ';
|
/*
|
||||||
column++;
|
* ASCII, so not part of a multi-byte UTF-8 sequence, but
|
||||||
} else {
|
* not printable, but is a space character; show it as a
|
||||||
fmtbuf[column] = '\\';
|
* blank.
|
||||||
column++;
|
*
|
||||||
|
* Make sure there's enough room for one more byte, and add
|
||||||
|
* the blank.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(1);
|
||||||
|
FMTBUF_PUTCHAR(' ');
|
||||||
|
} else if (c < 128) {
|
||||||
|
/*
|
||||||
|
* ASCII, so not part of a multi-byte UTF-8 sequence, but not
|
||||||
|
* printable.
|
||||||
|
*
|
||||||
|
* That requires a minimum of 2 bytes, one for the backslash
|
||||||
|
* and one for a letter, so make sure we have enough room
|
||||||
|
* for that, plus a trailing '\0'.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(2);
|
||||||
|
FMTBUF_PUTCHAR('\\');
|
||||||
switch (c) {
|
switch (c) {
|
||||||
|
|
||||||
case '\a':
|
case '\a':
|
||||||
fmtbuf[column] = 'a';
|
FMTBUF_PUTCHAR('a');
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\b':
|
case '\b':
|
||||||
fmtbuf[column] = 'b'; /* BS */
|
FMTBUF_PUTCHAR('b'); /* BS */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\f':
|
case '\f':
|
||||||
fmtbuf[column] = 'f'; /* FF */
|
FMTBUF_PUTCHAR('f'); /* FF */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\n':
|
case '\n':
|
||||||
fmtbuf[column] = 'n'; /* NL */
|
FMTBUF_PUTCHAR('n'); /* NL */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\r':
|
case '\r':
|
||||||
fmtbuf[column] = 'r'; /* CR */
|
FMTBUF_PUTCHAR('r'); /* CR */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\t':
|
case '\t':
|
||||||
fmtbuf[column] = 't'; /* tab */
|
FMTBUF_PUTCHAR('t'); /* tab */
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case '\v':
|
case '\v':
|
||||||
fmtbuf[column] = 'v';
|
FMTBUF_PUTCHAR('v');
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
i = (c>>6)&03;
|
/*
|
||||||
fmtbuf[column] = i + '0';
|
* We've already put the backslash, but this
|
||||||
column++;
|
* will put 3 more characters for the octal
|
||||||
i = (c>>3)&07;
|
* number; make sure we have enough room for
|
||||||
fmtbuf[column] = i + '0';
|
* that, plus the trailing '\0'.
|
||||||
column++;
|
*/
|
||||||
i = (c>>0)&07;
|
FMTBUF_EXPAND(3);
|
||||||
fmtbuf[column] = i + '0';
|
FMTBUF_PUTBYTE_OCTAL(c);
|
||||||
column++;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* We've fetched the first byte of a multi-byte UTF-8
|
||||||
|
* sequence into c.
|
||||||
|
*/
|
||||||
|
int utf8_len;
|
||||||
|
guchar mask;
|
||||||
|
gunichar uc;
|
||||||
|
guchar first;
|
||||||
|
|
||||||
|
if ((c & 0xe8) == 0xc0) {
|
||||||
|
/* Starts a 2-byte UTF-8 sequence; 1 byte left */
|
||||||
|
utf8_len = 1;
|
||||||
|
mask = 0x1f;
|
||||||
|
} else if ((c & 0xf0) == 0xe0) {
|
||||||
|
/* Starts a 3-byte UTF-8 sequence; 2 bytes left */
|
||||||
|
utf8_len = 2;
|
||||||
|
mask = 0x0f;
|
||||||
|
} else if ((c & 0xf8) == 0xf0) {
|
||||||
|
/* Starts a 4-byte UTF-8 sequence; 3 bytes left */
|
||||||
|
utf8_len = 3;
|
||||||
|
mask = 0x07;
|
||||||
|
} else if ((c & 0xfc) == 0xf8) {
|
||||||
|
/* Starts an old-style 5-byte UTF-8 sequence; 4 bytes left */
|
||||||
|
utf8_len = 4;
|
||||||
|
mask = 0x03;
|
||||||
|
} else if ((c & 0xfe) == 0xfc) {
|
||||||
|
/* Starts an old-style 6-byte UTF-8 sequence; 5 bytes left */
|
||||||
|
utf8_len = 5;
|
||||||
|
mask = 0x01;
|
||||||
|
} else {
|
||||||
|
/* 0xfe or 0xff - not valid */
|
||||||
|
utf8_len = -1;
|
||||||
|
}
|
||||||
|
if (utf8_len > 0) {
|
||||||
|
/* Try to construct the Unicode character */
|
||||||
|
uc = c & mask;
|
||||||
|
for (int i = 0; i < utf8_len; i++) {
|
||||||
|
if (string >= stringend) {
|
||||||
|
/*
|
||||||
|
* Ran out of octets, so the character is
|
||||||
|
* incomplete. Put in a REPLACEMENT CHARACTER
|
||||||
|
* instead, and then continue the loop, which
|
||||||
|
* will terminate.
|
||||||
|
*/
|
||||||
|
uc = UNREPL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
c = *string;
|
||||||
|
if ((c & 0xc0) != 0x80) {
|
||||||
|
/*
|
||||||
|
* Not valid UTF-8 continuation character; put in
|
||||||
|
* a replacement character, and then re-process
|
||||||
|
* this octet as the beginning of a new character.
|
||||||
|
*/
|
||||||
|
uc = UNREPL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
string++;
|
||||||
|
uc = (uc << 6) | (c & 0x3f);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If this isn't a valid Unicode character, put in
|
||||||
|
* a REPLACEMENT CHARACTER.
|
||||||
|
*/
|
||||||
|
if (!g_unichar_validate(uc))
|
||||||
|
uc = UNREPL;
|
||||||
|
} else {
|
||||||
|
/* 0xfe or 0xff; put it a REPLACEMENT CHARACTER */
|
||||||
|
uc = UNREPL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* OK, is it a printable Unicode character?
|
||||||
|
*/
|
||||||
|
if (g_unichar_isprint(uc)) {
|
||||||
|
/*
|
||||||
|
* Yes - put it into the string as UTF-8.
|
||||||
|
* This means that if it was an overlong
|
||||||
|
* encoding, this will put out the right
|
||||||
|
* sized encoding.
|
||||||
|
*/
|
||||||
|
if (uc < 0x80) {
|
||||||
|
first = 0;
|
||||||
|
utf8_len = 1;
|
||||||
|
} else if (uc < 0x800) {
|
||||||
|
first = 0xc0;
|
||||||
|
utf8_len = 2;
|
||||||
|
} else if (uc < 0x10000) {
|
||||||
|
first = 0xe0;
|
||||||
|
utf8_len = 3;
|
||||||
|
} else if (uc < 0x200000) {
|
||||||
|
first = 0xf0;
|
||||||
|
utf8_len = 4;
|
||||||
|
} else if (uc < 0x4000000) {
|
||||||
|
/*
|
||||||
|
* This should never happen, as Unicode doesn't
|
||||||
|
* go that high.
|
||||||
|
*/
|
||||||
|
first = 0xf8;
|
||||||
|
utf8_len = 5;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* This should never happen, as Unicode doesn't
|
||||||
|
* go that high.
|
||||||
|
*/
|
||||||
|
first = 0xfc;
|
||||||
|
utf8_len = 6;
|
||||||
|
}
|
||||||
|
FMTBUF_EXPAND(utf8_len);
|
||||||
|
for (int i = utf8_len - 1; i > 0; i--) {
|
||||||
|
fmtbuf[column + i] = (uc & 0x3f) | 0x80;
|
||||||
|
uc >>= 6;
|
||||||
|
}
|
||||||
|
fmtbuf[column] = uc | first;
|
||||||
|
column += utf8_len;
|
||||||
|
} else if (g_unichar_isspace(uc)) {
|
||||||
|
/*
|
||||||
|
* Not printable, but is a space character; show it
|
||||||
|
* as a blank.
|
||||||
|
*
|
||||||
|
* Make sure there's enough room for one more byte,
|
||||||
|
* and add the blank.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(1);
|
||||||
|
FMTBUF_PUTCHAR(' ');
|
||||||
|
} else if (c < 128) {
|
||||||
|
/*
|
||||||
|
* ASCII, but not printable.
|
||||||
|
* Yes, this could happen with an overlong encoding.
|
||||||
|
*
|
||||||
|
* That requires a minimum of 2 bytes, one for the
|
||||||
|
* backslash and one for a letter, so make sure we
|
||||||
|
* have enough room for that, plus a trailing '\0'.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(2);
|
||||||
|
FMTBUF_PUTCHAR('\\');
|
||||||
|
switch (c) {
|
||||||
|
|
||||||
|
case '\a':
|
||||||
|
FMTBUF_PUTCHAR('a');
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\b':
|
||||||
|
FMTBUF_PUTCHAR('b'); /* BS */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\f':
|
||||||
|
FMTBUF_PUTCHAR('f'); /* FF */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\n':
|
||||||
|
FMTBUF_PUTCHAR('n'); /* NL */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\r':
|
||||||
|
FMTBUF_PUTCHAR('r'); /* CR */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\t':
|
||||||
|
FMTBUF_PUTCHAR('t'); /* tab */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case '\v':
|
||||||
|
FMTBUF_PUTCHAR('v');
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
/*
|
||||||
|
* We've already put the backslash, but this
|
||||||
|
* will put 3 more characters for the octal
|
||||||
|
* number; make sure we have enough room for
|
||||||
|
* that, plus the trailing '\0'.
|
||||||
|
*/
|
||||||
|
FMTBUF_EXPAND(3);
|
||||||
|
FMTBUF_PUTBYTE_OCTAL(c);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Unicode, but not printable, and not ASCII;
|
||||||
|
* put it out as \uxxxx or \Uxxxxxxxx.
|
||||||
|
*/
|
||||||
|
if (uc <= 0xFFFF) {
|
||||||
|
FMTBUF_EXPAND(6);
|
||||||
|
FMTBUF_PUTCHAR('\\');
|
||||||
|
FMTBUF_PUTCHAR('u');
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
|
||||||
|
} else {
|
||||||
|
FMTBUF_EXPAND(10);
|
||||||
|
FMTBUF_PUTCHAR('\\');
|
||||||
|
FMTBUF_PUTCHAR('U');
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 28) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 24) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 20) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 16) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 12) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 8) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 4) & 0xF]);
|
||||||
|
FMTBUF_PUTCHAR(hex[(uc >> 0) & 0xF]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fmtbuf[column] = '\0';
|
|
||||||
|
FMTBUF_ENDSTR;
|
||||||
return fmtbuf;
|
return fmtbuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -326,53 +793,37 @@ format_text_wsp(wmem_allocator_t* allocator, const guchar *string, size_t len)
|
||||||
* characters as the chr parameter passed, except a whitespace character
|
* characters as the chr parameter passed, except a whitespace character
|
||||||
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
* (space, tab, carriage return, new line, vertical tab, or formfeed)
|
||||||
* which will be replaced by a space, and return a pointer to it.
|
* which will be replaced by a space, and return a pointer to it.
|
||||||
|
*
|
||||||
|
* This does *not* treat the input string as UTF-8.
|
||||||
|
*
|
||||||
|
* XXX - is there any reason to use this?
|
||||||
*/
|
*/
|
||||||
gchar *
|
gchar *
|
||||||
format_text_chr(wmem_allocator_t* allocator, const guchar *string, const size_t len, const guchar chr)
|
format_text_chr(wmem_allocator_t* allocator, const guchar *string, const size_t len, const guchar chr)
|
||||||
{
|
{
|
||||||
gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE);
|
FMTBUF_VARS;
|
||||||
int fmtbuf_len = INITIAL_FMTBUF_SIZE;
|
|
||||||
int column;
|
|
||||||
const guchar *stringend = string + len;
|
const guchar *stringend = string + len;
|
||||||
guchar c;
|
guchar c;
|
||||||
|
|
||||||
column = 0;
|
|
||||||
while (string < stringend)
|
while (string < stringend)
|
||||||
{
|
{
|
||||||
/*
|
FMTBUF_EXPAND(1);
|
||||||
* Is there enough room for this character,
|
|
||||||
* and also enough room for a terminating '\0'?
|
|
||||||
*/
|
|
||||||
if (column+1 >= fmtbuf_len)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Double the buffer's size if it's not big enough.
|
|
||||||
* The size of the buffer starts at 128, so doubling its size
|
|
||||||
* adds at least another 128 bytes, which is more than enough
|
|
||||||
* for one more character plus a terminating '\0'.
|
|
||||||
*/
|
|
||||||
fmtbuf_len *= 2;
|
|
||||||
fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len);
|
|
||||||
}
|
|
||||||
c = *string++;
|
c = *string++;
|
||||||
|
|
||||||
if (g_ascii_isprint(c))
|
if (g_ascii_isprint(c))
|
||||||
{
|
{
|
||||||
fmtbuf[column] = c;
|
FMTBUF_PUTCHAR(c);
|
||||||
column++;
|
|
||||||
}
|
}
|
||||||
else if (g_ascii_isspace(c))
|
else if (g_ascii_isspace(c))
|
||||||
{
|
{
|
||||||
fmtbuf[column] = ' ';
|
FMTBUF_PUTCHAR(' ');
|
||||||
column++;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fmtbuf[column] = chr;
|
FMTBUF_PUTCHAR(chr);
|
||||||
column++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fmtbuf[column] = '\0';
|
FMTBUF_ENDSTR;
|
||||||
return fmtbuf;
|
return fmtbuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -680,12 +1131,11 @@ uri_str_to_bytes(const char *uri_str, GByteArray *bytes)
|
||||||
gchar *
|
gchar *
|
||||||
format_uri(wmem_allocator_t* allocator, const GByteArray *bytes, const gchar *reserved_chars)
|
format_uri(wmem_allocator_t* allocator, const GByteArray *bytes, const gchar *reserved_chars)
|
||||||
{
|
{
|
||||||
gchar *fmtbuf = (gchar*)wmem_alloc(allocator, INITIAL_FMTBUF_SIZE);
|
FMTBUF_VARS;
|
||||||
guint fmtbuf_len = INITIAL_FMTBUF_SIZE;
|
static const guchar reserved_def[] = ":/?#[]@!$&'()*+,;= ";
|
||||||
static const guchar *reserved_def = ":/?#[]@!$&'()*+,;= ";
|
|
||||||
const guchar *reserved = reserved_def;
|
const guchar *reserved = reserved_def;
|
||||||
guint8 c;
|
guint8 c;
|
||||||
guint byte_index, column, i;
|
guint byte_index, i;
|
||||||
gboolean is_reserved = FALSE;
|
gboolean is_reserved = FALSE;
|
||||||
|
|
||||||
if (! bytes)
|
if (! bytes)
|
||||||
|
@ -694,23 +1144,13 @@ format_uri(wmem_allocator_t* allocator, const GByteArray *bytes, const gchar *re
|
||||||
if (reserved_chars)
|
if (reserved_chars)
|
||||||
reserved = reserved_chars;
|
reserved = reserved_chars;
|
||||||
|
|
||||||
column = 0;
|
|
||||||
for (byte_index = 0; byte_index < bytes->len; byte_index++) {
|
for (byte_index = 0; byte_index < bytes->len; byte_index++) {
|
||||||
/*
|
/*
|
||||||
* Is there enough room for this character, if it expands to
|
* Make sure there is enough room for this character, if it
|
||||||
* a percent plus 2 hex digits (which is the most it can
|
* expands to a percent plus 2 hex digits (which is the most
|
||||||
* expand to), and also enough room for a terminating '\0'?
|
* it can expand to), and also enough room for a terminating '\0'.
|
||||||
*/
|
*/
|
||||||
if (column+2+1 >= fmtbuf_len) {
|
FMTBUF_EXPAND(2);
|
||||||
/*
|
|
||||||
* Double the buffer's size if it's not big enough.
|
|
||||||
* The size of the buffer starts at 128, so doubling its size
|
|
||||||
* adds at least another 128 bytes, which is more than enough
|
|
||||||
* for one more character plus a terminating '\0'.
|
|
||||||
*/
|
|
||||||
fmtbuf_len *= 2;
|
|
||||||
fmtbuf = (gchar *)wmem_realloc(allocator, fmtbuf, fmtbuf_len);
|
|
||||||
}
|
|
||||||
c = bytes->data[byte_index];
|
c = bytes->data[byte_index];
|
||||||
|
|
||||||
is_reserved = FALSE;
|
is_reserved = FALSE;
|
||||||
|
@ -724,15 +1164,11 @@ format_uri(wmem_allocator_t* allocator, const GByteArray *bytes, const gchar *re
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!is_reserved) {
|
if (!is_reserved) {
|
||||||
fmtbuf[column] = c;
|
FMTBUF_PUTCHAR(c);
|
||||||
column++;
|
|
||||||
} else {
|
} else {
|
||||||
fmtbuf[column] = '%';
|
FMTBUF_PUTCHAR('%');
|
||||||
column++;
|
FMTBUF_PUTCHAR(hex[c >> 4]);
|
||||||
fmtbuf[column] = hex[c >> 4];
|
FMTBUF_PUTCHAR(hex[c & 0xF]);
|
||||||
column++;
|
|
||||||
fmtbuf[column] = hex[c & 0xF];
|
|
||||||
column++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fmtbuf[column] = '\0';
|
fmtbuf[column] = '\0';
|
||||||
|
|
Loading…
Reference in New Issue