epan: Fix format_text treatment of Greek, Arabic, etc.
format_text uses the wrong bitmask when checking for two-byte UTF-8 characters, resulting in rejecting half of the possible two-byte characters, including all of Arabic and Greek, and substituting REPLACEMENT CHARACTER for them. Fixes #17070, and adds some comments about the current behavior that doesn't match the existing comments.
This commit is contained in:
parent
a61b6d277b
commit
770746cca8
|
@ -208,7 +208,14 @@ get_token_len(const guchar *linep, const guchar *lineend,
|
||||||
* their universal character names;
|
* their universal character names;
|
||||||
*
|
*
|
||||||
* shows illegal UTF-8 sequences as a sequence of bytes represented
|
* shows illegal UTF-8 sequences as a sequence of bytes represented
|
||||||
* as C-style hex escapes;
|
* as C-style hex escapes (XXX: Does not actually do this. Some illegal
|
||||||
|
* sequences, such as overlong encodings, the sequences reserved for
|
||||||
|
* UTF-16 surrogate halves (paired or unpaired), and values outside
|
||||||
|
* Unicode (i.e., the old sequences for code points above U+10FFFF)
|
||||||
|
* will be decoded in a permissive way. Other illegal sequences,
|
||||||
|
* such as 0xFE and 0xFF and the presence of a continuation byte where
|
||||||
|
* not expected (or vice versa its absence), are replaced with
|
||||||
|
* REPLACEMENT CHARACTER.)
|
||||||
*
|
*
|
||||||
* and return a pointer to it.
|
* and return a pointer to it.
|
||||||
*/
|
*/
|
||||||
|
@ -294,7 +301,7 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
|
||||||
gunichar uc;
|
gunichar uc;
|
||||||
guchar first;
|
guchar first;
|
||||||
|
|
||||||
if ((c & 0xe8) == 0xc0) {
|
if ((c & 0xe0) == 0xc0) {
|
||||||
/* Starts a 2-byte UTF-8 sequence; 1 byte left */
|
/* Starts a 2-byte UTF-8 sequence; 1 byte left */
|
||||||
utf8_len = 1;
|
utf8_len = 1;
|
||||||
mask = 0x1f;
|
mask = 0x1f;
|
||||||
|
@ -315,7 +322,7 @@ format_text(wmem_allocator_t* allocator, const guchar *string, size_t len)
|
||||||
utf8_len = 5;
|
utf8_len = 5;
|
||||||
mask = 0x01;
|
mask = 0x01;
|
||||||
} else {
|
} else {
|
||||||
/* 0xfe or 0xff - not valid */
|
/* 0xfe or 0xff or a continuation byte - not valid */
|
||||||
utf8_len = -1;
|
utf8_len = -1;
|
||||||
}
|
}
|
||||||
if (utf8_len > 0) {
|
if (utf8_len > 0) {
|
||||||
|
|
Loading…
Reference in New Issue