charsets: Don't add illegal Unicode codepoints for UTF-16, UTF-32
If a character is not a valid Unicode codepoint, i.e. one of the code points reserved for surrogate pairs or a code point above 0x10FFFF, don't add it to a wmem_strbuf when converting from other encodings but add a replacement character instead, by using a new wmem_strbuf_append_unichar_validated() function. Now we produce valid UTF-8 in various situations where UCS-2 or UTF-32 can encode unpaired surrogate codepoints. Consolidate some related checks that are now redundant. Also add a replacement character to the end of invalid UCS-2 strings with an odd number of bytes, as done with UTF-16 and UTF-32. Fix #18508
This commit is contained in:
parent
5af53da434
commit
7a4d05d63a
|
@ -805,11 +805,6 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
|
|||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
||||
*
|
||||
* Specify length in bytes.
|
||||
*
|
||||
* XXX - should map lead and trail surrogate values to REPLACEMENT
|
||||
* CHARACTERs (0xFFFD)?
|
||||
* XXX - if there are an odd number of bytes, should put a
|
||||
* REPLACEMENT CHARACTER at the end.
|
||||
*/
|
||||
guint8 *
|
||||
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
|
||||
|
@ -826,13 +821,16 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
}else{
|
||||
uchar = pletoh16(ptr + i);
|
||||
}
|
||||
wmem_strbuf_append_unichar(strbuf, uchar);
|
||||
wmem_strbuf_append_unichar_validated(strbuf, uchar);
|
||||
}
|
||||
|
||||
/*
|
||||
* XXX - if i < length, this means we were handed an odd
|
||||
* number of bytes, so we're not a valid UCS-2 string.
|
||||
* If i < length, this means we were handed an odd number of bytes;
|
||||
* insert a REPLACEMENT CHARACTER to mark the error.
|
||||
*/
|
||||
if (i < length) {
|
||||
wmem_strbuf_append_unichar_repl(strbuf);
|
||||
}
|
||||
return (guint8 *) wmem_strbuf_finalize(strbuf);
|
||||
}
|
||||
|
||||
|
@ -846,8 +844,6 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
||||
*
|
||||
* Specify length in bytes.
|
||||
*
|
||||
* XXX - should map invalid Unicode characters to REPLACEMENT CHARACTERs.
|
||||
*/
|
||||
guint8 *
|
||||
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
|
||||
|
@ -936,9 +932,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
|
||||
*
|
||||
* Specify length in bytes
|
||||
*
|
||||
* XXX - should map lead and trail surrogate values to a "substitute"
|
||||
* UTF-8 character?
|
||||
*/
|
||||
guint8 *
|
||||
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
|
||||
|
@ -955,12 +948,7 @@ get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
else
|
||||
uchar = pletoh32(ptr + i);
|
||||
|
||||
if (uchar > 0x10FFFF) {
|
||||
/* Code points above 0x10FFFF are not legal */
|
||||
wmem_strbuf_append_unichar(strbuf, UNREPL);
|
||||
} else {
|
||||
wmem_strbuf_append_unichar(strbuf, uchar);
|
||||
}
|
||||
wmem_strbuf_append_unichar_validated(strbuf, uchar);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1281,13 +1269,11 @@ get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
|
|||
/*
|
||||
* XXX - if saw_escape is true, this is bogus.
|
||||
*
|
||||
* XXX - should map lead and trail surrogate values to
|
||||
* REPLACEMENT CHARACTERs (0xFFFD)?
|
||||
* XXX - if there are an odd number of bytes, should put a
|
||||
* REPLACEMENT CHARACTER at the end.
|
||||
*/
|
||||
uchar = ucs2_base + (byte & 0x7f);
|
||||
wmem_strbuf_append_unichar(strbuf, uchar);
|
||||
wmem_strbuf_append_unichar_validated(strbuf, uchar);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -130,11 +130,6 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
|
|||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
||||
*
|
||||
* Specify length in bytes.
|
||||
*
|
||||
* XXX - should map lead and trail surrogate values to REPLACEMENT
|
||||
* CHARACTERs (0xFFFD)?
|
||||
* XXX - if there are an odd number of bytes, should put a
|
||||
* REPLACEMENT CHARACTER at the end.
|
||||
*/
|
||||
WS_DLL_PUBLIC guint8 *
|
||||
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
||||
|
@ -149,11 +144,6 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
||||
*
|
||||
* Specify length in bytes.
|
||||
*
|
||||
* XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
|
||||
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
|
||||
* XXX - if there are an odd number of bytes, should put a
|
||||
* REPLACEMENT CHARACTER at the end.
|
||||
*/
|
||||
WS_DLL_PUBLIC guint8 *
|
||||
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
||||
|
@ -166,9 +156,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
|
|||
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
|
||||
*
|
||||
* Specify length in bytes
|
||||
*
|
||||
* XXX - should map lead and trail surrogate values to a "substitute"
|
||||
* UTF-8 character?
|
||||
*/
|
||||
WS_DLL_PUBLIC guint8 *
|
||||
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
||||
|
|
|
@ -252,6 +252,16 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
wmem_strbuf_append_unichar_validated(wmem_strbuf_t *strbuf, const gunichar c)
|
||||
{
|
||||
if (g_unichar_validate(c)) {
|
||||
wmem_strbuf_append_unichar(strbuf, c);
|
||||
} else {
|
||||
wmem_strbuf_append_unichar(strbuf, UNICODE_REPLACEMENT_CHARACTER);
|
||||
}
|
||||
}
|
||||
|
||||
/* Uppercase hexadecimal digit characters, ordered so hex[v] is the digit for value v (0-15). */
static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
|
||||
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
|
||||
|
||||
|
|
|
@ -107,6 +107,13 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c);
|
|||
#define wmem_strbuf_append_unichar_repl(buf) \
|
||||
wmem_strbuf_append_unichar(buf, UNICODE_REPLACEMENT_CHARACTER)
|
||||
|
||||
/* As wmem_strbuf_append_unichar but appends a REPLACEMENT CHARACTER
|
||||
* instead for any invalid Unicode codepoints.
|
||||
*/
|
||||
WS_DLL_PUBLIC
|
||||
void
|
||||
wmem_strbuf_append_unichar_validated(wmem_strbuf_t *strbuf, const gunichar c);
|
||||
|
||||
WS_DLL_PUBLIC
|
||||
void
|
||||
wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t);
|
||||
|
|
Loading…
Reference in New Issue