charsets: Don't add illegal Unicode codepoints for UTF-16, UTF-32

If a character is not a valid Unicode code point, i.e. it is one of
the code points reserved for surrogate pairs or a code point
above 0x10FFFF, don't add it to a wmem_strbuf when converting
from other encodings; add a REPLACEMENT CHARACTER instead, by
using a new wmem_strbuf_append_unichar_validated() function.

Now we produce valid UTF-8 in various situations where UCS-2 or UTF-32
can encode unpaired surrogate codepoints. Consolidate some related
checks that are now redundant.

Also add a replacement character to the end of invalid UCS-2 strings
with an odd number of bytes, as done with UTF-16 and UTF-32.

Fix #18508
This commit is contained in:
John Thacker 2022-10-18 23:18:37 -04:00
parent 5af53da434
commit 7a4d05d63a
4 changed files with 25 additions and 35 deletions

View File

@ -805,11 +805,6 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes.
*
* XXX - should map lead and trail surrogate values to REPLACEMENT
* CHARACTERs (0xFFFD)?
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
guint8 *
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -826,13 +821,16 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
}else{
uchar = pletoh16(ptr + i);
}
wmem_strbuf_append_unichar(strbuf, uchar);
wmem_strbuf_append_unichar_validated(strbuf, uchar);
}
/*
* XXX - if i < length, this means we were handed an odd
* number of bytes, so we're not a valid UCS-2 string.
* If i < length, this means we were handed an odd number of bytes;
* insert a REPLACEMENT CHARACTER to mark the error.
*/
if (i < length) {
wmem_strbuf_append_unichar_repl(strbuf);
}
return (guint8 *) wmem_strbuf_finalize(strbuf);
}
@ -846,8 +844,6 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes.
*
* XXX - should map invalid Unicode characters to REPLACEMENT CHARACTERs.
*/
guint8 *
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -936,9 +932,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Specify length in bytes
*
* XXX - should map lead and trail surrogate values to a "substitute"
* UTF-8 character?
*/
guint8 *
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -955,12 +948,7 @@ get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
else
uchar = pletoh32(ptr + i);
if (uchar > 0x10FFFF) {
/* Code points above 0x10FFFF are not legal */
wmem_strbuf_append_unichar(strbuf, UNREPL);
} else {
wmem_strbuf_append_unichar(strbuf, uchar);
}
wmem_strbuf_append_unichar_validated(strbuf, uchar);
}
/*
@ -1281,13 +1269,11 @@ get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
/*
* XXX - if saw_escape is true, this is bogus.
*
* XXX - should map lead and trail surrogate values to
* REPLACEMENT CHARACTERs (0xFFFD)?
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
uchar = ucs2_base + (byte & 0x7f);
wmem_strbuf_append_unichar(strbuf, uchar);
wmem_strbuf_append_unichar_validated(strbuf, uchar);
}
}

View File

@ -130,11 +130,6 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes.
*
* XXX - should map lead and trail surrogate values to REPLACEMENT
* CHARACTERs (0xFFFD)?
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
WS_DLL_PUBLIC guint8 *
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
@ -149,11 +144,6 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes.
*
* XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
WS_DLL_PUBLIC guint8 *
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
@ -166,9 +156,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Specify length in bytes
*
* XXX - should map lead and trail surrogate values to a "substitute"
* UTF-8 character?
*/
WS_DLL_PUBLIC guint8 *
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);

View File

@ -252,6 +252,16 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
}
}
/*
 * Append a single code point to strbuf through wmem_strbuf_append_unichar().
 * The value is passed through unchanged when g_unichar_validate() accepts
 * it; otherwise UNICODE_REPLACEMENT_CHARACTER is appended in its place.
 */
void
wmem_strbuf_append_unichar_validated(wmem_strbuf_t *strbuf, const gunichar c)
{
    /* Pick what to emit first so there is a single append call below. */
    const gunichar to_append =
        g_unichar_validate(c) ? c : UNICODE_REPLACEMENT_CHARACTER;

    wmem_strbuf_append_unichar(strbuf, to_append);
}
/* Uppercase hexadecimal digit characters: hex[v] is the digit for value v (0-15). */
static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

View File

@ -107,6 +107,13 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c);
/* Appends UNICODE_REPLACEMENT_CHARACTER to buf by way of
 * wmem_strbuf_append_unichar().
 */
#define wmem_strbuf_append_unichar_repl(buf) \
wmem_strbuf_append_unichar(buf, UNICODE_REPLACEMENT_CHARACTER)
/* Like wmem_strbuf_append_unichar(), but appends a REPLACEMENT CHARACTER
* in place of any invalid Unicode code point.
*/
WS_DLL_PUBLIC
void
wmem_strbuf_append_unichar_validated(wmem_strbuf_t *strbuf, const gunichar c);
WS_DLL_PUBLIC
void
wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t);