charsets: Don't add illegal Unicode codepoints for UTF-16, UTF-32

If a character is not a valid Unicode code point, i.e. it is one of
the code points reserved for surrogate pairs or a code point
above 0x10FFFF, don't add it to a wmem_strbuf when converting
from other encodings; add a REPLACEMENT CHARACTER instead, using
the new wmem_strbuf_append_unichar_validated() function.

Now we produce valid UTF-8 in various situations where UCS-2 or UTF-32
can encode unpaired surrogate codepoints. Consolidate some related
checks that are now redundant.

Also add a replacement character to the end of invalid UCS-2 strings
with an odd number of bytes, as done with UTF-16 and UTF-32.

Fix #18508
This commit is contained in:
John Thacker 2022-10-18 23:18:37 -04:00
parent 5af53da434
commit 7a4d05d63a
4 changed files with 25 additions and 35 deletions

View File

@ -805,11 +805,6 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
* *
* Specify length in bytes. * Specify length in bytes.
*
* XXX - should map lead and trail surrogate values to REPLACEMENT
* CHARACTERs (0xFFFD)?
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/ */
guint8 * guint8 *
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding) get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -826,13 +821,16 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
}else{ }else{
uchar = pletoh16(ptr + i); uchar = pletoh16(ptr + i);
} }
wmem_strbuf_append_unichar(strbuf, uchar); wmem_strbuf_append_unichar_validated(strbuf, uchar);
} }
/* /*
* XXX - if i < length, this means we were handed an odd * If i < length, this means we were handed an odd number of bytes;
* number of bytes, so we're not a valid UCS-2 string. * insert a REPLACEMENT CHARACTER to mark the error.
*/ */
if (i < length) {
wmem_strbuf_append_unichar_repl(strbuf);
}
return (guint8 *) wmem_strbuf_finalize(strbuf); return (guint8 *) wmem_strbuf_finalize(strbuf);
} }
@ -846,8 +844,6 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
* *
* Specify length in bytes. * Specify length in bytes.
*
* XXX - should map invalid Unicode characters to REPLACEMENT CHARACTERs.
*/ */
guint8 * guint8 *
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding) get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -936,9 +932,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* *
* Specify length in bytes * Specify length in bytes
*
* XXX - should map lead and trail surrogate values to a "substitute"
* UTF-8 character?
*/ */
guint8 * guint8 *
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding) get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -955,12 +948,7 @@ get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
else else
uchar = pletoh32(ptr + i); uchar = pletoh32(ptr + i);
if (uchar > 0x10FFFF) { wmem_strbuf_append_unichar_validated(strbuf, uchar);
/* Code points above 0x10FFFF are not legal */
wmem_strbuf_append_unichar(strbuf, UNREPL);
} else {
wmem_strbuf_append_unichar(strbuf, uchar);
}
} }
/* /*
@ -1281,13 +1269,11 @@ get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
/* /*
* XXX - if saw_escape is true, this is bogus. * XXX - if saw_escape is true, this is bogus.
* *
* XXX - should map lead and trail surrogate values to
* REPLACEMENT CHARACTERs (0xFFFD)?
* XXX - if there are an odd number of bytes, should put a * XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end. * REPLACEMENT CHARACTER at the end.
*/ */
uchar = ucs2_base + (byte & 0x7f); uchar = ucs2_base + (byte & 0x7f);
wmem_strbuf_append_unichar(strbuf, uchar); wmem_strbuf_append_unichar_validated(strbuf, uchar);
} }
} }

View File

@ -130,11 +130,6 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
* *
* Specify length in bytes. * Specify length in bytes.
*
* XXX - should map lead and trail surrogate values to REPLACEMENT
* CHARACTERs (0xFFFD)?
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/ */
WS_DLL_PUBLIC guint8 * WS_DLL_PUBLIC guint8 *
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding); get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
@ -149,11 +144,6 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN. * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
* *
* Specify length in bytes. * Specify length in bytes.
*
* XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/ */
WS_DLL_PUBLIC guint8 * WS_DLL_PUBLIC guint8 *
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding); get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
@ -166,9 +156,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* *
* Specify length in bytes * Specify length in bytes
*
* XXX - should map lead and trail surrogate values to a "substitute"
* UTF-8 character?
*/ */
WS_DLL_PUBLIC guint8 * WS_DLL_PUBLIC guint8 *
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding); get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);

View File

@ -252,6 +252,16 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
} }
} }
/*
 * Append the code point c to strbuf, unless g_unichar_validate()
 * rejects it; in that case a REPLACEMENT CHARACTER is appended in
 * its place so the buffer never receives the rejected value.
 */
void
wmem_strbuf_append_unichar_validated(wmem_strbuf_t *strbuf, const gunichar c)
{
    /* Decide once which code point goes into the buffer, then append it. */
    const gunichar to_append =
        g_unichar_validate(c) ? c : UNICODE_REPLACEMENT_CHARACTER;

    wmem_strbuf_append_unichar(strbuf, to_append);
}
static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7', static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

View File

@ -107,6 +107,13 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c);
#define wmem_strbuf_append_unichar_repl(buf) \ #define wmem_strbuf_append_unichar_repl(buf) \
wmem_strbuf_append_unichar(buf, UNICODE_REPLACEMENT_CHARACTER) wmem_strbuf_append_unichar(buf, UNICODE_REPLACEMENT_CHARACTER)
/* As wmem_strbuf_append_unichar(), but checks the code point first:
 * if c is not a valid Unicode code point (g_unichar_validate() fails),
 * a REPLACEMENT CHARACTER is appended in its place; otherwise c itself
 * is appended.
 */
WS_DLL_PUBLIC
void
wmem_strbuf_append_unichar_validated(wmem_strbuf_t *strbuf, const gunichar c);
WS_DLL_PUBLIC WS_DLL_PUBLIC
void void
wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t); wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t);