charsets: Don't add illegal Unicode codepoints for UTF-16, UTF-32

If a character is not a valid Unicode codepoint, i.e. one of the code points reserved for surrogate pairs or a code point above 0x10FFFF, don't add it to a wmem_strbuf when converting from other encodings but add a replacement character instead, by using a new wmem_strbuf_append_unichar_validated() function. Now we produce valid UTF-8 in various situations where UCS-2 or UTF-32 can encode unpaired surrogate codepoints. Consolidate some related checks that are now redundant. Also add a replacement character to the end of invalid UCS-2 strings with an odd number of bytes, as done with UTF-16 and UTF-32. Fix #18508
2022-10-18 23:18:37 -04:00 · 2022-10-18 23:18:37 -04:00 · 7a4d05d63a
parent 5af53da434
commit 7a4d05d63a
4 changed files with 25 additions and 35 deletions
--- a/epan/charsets.c
+++ b/epan/charsets.c
@ -805,11 +805,6 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
 *
 * Specify length in bytes.
- *
- * XXX - should map lead and trail surrogate values to REPLACEMENT
- * CHARACTERs (0xFFFD)?
- * XXX - if there are an odd number of bytes, should put a
- * REPLACEMENT CHARACTER at the end.
 */
 guint8 *
 get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -826,13 +821,16 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
        }else{
            uchar = pletoh16(ptr + i);
        }
-        wmem_strbuf_append_unichar(strbuf, uchar);
+        wmem_strbuf_append_unichar_validated(strbuf, uchar);
    }

    /*
-     * XXX - if i < length, this means we were handed an odd
-     * number of bytes, so we're not a valid UCS-2 string.
+     * If i < length, this means we were handed an odd number of bytes;
+     * insert a REPLACEMENT CHARACTER to mark the error.
     */
+    if (i < length) {
+        wmem_strbuf_append_unichar_repl(strbuf);
+    }
    return (guint8 *) wmem_strbuf_finalize(strbuf);
 }

@ -846,8 +844,6 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
 *
 * Specify length in bytes.
- *
- * XXX - should map invalid Unicode characters to REPLACEMENT CHARACTERs.
 */
 guint8 *
 get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -936,9 +932,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
 *
 * Specify length in bytes
- *
- * XXX - should map lead and trail surrogate values to a "substitute"
- * UTF-8 character?
 */
 guint8 *
 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -955,12 +948,7 @@ get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
        else
            uchar = pletoh32(ptr + i);

-        if (uchar > 0x10FFFF) {
-            /* Code points above 0x10FFFF are not legal */
-            wmem_strbuf_append_unichar(strbuf, UNREPL);
-        } else {
-            wmem_strbuf_append_unichar(strbuf, uchar);
-        }
+        wmem_strbuf_append_unichar_validated(strbuf, uchar);
    }

    /*
@ -1281,13 +1269,11 @@ get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
            /*
             * XXX - if saw_escape is true, this is bogus.
             *
-             * XXX - should map lead and trail surrogate values to
-             * REPLACEMENT CHARACTERs (0xFFFD)?
             * XXX - if there are an odd number of bytes, should put a
             * REPLACEMENT CHARACTER at the end.
             */
            uchar = ucs2_base + (byte & 0x7f);
-            wmem_strbuf_append_unichar(strbuf, uchar);
+            wmem_strbuf_append_unichar_validated(strbuf, uchar);
        }
    }

--- a/epan/charsets.h
+++ b/epan/charsets.h
@ -130,11 +130,6 @@ get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, con
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
 *
 * Specify length in bytes.
- *
- * XXX - should map lead and trail surrogate values to REPLACEMENT
- * CHARACTERs (0xFFFD)?
- * XXX - if there are an odd number of bytes, should put a
- * REPLACEMENT CHARACTER at the end.
 */
 WS_DLL_PUBLIC guint8 *
 get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
@ -149,11 +144,6 @@ get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
 *
 * Specify length in bytes.
- *
- * XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
- * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
- * XXX - if there are an odd number of bytes, should put a
- * REPLACEMENT CHARACTER at the end.
 */
 WS_DLL_PUBLIC guint8 *
 get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
@ -166,9 +156,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
 *
 * Specify length in bytes
- *
- * XXX - should map lead and trail surrogate values to a "substitute"
- * UTF-8 character?
 */
 WS_DLL_PUBLIC guint8 *
 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
--- a/wsutil/wmem/wmem_strbuf.c
+++ b/wsutil/wmem/wmem_strbuf.c
@ -252,6 +252,16 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c)
    }
 }

+void
+wmem_strbuf_append_unichar_validated(wmem_strbuf_t *strbuf, const gunichar c)
+{
+    /* Anything g_unichar_validate() rejects is swapped for U+FFFD. */
+    const gunichar out = g_unichar_validate(c)
+                             ? c
+                             : UNICODE_REPLACEMENT_CHARACTER;
+    wmem_strbuf_append_unichar(strbuf, out);
+}
+
 static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
                              '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

--- a/wsutil/wmem/wmem_strbuf.h
+++ b/wsutil/wmem/wmem_strbuf.h
@ -107,6 +107,13 @@ wmem_strbuf_append_unichar(wmem_strbuf_t *strbuf, const gunichar c);
 #define wmem_strbuf_append_unichar_repl(buf) \
            wmem_strbuf_append_unichar(buf, UNICODE_REPLACEMENT_CHARACTER)

+/* Like wmem_strbuf_append_unichar(), but a codepoint that is not valid
+ * Unicode is replaced by a single REPLACEMENT CHARACTER (U+FFFD).
+ */
+WS_DLL_PUBLIC
+void
+wmem_strbuf_append_unichar_validated(wmem_strbuf_t *strbuf, const gunichar c);
+
 WS_DLL_PUBLIC
 void
 wmem_strbuf_append_hex(wmem_strbuf_t *strbuf, uint8_t);