charsets: UCS-4 code points above 0x10FFFF are not legal

When decoding UCS-4/UTF-32, map Unicode code points above 0x10FFFF to REPLACEMENT CHARACTER, as they are not legal and would create invalid UTF-8. Also, if the number of bytes given is not a multiple of 4, insert a REPLACEMENT CHARACTER at the end for the leftover bytes. This resolves two long-standing TODOs. Fixes #18435.
2022-10-11 20:40:09 -04:00 · 2022-10-11 20:40:09 -04:00 · 5bc8cac5cc
parent 78ce2f2907
commit 5bc8cac5cc
2 changed files with 12 additions and 10 deletions
--- a/epan/charsets.c
+++ b/epan/charsets.c
@ -935,9 +935,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 *
 * XXX - should map lead and trail surrogate values to a "substitute"
 * UTF-8 character?
- * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
- * XXX - if the number of bytes isn't a multiple of 4, should put a
- * REPLACEMENT CHARACTER at the end.
 */
 guint8 *
 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
@ -954,14 +951,22 @@ get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
        else
            uchar = pletoh32(ptr + i);

-        wmem_strbuf_append_unichar(strbuf, uchar);
+        if (uchar > 0x10FFFF) {
+            /* Code points above 0x10FFFF are not legal */
+            wmem_strbuf_append_unichar(strbuf, UNREPL);
+        } else {
+            wmem_strbuf_append_unichar(strbuf, uchar);
+        }
    }

    /*
-     * XXX - if i < length, this means we were handed a number
-     * of bytes that's not a multiple of 4, so we're not a valid
-     * UCS-4 string.
+     * if i < length, this means we were handed a number of bytes
+     * that's not a multiple of 4, so not a valid UCS-4 string.
+     * Insert a REPLACEMENT CHARACTER for the remaining bytes.
     */
+    if (i < length) {
+        wmem_strbuf_append_unichar(strbuf, UNREPL);
+    }
    return (guint8 *)wmem_strbuf_finalize(strbuf);
 }

--- a/epan/charsets.h
+++ b/epan/charsets.h
@ -169,9 +169,6 @@ get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const
 *
 * XXX - should map lead and trail surrogate values to a "substitute"
 * UTF-8 character?
- * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
- * XXX - if the number of bytes isn't a multiple of 4, should put a
- * REPLACEMENT CHARACTER at the end.
 */
 WS_DLL_PUBLIC guint8 *
 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);