Use iconv to support GB 18030 and EUC-KR, allow future encodings

Add internal support for using iconv (always present with glib) to convert strings from various encodings to UTF-8 (using REPLACEMENT CHARACTER as recommended), and use that to support GB 18030 and EUC-KR. Replace the direct call to iconv in ANSI 637 for EUC-KR with the new API. Update comments and documentation around character encodings. It is possible to replace the calls to iconv with an internal decoder later. Tested on Linux and on Windows (including with illegal characters). Closes #16630.
2020-10-18 19:28:01 -04:00 · 2020-10-18 19:28:01 -04:00 · e20bd408de
parent ad69ec2e11
commit e20bd408de
8 changed files with 277 additions and 32 deletions
--- a/debian/libwireshark0.symbols
+++ b/debian/libwireshark0.symbols
@ -798,6 +798,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
 get_eth_hashtable@Base 1.12.0~rc1
 get_ether_name@Base 1.9.1
 get_etsi_ts_102_221_annex_a_string@Base 3.3.1
+ get_euc_kr_string@Base 3.3.2
 get_follow_address_func@Base 2.1.0
 get_follow_by_name@Base 2.1.0
 get_follow_conv_func@Base 2.1.0
@ -807,6 +808,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
 get_follow_tap_handler@Base 2.1.0
 get_follow_tap_string@Base 2.1.0
 get_export_pdu_tap_list@Base 1.99.0
+ get_gb18030_string@Base 3.3.2
 get_hash_ether_status@Base 1.99.3
 get_hash_ether_hexaddr@Base 1.99.3
 get_hash_ether_resolved_name@Base 1.99.3
--- a/doc/README.dissector
+++ b/doc/README.dissector
@ -1759,9 +1759,13 @@ currently supported are:
    ENC_ISO_8859_14 - ISO 8859-14
    ENC_ISO_8859_15 - ISO 8859-15
    ENC_ISO_8859_16 - ISO 8859-16
-    ENC_WINDOWS_1250 - Windows-1250
    ENC_3GPP_TS_23_038_7BITS - GSM 7 bits alphabet as described
        in 3GPP TS 23.038
+    ENC_3GPP_TS_23_038_7BITS_UNPACKED - GSM 7 bits alphabet where each
+        7 bit character occupies a distinct octet
+    ENC_ETSI_TS_102_221_ANNEX_A - Coding scheme for SIM cards with GSM 7 bit
+        alphabet, UCS-2 characters, or a mixture of the two as described
+        in ETSI TS 102 221 Annex A
    ENC_EBCDIC - EBCDIC
    ENC_EBCDIC_CP037 - EBCDIC code page 037
    ENC_MAC_ROMAN - MAC ROMAN
@ -1773,6 +1777,8 @@ currently supported are:
    ENC_BCD_DIGITS_0_9 - packed BCD (one digit per nibble), digits 0-9
    ENC_KEYPAD_ABC_TBCD - keypad-with-a/b/c "telephony packed BCD" = 0-9, *, #, a, b, c
    ENC_KEYPAD_BC_TBCD - keypad-with-B/C "telephony packed BCD" = 0-9, B, C, *, #
+    ENC_GB18030 - GB 18030
+    ENC_EUC_KR - EUC-KR

 Other encodings will be added in the future.

--- a/epan/charsets.c
+++ b/epan/charsets.c
@ -10,6 +10,7 @@

 #include "config.h"

+#include <errno.h>
 #include <glib.h>

 #include <epan/proto.h>
@ -91,6 +92,13 @@ get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
 * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
 * according to the recommended "best practices" given in the Unicode
 * Standard and specified by W3C/WHATWG.
+ *
+ * Note that in conformance with the Unicode Standard, this treats three
+ * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
+ * and two byte overlong encodings of 7-bit ASCII characters as invalid and
+ * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
+ * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
+ * be added later.
 */
 guint8 *
 get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
@ -1552,6 +1560,151 @@ get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint le
    return (guint8 *) wmem_strbuf_finalize(str);
 }

+/*
+ * Given a wmem scope, a pointer, a length, and the name of an encoding
+ * (as recognized by iconv), treat the bytes referred to by the pointer
+ * and length as a string in that encoding, and return a pointer to a UTF-8
+ * string, allocated using the wmem scope, converted from the original
+ * encoding with REPLACEMENT CHARACTER substituted according to the
+ * Unicode Standard 5.22 U+FFFD Substitution for Conversion
+ * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
+ */
+static guint8 *
+get_string_enc_iconv(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gchar *encoding)
+{
+    GIConv cd;
+    gsize inbytes, outbytes;
+    gsize tempstr_size, bytes_written;
+    gsize err;
+    gsize max_subpart, tempinbytes;
+    gchar *outptr, *tempstr;
+
+    wmem_strbuf_t *str;
+
+    if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
+        REPORT_DISSECTOR_BUG("Unable to allocate iconv() converter from %s to UTF-8", encoding);
+        /* Most likely a programming error (a bad encoding name was passed
+         * in). However, it could be an issue with the iconv support on the
+         * system running WS. GLib requires iconv/libiconv, but is it possible
+         * that some versions don't support all common encodings? */
+    }
+
+    inbytes = length;
+    str = wmem_strbuf_sized_new(scope, length+1, 0);
+    /* XXX: If speed becomes an issue, the faster way to do this would
+     * involve passing the wmem_strbuf_t's string buffer directly into
+     * g_iconv to avoid a memcpy later, but that requires changes to the
+     * wmem_strbuf interface to have non const access to the string buffer,
+     * and to manipulate the used length directly. */
+    outbytes = tempstr_size = MAX(8, length);
+    outptr = tempstr = (gchar *)g_malloc(outbytes);
+    while (inbytes > 0) {
+        err = g_iconv(cd, (gchar **)&ptr, &inbytes, &outptr, &outbytes);
+        bytes_written = outptr - tempstr;
+        wmem_strbuf_append_len(str, tempstr, bytes_written);
+        outptr = tempstr;
+        outbytes = tempstr_size;
+
+        if (err == (gsize) -1) {
+            /* Errors */
+            switch (errno) {
+                case EINVAL:
+                    /* Incomplete sequence at the end, not fatal: emit UNREPL and stop */
+                    wmem_strbuf_append_unichar(str, UNREPL);
+                    inbytes = 0;
+                    break;
+                case E2BIG:
+                    /* Not enough room (UTF-8 longer than the temporary buffer);
+                     * its contents were flushed above, so just convert again */
+                    break;
+                case EILSEQ:
+                    /* Find the maximal subpart of the ill-formed sequence */
+                    errno = EINVAL;
+                    for (max_subpart = 1; err == (gsize)-1 && errno == EINVAL; max_subpart++) {
+                        tempinbytes = max_subpart;
+                        err = g_iconv(cd, (gchar **)&ptr, &tempinbytes,
+                                &outptr, &outbytes);
+                    }
+                    max_subpart = MAX(1, max_subpart-1); /* = length of the last prefix probed */
+                    ptr += max_subpart;
+                    inbytes -= max_subpart;
+                    wmem_strbuf_append_unichar(str, UNREPL);
+                    outptr = tempstr;
+                    outbytes = tempstr_size;
+                    break;
+                default:
+                    /* Unexpected conversion error, unrecoverable */
+                    g_free(tempstr);
+                    g_iconv_close(cd);
+                    REPORT_DISSECTOR_BUG("Unexpected iconv() error when converting from %s to UTF-8", encoding);
+                    break;
+            }
+        } else {
+            /* Otherwise err is the number of replacement characters used,
+             * but we don't care about that. */
+            /* If we were converting to ISO-2022-JP or some other stateful
+             * decoder with shift sequences (e.g. EBCDIC mixed-byte), a
+             * final call with NULL input in order to output the shift
+             * sequence back to initial state might make sense, but not
+             * needed for UTF-8. */
+        }
+    }
+
+    g_free(tempstr);
+    g_iconv_close(cd);
+    return (guint8 *) wmem_strbuf_finalize(str);
+}
+
+/*
+ * Given a wmem scope, a pointer, and a length, treat the bytes referred to
+ * by the pointer and length as a GB18030 encoded string, and return a pointer
+ * to a UTF-8 string, allocated using the wmem scope, converted with
+ * REPLACEMENT CHARACTER substituted according to the Unicode Standard
+ * 5.22 U+FFFD Substitution for Conversion.
+ * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
+ *
+ * As expected, this will also decode GBK and GB2312 strings.
+ */
+guint8 *
+get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
+{
+    /* iconv/libiconv support is guaranteed with GLib. Support this
+     * via iconv, at least for now. */
+    /* GNU libiconv has supported GB18030 (~ Windows Code page 54936) since
+     * 2000-10-24 and version 1.4; is there a system that compiles current
+     * Wireshark yet whose iconv only supports GBK (~ Windows Code page 936)? */
+    const gchar *encoding = "GB18030";
+    GIConv cd;
+    if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
+        encoding = "GBK";
+        /* GB18030 is backwards compatible, at worst this will mean a few
+         * extra REPLACEMENT CHARACTERs - GBK lacks the four byte encodings
+         * from GB18030, which are all pairs of two byte sequences
+         * 0x[81-FE] 0x[30-39]; that trailing byte is illegal in GBK
+         * and thus the 4 byte characters will be replaced with two
+         * REPLACEMENT CHARACTERs. */
+    } else {
+        g_iconv_close(cd); /* probe only: it just tested that GB18030 can be opened */
+    }
+    return get_string_enc_iconv(scope, ptr, length, encoding);
+}
+
+/*
+ * Given a wmem scope, a pointer, and a length, treat the bytes referred to
+ * by the pointer and length as an EUC-KR encoded string, and return a pointer
+ * to a UTF-8 string, allocated using the wmem scope, converted with
+ * REPLACEMENT CHARACTER substituted according to the Unicode Standard
+ * 5.22 U+FFFD Substitution for Conversion.
+ * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
+ */
+guint8 *
+get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
+{
+    /* iconv/libiconv support is guaranteed with GLib. Support this
+     * via iconv (under its "EUC-KR" name), at least for now. */
+    return get_string_enc_iconv(scope, ptr, length, "EUC-KR");
+}
+
 /* T.61 to UTF-8 conversion table from OpenLDAP project
 * https://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=blob;f=libraries/libldap/t61.c;hb=HEAD
 */
--- a/epan/charsets.h
+++ b/epan/charsets.h
@ -203,6 +203,30 @@ get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
 WS_DLL_PUBLIC guint8 *
 get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256]);

+/*
+ * Given a wmem scope, a pointer, and a length, treat the bytes referred to
+ * by the pointer and length as a GB18030 encoded string, and return a pointer
+ * to a UTF-8 string, allocated using the wmem scope, converted having
+ * substituted REPLACEMENT CHARACTER according to the Unicode Standard
+ * 5.22 U+FFFD Substitution for Conversion.
+ * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
+ *
+ * As expected, this will also decode GBK and GB2312 strings.
+ */
+WS_DLL_PUBLIC guint8 *
+get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
+
+/*
+ * Given a wmem scope, a pointer, and a length, treat the bytes referred to
+ * by the pointer and length as an EUC-KR encoded string, and return a pointer
+ * to a UTF-8 string, allocated using the wmem scope, converted having
+ * substituted REPLACEMENT CHARACTER according to the Unicode Standard
+ * 5.22 U+FFFD Substitution for Conversion.
+ * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
+ */
+WS_DLL_PUBLIC guint8 *
+get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
+
 WS_DLL_PUBLIC guint8 *
 get_t61_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);

--- a/epan/dissectors/packet-ansi_637.c
+++ b/epan/dissectors/packet-ansi_637.c
@ -377,9 +377,7 @@ text_decoder(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint32 offset
    guint32     required_octs;
    tvbuff_t    *tvb_out = NULL;

-    GIConv      cd;
-    GError      *l_conv_error = NULL;
-    gchar       *ustr = NULL;
+    const guchar *ustr = NULL;

    /*
     * has to be big enough to hold all of the 'shifted' bits
@ -458,25 +456,10 @@ text_decoder(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint32 offset

        offset = 0;

-        if ((cd = g_iconv_open("UTF-8", "EUC-KR")) != (GIConv) -1)
-        {
-            ustr = g_convert_with_iconv(tvb_get_ptr(tvb_out, offset, required_octs), required_octs , cd , NULL , NULL , &l_conv_error);
-            if (!l_conv_error)
-            {
-                proto_tree_add_string(tree, hf_index, tvb_out, offset,
-                    required_octs, ustr);
-            }
-            else
-            {
-                proto_tree_add_expert_format(tree, pinfo, &ei_ansi_637_failed_conversion, tvb_out, offset, required_octs,
-                    "Failed iconv conversion on EUC-KR - (report to wireshark.org)");
-            }
-            if (ustr)
-            {
-                g_free(ustr);
-            }
-            g_iconv_close(cd);
-        }
+        proto_tree_add_item_ret_string(tree, hf_index, tvb_out, offset, required_octs, ENC_EUC_KR|ENC_NA, wmem_packet_scope(), &ustr);
+        if (ustr == NULL)
+            proto_tree_add_expert_format(tree, pinfo, &ei_ansi_637_failed_conversion, tvb_out, offset, required_octs,
+                "Failed iconv conversion on EUC-KR - (report to wireshark.org)");
        break;
    }
 }
--- a/epan/dissectors/packet-bacapp.c
+++ b/epan/dissectors/packet-bacapp.c
@ -8013,6 +8013,27 @@ fCharacterStringBase(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint
             * XXX - are we guaranteed that these encoding
             * names correspond, on *all* platforms with
             * iconv(), to the encodings we want?
+             *
+             * Not necessarily. These specify "character sets" but
+             * not the encodings. IBM/MS DBCS specifies that it uses
+             * some IBM or MS double byte character set, but does not
+             * specify the code page - there was a proposal to explicitly
+             * add the code page, but that was apparently withdrawn in favor
+             * of just deprecating using DBCS, as it never got past a draft
+             * (One problem could be that IBM and MS code pages with the
+             * same number are slightly different, and then there's non
+             * IBM/MS unofficial ones that got used, sometimes conflicting
+             * numbers.) Even if we assume that they certainly mean one
+             * of the DBCS and not just any non ISO-8859-1 code page, there's
+             * all four types of CJK to choose from. -
+             * http://www.bacnet.org/Addenda/Add-135-2004k-PPR1-chair-approved.pdf
+             * JIS C 6226 (now JIS X 0208)
+             * http://www.bacnet.org/Addenda/Add-135-2008k.pdf
+             * is a character set, which is supported by several different
+             * encodings, the main types being ISO-2022-JP (JIS X 0202,
+             * a 7 bit encoding), Shift-JIS (most common), and EUC-JP (UNIX).
+             * It is unclear which encoding this refers to.
+             *
             * If not (and perhaps even if so), we should
             * perhaps have our own iconv() implementation,
             * with a different name, so that we control the
--- a/epan/proto.h
+++ b/epan/proto.h
@ -427,16 +427,29 @@ void proto_report_dissector_bug(const char *format, ...)
 #define ENC_KEYPAD_BC_TBCD                0x00000048 /* Keypad-with-B/C "telephony BCD" = 0-9, B, C, *, # */
 #define ENC_3GPP_TS_23_038_7BITS_UNPACKED 0x0000004C
 #define ENC_ETSI_TS_102_221_ANNEX_A       0x0000004E /* ETSI TS 102 221 Annex A */
+#define ENC_GB18030                       0x00000050
+#define ENC_EUC_KR                        0x00000052
 /*
 * TODO:
 *
- * These could probably be used by existing code:
+ * packet-bacapp.c refers to two currently unsupported character sets (where
+ * we just use ASCII currently):
 *
- *  "IBM MS DBCS"
- *  JIS C 6226
+ *  "IBM MS DBCS" - At the very least could be any IBM/MS Double Byte
+ *      Character Set for CJK (4 major ones), but also could just be any non
+ *      Unicode and non ISO-8859-1 code page. This would be supported via the
+ *      various code pages.
+ *  JIS C 6226 / JIS X 0208 - Does this refer to ISO-2022-JP, SHIFT-JIS, or
+ *      EUC-JP, which are all encoding schemes that support the JIS X 0208
+ *      character set?
 *
 * As those are added, change code such as the code in packet-bacapp.c
 * to use them.
+ *
+ * There's also some other code (e.g., packet-smpp.c) that just ignores
+ * strings if it determines that they are in an unsupported encoding, such
+ * as various encodings of Japanese mentioned above, for example.
+ *
 */

 /*
--- a/epan/tvbuff.c
+++ b/epan/tvbuff.c
@ -2545,6 +2545,13 @@ tvb_get_iso_646_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint
 * scope, with all ill-formed sequences replaced with the Unicode REPLACEMENT
 * CHARACTER according to the recommended "best practices" given in the Unicode
 * Standard and specified by W3C/WHATWG.
+ *
+ * Note that in conformance with the Unicode Standard, this treats three
+ * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
+ * and two byte overlong encodings of 7-bit ASCII characters as invalid and
+ * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
+ * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
+ * be added later.
 */
 static guint8 *
 tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length)
@ -2764,6 +2771,42 @@ tvb_get_nonascii_unichar2_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint of
 	return get_nonascii_unichar2_string(scope, ptr, length, table);
 }

+/*
+ * Given a wmem scope, a tvbuff, an offset, and a length, treat the bytes
+ * referred to by the tvbuff, offset, and length as a GB18030 encoded string,
+ * and return a pointer to a UTF-8 string, allocated with the wmem scope,
+ * converted with REPLACEMENT CHARACTER substituted according to the
+ * Unicode Standard 5.22 U+FFFD Substitution for Conversion.
+ * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
+ *
+ * Done via get_gb18030_string(), so GBK and GB2312 strings decode as well.
+ */
+static guint8 *
+tvb_get_gb18030_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length)
+{
+	const guint8  *ptr;
+
+	ptr = ensure_contiguous(tvb, offset, length);
+	return get_gb18030_string(scope, ptr, length);
+}
+
+/*
+ * Given a wmem scope, a tvbuff, an offset, and a length, treat the bytes
+ * referred to by the tvbuff, offset, and length as an EUC-KR encoded string,
+ * and return a pointer to a UTF-8 string, allocated with the wmem scope,
+ * converted by get_euc_kr_string() with REPLACEMENT CHARACTER substituted
+ * according to the Unicode Standard 5.22 U+FFFD Substitution for Conversion.
+ * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
+ */
+static guint8 *
+tvb_get_euc_kr_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length)
+{
+	const guint8  *ptr;
+
+	ptr = ensure_contiguous(tvb, offset, length);
+	return get_euc_kr_string(scope, ptr, length);
+}
+
 static guint8 *
 tvb_get_t61_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length)
 {
@ -2834,12 +2877,6 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
 		break;

 	case ENC_UTF_8:
-		/*
-		 * XXX - should map lead and trail surrogate value code
-		 * points to a "substitute" UTF-8 character?
-		 * XXX - should map code points > 10FFFF to REPLACEMENT
-		 * CHARACTERs.
-		 */
 		strptr = tvb_get_utf_8_string(scope, tvb, offset, length);
 		break;

@ -3021,6 +3058,12 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
 	case ENC_ETSI_TS_102_221_ANNEX_A:
 		strptr = tvb_get_etsi_ts_102_221_annex_a_string(scope, tvb, offset, length);
 		break;
+	case ENC_GB18030:
+		strptr = tvb_get_gb18030_string(scope, tvb, offset, length);
+		break;
+	case ENC_EUC_KR:
+		strptr = tvb_get_euc_kr_string(scope, tvb, offset, length);
+		break;
 	}
 	return strptr;
 }