diff --git a/debian/libwireshark0.symbols b/debian/libwireshark0.symbols index afc5fcfb28..7866ffb655 100644 --- a/debian/libwireshark0.symbols +++ b/debian/libwireshark0.symbols @@ -798,6 +798,7 @@ libwireshark.so.0 libwireshark0 #MINVER# get_eth_hashtable@Base 1.12.0~rc1 get_ether_name@Base 1.9.1 get_etsi_ts_102_221_annex_a_string@Base 3.3.1 + get_euc_kr_string@Base 3.3.2 get_follow_address_func@Base 2.1.0 get_follow_by_name@Base 2.1.0 get_follow_conv_func@Base 2.1.0 @@ -807,6 +808,7 @@ libwireshark.so.0 libwireshark0 #MINVER# get_follow_tap_handler@Base 2.1.0 get_follow_tap_string@Base 2.1.0 get_export_pdu_tap_list@Base 1.99.0 + get_gb18030_string@Base 3.3.2 get_hash_ether_status@Base 1.99.3 get_hash_ether_hexaddr@Base 1.99.3 get_hash_ether_resolved_name@Base 1.99.3 diff --git a/doc/README.dissector b/doc/README.dissector index 845b138b2c..e364bd9785 100644 --- a/doc/README.dissector +++ b/doc/README.dissector @@ -1759,9 +1759,13 @@ currently supported are: ENC_ISO_8859_14 - ISO 8859-14 ENC_ISO_8859_15 - ISO 8859-15 ENC_ISO_8859_16 - ISO 8859-16 - ENC_WINDOWS_1250 - Windows-1250 ENC_3GPP_TS_23_038_7BITS - GSM 7 bits alphabet as described in 3GPP TS 23.038 + ENC_3GPP_TS_23_038_7BITS_UNPACKED - GSM 7 bits alphabet where each + 7 bit character occupies a distinct octet + ENC_ETSI_TS_102_221_ANNEX_A - Coding scheme for SIM cards with GSM 7 bit + alphabet, UCS-2 characters, or a mixture of the two as described + in ETSI TS 102 221 Annex A ENC_EBCDIC - EBCDIC ENC_EBCDIC_CP037 - EBCDIC code page 037 ENC_MAC_ROMAN - MAC ROMAN @@ -1773,6 +1777,8 @@ currently supported are: ENC_BCD_DIGITS_0_9 - packed BCD (one digit per nibble), digits 0-9 ENC_KEYPAD_ABC_TBCD - keypad-with-a/b/c "telephony packed BCD" = 0-9, *, #, a, b, c ENC_KEYPAD_BC_TBCD - keypad-with-B/C "telephony packed BCD" = 0-9, B, C, *, # + ENC_GB18030 - GB 18030 + ENC_EUC_KR - EUC-KR Other encodings will be added in the future. 
diff --git a/epan/charsets.c b/epan/charsets.c index cbc9b712a9..a49cb127ae 100644 --- a/epan/charsets.c +++ b/epan/charsets.c @@ -10,6 +10,7 @@ #include "config.h" +#include <errno.h> #include <glib.h> #include <epan/proto.h> @@ -91,6 +92,13 @@ get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length) * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER * according to the recommended "best practices" given in the Unicode * Standard and specified by W3C/WHATWG. + * + * Note that in conformance with the Unicode Standard, this treats three + * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired) + * and two byte overlong encodings of 7-bit ASCII characters as invalid and + * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard + * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could + * be added later. */ guint8 * get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length) @@ -1552,6 +1560,151 @@ get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint le return (guint8 *) wmem_strbuf_finalize(str); } +/* + * Given a wmem scope, a pointer, a length, and a string referring to an + * encoding (recognized by iconv), treat the bytes referred to by the pointer + * and length as a string in that encoding, and return a pointer to a UTF-8 + * string, allocated using the wmem scope, converted from the original + * encoding having substituted REPLACEMENT CHARACTER according to the + * Unicode Standard 5.22 U+FFFD Substitution for Conversion + * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) + */ +static guint8 * +get_string_enc_iconv(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gchar *encoding) +{ + GIConv cd; + gsize inbytes, outbytes; + gsize tempstr_size, bytes_written; + gsize err; + gsize max_subpart, tempinbytes; + gchar *outptr, *tempstr; + + wmem_strbuf_t *str; + + if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) { + 
REPORT_DISSECTOR_BUG("Unable to allocate iconv() converter from %s to UTF-8", encoding); + /* Most likely to be a programming error passing in a bad encoding + * name. However, could be an issue with the iconv support on the + * system running WS. GLib requires iconv/libiconv, but is it possible + * that some versions don't support all common encodings? */ + } + + inbytes = length; + str = wmem_strbuf_sized_new(scope, length+1, 0); + /* XXX: If speed becomes an issue, the faster way to do this would + * involve passing the wmem_strbuf_t's string buffer directly into + * g_iconv to avoid a memcpy later, but that requires changes to the + * wmem_strbuf interface to have non const access to the string buffer, + * and to manipulate the used length directly. */ + outbytes = tempstr_size = MAX(8, length); + outptr = tempstr = (gchar *)g_malloc(outbytes); + while (inbytes > 0) { + err = g_iconv(cd, (gchar **)&ptr, &inbytes, &outptr, &outbytes); + bytes_written = outptr - tempstr; + wmem_strbuf_append_len(str, tempstr, bytes_written); + outptr = tempstr; + outbytes = tempstr_size; + + if (err == (gsize) -1) { + /* Errors */ + switch (errno) { + case EINVAL: + /* Incomplete sequence at the end, not an error */ + wmem_strbuf_append_unichar(str, UNREPL); + inbytes = 0; + break; + case E2BIG: + /* Not enough room (UTF-8 longer than the initial buffer), + * start back at the beginning of the buffer */ + break; + case EILSEQ: + /* Find the maximal subpart of the ill-formed sequence */ + errno = EINVAL; + for (max_subpart = 1; err == (gsize)-1 && errno == EINVAL; max_subpart++) { + tempinbytes = max_subpart; + err = g_iconv(cd, (gchar **)&ptr, &tempinbytes, + &outptr, &outbytes); + } + max_subpart = MAX(1, max_subpart-1); + ptr += max_subpart; + inbytes -= max_subpart; + wmem_strbuf_append_unichar(str, UNREPL); + outptr = tempstr; + outbytes = tempstr_size; + break; + default: + /* Unexpected conversion error, unrecoverable */ + g_free(tempstr); + g_iconv_close(cd); + 
REPORT_DISSECTOR_BUG("Unexpected iconv() error when converting from %s to UTF-8", encoding); + break; + } + } else { + /* Otherwise err is the number of replacement characters used, + * but we don't care about that. */ + /* If we were converting to ISO-2022-JP or some other stateful + * decoder with shift sequences (e.g. EBCDIC mixed-byte), a + * final call with NULL input in order to output the shift + * sequence back to initial state might make sense, but not + * needed for UTF-8. */ + } + } + + g_free(tempstr); + g_iconv_close(cd); + return (guint8 *) wmem_strbuf_finalize(str); +} + +/* + * Given a wmem scope, a pointer, and a length, treat the bytes referred to + * by the pointer and length as a GB18030 encoded string, and return a pointer + * to a UTF-8 string, allocated using the wmem scope, converted having + * substituted REPLACEMENT CHARACTER according to the Unicode Standard + * 5.22 U+FFFD Substitution for Conversion. + * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) + * + * As expected, this will also decode GBK and GB2312 strings. + */ +guint8 * +get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length) +{ + /* iconv/libiconv support is guaranteed with GLib. Support this + * via iconv, at least for now. */ + /* GNU libiconv has supported GB18030 (~ Windows Code page 54936) since + * 2000-10-24 and version 1.4; is there a system that compiles current + * Wireshark yet its iconv only supports GBK (~ Windows Code page 936)? */ + const gchar *encoding = "GB18030"; + GIConv cd; + if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) { + encoding = "GBK"; + /* GB18030 is backwards compatible, at worst this will mean a few + * extra REPLACEMENT CHARACTERs - GBK lacks the four byte encodings + * from GB18030, which are all pairs of two byte sequences + * 0x[81-FE] 0x[30-39]; that trailing byte is illegal in GBK + * and thus the 4 byte characters will be replaced with two + * REPLACEMENT CHARACTERs. 
*/ + } else { + g_iconv_close(cd); + } + return get_string_enc_iconv(scope, ptr, length, encoding); +} + +/* + * Given a wmem scope, a pointer, and a length, treat the bytes referred to + * by the pointer and length as a EUC-KR encoded string, and return a pointer + * to a UTF-8 string, allocated using the wmem scope, converted having + * substituted REPLACEMENT CHARACTER according to the Unicode Standard + * 5.22 U+FFFD Substitution for Conversion. + * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) + */ +guint8 * +get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length) +{ + /* iconv/libiconv support is guaranteed with GLib. Support this + * via iconv, at least for now. */ + return get_string_enc_iconv(scope, ptr, length, "EUC-KR"); +} + /* T.61 to UTF-8 conversion table from OpenLDAP project * https://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=blob;f=libraries/libldap/t61.c;hb=HEAD */ diff --git a/epan/charsets.h b/epan/charsets.h index 9c842758b6..d5099119d8 100644 --- a/epan/charsets.h +++ b/epan/charsets.h @@ -203,6 +203,30 @@ get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr, WS_DLL_PUBLIC guint8 * get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256]); +/* + * Given a wmem scope, a pointer, and a length, treat the bytes referred to + * by the pointer and length as a GB18030 encoded string, and return a pointer + * to a UTF-8 string, allocated using the wmem scope, converted having + * substituted REPLACEMENT CHARACTER according to the Unicode Standard + * 5.22 U+FFFD Substitution for Conversion. + * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) + * + * As expected, this will also decode GBK and GB2312 strings. 
+ */ +WS_DLL_PUBLIC guint8 * +get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length); + +/* + * Given a wmem scope, a pointer, and a length, treat the bytes referred to + * by the pointer and length as a EUC-KR encoded string, and return a pointer + * to a UTF-8 string, allocated using the wmem scope, converted having + * substituted REPLACEMENT CHARACTER according to the Unicode Standard + * 5.22 U+FFFD Substitution for Conversion. + * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) + */ +WS_DLL_PUBLIC guint8 * +get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length); + WS_DLL_PUBLIC guint8 * get_t61_string(wmem_allocator_t *scope, const guint8 *ptr, gint length); diff --git a/epan/dissectors/packet-ansi_637.c b/epan/dissectors/packet-ansi_637.c index 97a760e5ed..e633846e2d 100644 --- a/epan/dissectors/packet-ansi_637.c +++ b/epan/dissectors/packet-ansi_637.c @@ -377,9 +377,7 @@ text_decoder(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint32 offset guint32 required_octs; tvbuff_t *tvb_out = NULL; - GIConv cd; - GError *l_conv_error = NULL; - gchar *ustr = NULL; + const guchar *ustr = NULL; /* * has to be big enough to hold all of the 'shifted' bits @@ -458,25 +456,10 @@ text_decoder(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint32 offset offset = 0; - if ((cd = g_iconv_open("UTF-8", "EUC-KR")) != (GIConv) -1) - { - ustr = g_convert_with_iconv(tvb_get_ptr(tvb_out, offset, required_octs), required_octs , cd , NULL , NULL , &l_conv_error); - if (!l_conv_error) - { - proto_tree_add_string(tree, hf_index, tvb_out, offset, - required_octs, ustr); - } - else - { - proto_tree_add_expert_format(tree, pinfo, &ei_ansi_637_failed_conversion, tvb_out, offset, required_octs, - "Failed iconv conversion on EUC-KR - (report to wireshark.org)"); - } - if (ustr) - { - g_free(ustr); - } - g_iconv_close(cd); - } + proto_tree_add_item_ret_string(tree, hf_index, tvb_out, offset, required_octs, ENC_EUC_KR|ENC_NA, 
wmem_packet_scope(), &ustr); + if (ustr == NULL) + proto_tree_add_expert_format(tree, pinfo, &ei_ansi_637_failed_conversion, tvb_out, offset, required_octs, + "Failed iconv conversion on EUC-KR - (report to wireshark.org)"); break; } } diff --git a/epan/dissectors/packet-bacapp.c b/epan/dissectors/packet-bacapp.c index a4574df90b..261b4efbba 100644 --- a/epan/dissectors/packet-bacapp.c +++ b/epan/dissectors/packet-bacapp.c @@ -8013,6 +8013,27 @@ fCharacterStringBase(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint * XXX - are we guaranteed that these encoding * names correspond, on *all* platforms with * iconv(), to the encodings we want? + * + * Not necessarily. These specify "character sets" but + * not the encodings. IBM/MS DBCS specifies that it uses + * some IBM or MS double byte character set, but does not + * specify the code page - there was a proposal to explicitly + * add the code page, but that was apparently withdrawn in favor + * of just deprecating using DBCS, as it never got past a draft + * (One problem could be that IBM and MS code pages with the + * same number are slightly different, and then there's non + * IBM/MS unofficial ones that got used, sometimes conflicting + * numbers.) Even if we assume that they certainly mean one + * of the DBCS and not just any non ISO-8859-1 code page, there's + * all four types of CJK to choose from. - + * http://www.bacnet.org/Addenda/Add-135-2004k-PPR1-chair-approved.pdf + * JIS C 6226 (now JIS X 0208) + * http://www.bacnet.org/Addenda/Add-135-2008k.pdf + * is a character set, which are supported by several different + * encodings, the main types being ISO-2022-JP (JIS X 0202, + * a 7 bit encoding), Shift-JIS (most common), and EUC-JP (UNIX). + * It is unclear which encoding this refers to. 
+ * * If not (and perhaps even if so), we should * perhaps have our own iconv() implementation, * with a different name, so that we control the diff --git a/epan/proto.h b/epan/proto.h index 679cb6989c..ea43c9afbf 100644 --- a/epan/proto.h +++ b/epan/proto.h @@ -427,16 +427,29 @@ void proto_report_dissector_bug(const char *format, ...) #define ENC_KEYPAD_BC_TBCD 0x00000048 /* Keypad-with-B/C "telephony BCD" = 0-9, B, C, *, # */ #define ENC_3GPP_TS_23_038_7BITS_UNPACKED 0x0000004C #define ENC_ETSI_TS_102_221_ANNEX_A 0x0000004E /* ETSI TS 102 221 Annex A */ +#define ENC_GB18030 0x00000050 +#define ENC_EUC_KR 0x00000052 /* * TODO: * - * These could probably be used by existing code: + * packet-bacapp.c refers to two currently unsupported character sets (where + * we just use ASCII currently): * - * "IBM MS DBCS" - * JIS C 6226 + * "IBM MS DBCS" - At the very least could be any IBM/MS Double Byte + * Character Set for CJK (4 major ones), but also could just be any non + * Unicode and non ISO-8859-1 code page. This would be supported via the + * various code pages. + * JIS C 6226 / JIS X 0208 - Does this refer to ISO-2022-JP, SHIFT-JIS, or + * EUC-JP, which are all encoding schemes that support the JIS X 0208 + * character set? * * As those are added, change code such as the code in packet-bacapp.c * to use them. + * + * There's also some other code (e.g., packet-smpp.c) that just ignores + * strings if it determines that they are in an unsupported encoding, such + * as various encodings of Japanese mentioned above, for example. + * */ /* diff --git a/epan/tvbuff.c b/epan/tvbuff.c index 3fff345d5a..7b4972fdbc 100644 --- a/epan/tvbuff.c +++ b/epan/tvbuff.c @@ -2545,6 +2545,13 @@ tvb_get_iso_646_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint * scope, with all ill-formed sequences replaced with the Unicode REPLACEMENT * CHARACTER according to the recommended "best practices" given in the Unicode * Standard and specified by W3C/WHATWG. 
+ * + * Note that in conformance with the Unicode Standard, this treats three + * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired) + * and two byte overlong encodings of 7-bit ASCII characters as invalid and + * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard + * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could + * be added later. */ static guint8 * tvb_get_utf_8_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const gint length) @@ -2764,6 +2771,42 @@ tvb_get_nonascii_unichar2_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint of return get_nonascii_unichar2_string(scope, ptr, length, table); } +/* + * Given a wmem scope, a tvbuff, an offset, and a length, treat the bytes + * referred to by the tvbuff, offset, and length as a GB18030 encoded string, + * and return a pointer to a UTF-8 string, allocated with the wmem scope, + * converted having substituted REPLACEMENT CHARACTER according to the + * Unicode Standard 5.22 U+FFFD Substitution for Conversion. + * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) + * + * As expected, this will also decode GBK and GB2312 strings. + */ +static guint8 * +tvb_get_gb18030_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length) +{ + const guint8 *ptr; + + ptr = ensure_contiguous(tvb, offset, length); + return get_gb18030_string(scope, ptr, length); +} + +/* + * Given a wmem scope, a tvbuff, an offset, and a length, treat the bytes + * referred to by the tvbuff, offset, and length as a EUC-KR encoded string, + * and return a pointer to a UTF-8 string, allocated with the wmem scope, + * converted having substituted REPLACEMENT CHARACTER according to the + * Unicode Standard 5.22 U+FFFD Substitution for Conversion. 
+ * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf ) + */ +static guint8 * +tvb_get_euc_kr_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length) +{ + const guint8 *ptr; + + ptr = ensure_contiguous(tvb, offset, length); + return get_euc_kr_string(scope, ptr, length); +} + static guint8 * tvb_get_t61_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length) { @@ -2834,12 +2877,6 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, break; case ENC_UTF_8: - /* - * XXX - should map lead and trail surrogate value code - * points to a "substitute" UTF-8 character? - * XXX - should map code points > 10FFFF to REPLACEMENT - * CHARACTERs. - */ strptr = tvb_get_utf_8_string(scope, tvb, offset, length); break; @@ -3021,6 +3058,12 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, case ENC_ETSI_TS_102_221_ANNEX_A: strptr = tvb_get_etsi_ts_102_221_annex_a_string(scope, tvb, offset, length); break; + case ENC_GB18030: + strptr = tvb_get_gb18030_string(scope, tvb, offset, length); + break; + case ENC_EUC_KR: + strptr = tvb_get_euc_kr_string(scope, tvb, offset, length); + break; } return strptr; }