From 113b078a4d80a2aee7dbecb7724aebbec5de2dfc Mon Sep 17 00:00:00 2001 From: Jakub Zawadzki Date: Sat, 7 Dec 2013 10:10:03 +0000 Subject: [PATCH] Add new string proto encoding for windows-1250 (ENC_WINDOWS_1250) - Move windows-1250 to unicode encoding table to charset.c - Add tvb_get_string_unichar2, tvb_get_stringz_unichar2 functions which recode tvb-string to UTF-8. svn path=/trunk/; revision=53819 --- epan/charsets.c | 23 ++++++++++++++ epan/charsets.h | 3 ++ epan/dissectors/packet-gadu-gadu.c | 48 +++++------------------------- epan/proto.h | 1 + epan/tvbuff.c | 41 +++++++++++++++++++++++++ 5 files changed, 75 insertions(+), 41 deletions(-) diff --git a/epan/charsets.c b/epan/charsets.c index 1c5b79ac4b..f3406b6078 100644 --- a/epan/charsets.c +++ b/epan/charsets.c @@ -167,3 +167,26 @@ EBCDIC_to_ASCII1(guint8 c) { return EBCDIC_translate_ASCII[c]; } + +/* REPLACEMENT CHARACTER */ +#define UNREPL 0xFFFD + +/* Windows-1250 */ +const gunichar2 charset_table_cp1250[0x80] = { + 0x20ac, UNREPL, 0x201a, UNREPL, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */ + UNREPL, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, /* - 0x8F */ + UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */ + UNREPL, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, /* - 0x9F */ + 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, /* 0xA0 - */ + 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, /* - 0xAF */ + 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ + 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, /* - 0xBF */ + 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */ + 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */ + 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */ + 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */ + 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 
0x00e7, /* 0xE0 - */ + 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */ + 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */ + 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, /* - 0xFF */ +}; diff --git a/epan/charsets.h b/epan/charsets.h index 345c7bb99a..0764eac600 100644 --- a/epan/charsets.h +++ b/epan/charsets.h @@ -39,6 +39,9 @@ void EBCDIC_to_ASCII(guint8 *buf, guint bytes); WS_DLL_PUBLIC guint8 EBCDIC_to_ASCII1(guint8 c); +/* Table for windows-1250 */ +extern const gunichar2 charset_table_cp1250[0x80]; + #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/epan/dissectors/packet-gadu-gadu.c b/epan/dissectors/packet-gadu-gadu.c index ebf688cb84..3b64efa55e 100644 --- a/epan/dissectors/packet-gadu-gadu.c +++ b/epan/dissectors/packet-gadu-gadu.c @@ -582,51 +582,17 @@ gadu_gadu_status_has_descr(int status) } static int -dissect_gadu_gadu_stringz_cp1250(tvbuff_t *tvb, const header_field_info *hfi, proto_tree *tree, int offset) +dissect_gadu_gadu_stringz_cp1250(tvbuff_t *tvb, const header_field_info *hfi, proto_tree *tree, const int offset) { - static const gunichar2 table_cp1250[] = { - 0x20ac, 0xFFFD, 0x201a, 0xFFFD, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */ - 0xFFFD, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, /* - 0x8F */ - 0xFFFD, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */ - 0xFFFD, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, /* - 0x9F */ - 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, /* 0xA0 - */ - 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, /* - 0xAF */ - 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */ - 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, /* - 0xBF */ - 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */ - 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */ - 0x0110, 0x0143, 0x0147, 
0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */ - 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */ - 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */ - 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */ - 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */ - 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, /* - 0xFF */ - }; + const char *str; + int len; - const int org_offset = offset; + /* XXX, new code is throwing exception if string is not NUL terminated */ + str = (const char *) tvb_get_stringz_enc(wmem_packet_scope(), tvb, offset, &len, ENC_NA | ENC_WINDOWS_1250); - wmem_strbuf_t *str; - guint8 ch; - gint len; + proto_tree_add_unicode_string(tree, hfi->id, tvb, offset, len, str); - len = tvb_reported_length_remaining(tvb, offset); - - str = wmem_strbuf_new(wmem_packet_scope(), ""); - - while ((len > 0) && (ch = tvb_get_guint8(tvb, offset))) { - if (ch < 0x80) - wmem_strbuf_append_c(str, ch); - else - wmem_strbuf_append_unichar(str, table_cp1250[ch-0x80]); - offset++; - len--; - } - if (len > 0) - offset++; /* NUL */ - - proto_tree_add_unicode_string(tree, hfi->id, tvb, org_offset, offset - org_offset, wmem_strbuf_get_str(str)); - - return offset; + return offset + len; } static int diff --git a/epan/proto.h b/epan/proto.h index a9841a9899..d3badde030 100644 --- a/epan/proto.h +++ b/epan/proto.h @@ -284,6 +284,7 @@ WS_DLL_PUBLIC WS_MSVC_NORETURN void proto_report_dissector_bug(const char *messa #define ENC_UTF_16 0x00000004 #define ENC_UCS_2 0x00000006 #define ENC_EBCDIC 0x00000008 +#define ENC_WINDOWS_1250 0x0000000A /* http://en.wikipedia.org/wiki/Windows-1250 */ /* * TODO: diff --git a/epan/tvbuff.c b/epan/tvbuff.c index c726116241..3180ec0794 100644 --- a/epan/tvbuff.c +++ b/epan/tvbuff.c @@ -1807,6 +1807,28 @@ tvb_get_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, const return strbuf; } +static guint8 * 
+tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length, const gunichar2 table[0x80]) +{ + wmem_strbuf_t *str; + + str = wmem_strbuf_new(scope, ""); + + while (length > 0) { + guint8 ch = tvb_get_guint8(tvb, offset); + + if (ch < 0x80) + wmem_strbuf_append_c(str, ch); + else + wmem_strbuf_append_unichar(str, table[ch-0x80]); + offset++; + length--; + } + + /* XXX, discarding constness, should we have some function which "takes over" strbuf->str (like when strbuf is no longer needed)? */ + return (guint8 *) wmem_strbuf_get_str(str); +} + /* * Unicode (UTF-16) version of tvb_get_string() * XXX - this is UCS-2, not UTF-16, as it doesn't handle surrogate pairs @@ -1885,6 +1907,10 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, strbuf = tvb_get_string(scope, tvb, offset, length); break; + case ENC_WINDOWS_1250: + strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_cp1250); + break; + case ENC_UTF_8: /* * XXX - should map all invalid UTF-8 sequences @@ -1960,6 +1986,17 @@ tvb_get_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint return strptr; } +static guint8 * +tvb_get_stringz_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint *lengthp, const gunichar2 table[0x80]) +{ + guint size; + + /* XXX, conversion between signed/unsigned integer */ + *lengthp = size = tvb_strsize(tvb, offset); + + return tvb_get_string_unichar2(scope, tvb, offset, size, table); +} + guint8 * tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding) { @@ -1985,6 +2022,10 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g strptr = tvb_get_stringz(scope, tvb, offset, lengthp); break; + case ENC_WINDOWS_1250: + strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_cp1250); + break; + case ENC_UTF_8: /* * XXX - should map all invalid UTF-8 sequences