Add UCS-4 support, and use it.

Shuffle the character ENC_ values around a bit, keeping the Unicode
encodings together, moving the Windows code pages (only one for now)
after the ISO 8859 encodings, and putting "I can't believe it's not
ASCII!" at the end.

Fix some comment typos, and update another comment, while we're at it.

svn path=/trunk/; revision=54351
This commit is contained in:
Guy Harris 2013-12-22 08:45:57 +00:00
parent 5084f58ce2
commit fc7a77189d
3 changed files with 159 additions and 143 deletions

View File

@ -2310,22 +2310,6 @@ proto_register_bacapp(void);
void
proto_reg_handoff_bacapp(void);
/**
* converts XXX coded strings to UTF-8
* else 'in' is copied to 'out'
* @param in -- pointer to string
* @param inbytesleft size of in bytes
* @param out -- pointer to string
* @param outbytesleft size of out bytes
* @param fromcoding coding type
* @return count of modified characters of returned string, -1 for errors
*/
static guint32
fConvertXXXtoUTF8(gchar *in, gsize *inbytesleft, gchar *out, gsize *outbytesleft, const gchar *fromcoding);
static void
uni_to_string(char * data, gsize str_length, char *dest_buf);
/* <<<< formerly bacapp.h */
/* reassembly table for segmented messages */
@ -6245,9 +6229,8 @@ fCharacterString(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint offs
{
guint8 tag_no, tag_info, character_set;
guint32 lvt, l;
gsize inbytesleft, outbytesleft = 512;
gsize inbytesleft;
guint offs, extra = 1;
guint8 *str_val;
const char *coding;
guint8 bf_arr[512], *out = &bf_arr[0];
proto_item *ti;
@ -6299,8 +6282,7 @@ fCharacterString(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree, guint offs
coding = "JIS C 6226";
break;
case ISO_10646_UCS4:
str_val = tvb_get_string(wmem_packet_scope(), tvb, offset, l);
fConvertXXXtoUTF8(str_val, &inbytesleft, out, &outbytesleft, "UCS-4BE");
out = tvb_get_string_enc(wmem_packet_scope(), tvb, offset, l, ENC_UCS_4|ENC_BIG_ENDIAN);
coding = "ISO 10646 UCS-4";
break;
case ISO_10646_UCS2:
@ -11040,68 +11022,6 @@ bacapp_init_routine(void)
&addresses_reassembly_table_functions);
}
/*
 * Convert the bytes at 'in', encoded as 'fromcoding', to a NUL-terminated
 * UTF-8 string at 'out'.
 *
 * in           -- input bytes
 * inbytesleft  -- in: number of input bytes; out: input bytes not consumed
 * out          -- output buffer
 * outbytesleft -- in: size of the output buffer; out: bytes still free
 *                 (one byte is always held back for the terminating NUL)
 * fromcoding   -- iconv name of the source encoding
 *
 * Returns the g_iconv() result cast to guint32 when a converter for
 * 'fromcoding' could be opened, 0 when the uni_to_string() fallback was
 * used, and (guint32)-1 when the output buffer has no room at all.
 */
static guint32
fConvertXXXtoUTF8(gchar *in, gsize *inbytesleft, gchar *out, gsize *outbytesleft, const gchar *fromcoding)
{
    guint32 i;
    GIConv  icd;
    gsize   copy_len;

    /* Not even the terminating NUL fits: refuse to touch 'out'. */
    if (*outbytesleft == 0) {
        return (guint32) -1;
    }

    /*
     * Hold one byte back so that the terminator written below can never
     * land past the end of the buffer, even if the conversion fills it.
     */
    (*outbytesleft)--;

    if ((icd = g_iconv_open("UTF-8", fromcoding)) != (GIConv) -1) {
        i = (guint32) g_iconv(icd, &in, inbytesleft, &out, outbytesleft);
        /* g_iconv incremented 'out'; now ensure it's NUL terminated */
        out[0] = '\0';
        g_iconv_close(icd);
        return i;
    }

    /*
     * No converter available for this encoding: fall back to
     * uni_to_string(), limited to what the output buffer can hold.
     */
    copy_len = MIN(*inbytesleft, *outbytesleft);
    uni_to_string(in, copy_len, out);
    out[copy_len] = '\0';
    *outbytesleft -= copy_len;
    *inbytesleft  -= copy_len;
    return 0;
}
/*
 * Copy 'str_length' bytes from 'data' to 'dest_buf' as a NUL-terminated
 * printable string:
 *   - NUL bytes in the input are skipped;
 *   - bytes outside the range 0x20..0x7e are replaced by '.';
 *   - all other bytes are copied unchanged.
 *
 * 'dest_buf' must have room for at least str_length + 1 bytes.
 *
 * Separate source and destination indices are used so that skipping a
 * NUL byte advances the input without advancing the output; sharing one
 * index made the loop re-read the same NUL byte until it ran out of
 * input, truncating the result at the first NUL.
 */
static void
uni_to_string(char * data, gsize str_length, char *dest_buf)
{
    gsize  src;
    gsize  dst = 0;
    guint8 c_char;

    for (src = 0; src < str_length; src++) {
        /* Read as unsigned so bytes >= 0x80 compare as expected. */
        c_char = (guint8) data[src];
        if (c_char == 0x00) {
            continue;
        }
        if ((c_char < 0x20) || (c_char > 0x7e)) {
            c_char = '.';
        }
        dest_buf[dst++] = (char) c_char;
    }
    dest_buf[dst] = '\0';
}
void
proto_register_bacapp(void)
{

View File

@ -270,10 +270,6 @@ WS_DLL_PUBLIC WS_MSVC_NORETURN void proto_report_dissector_bug(const char *messa
* For UTF-8, invalid UTF-8 sequences should be mapped to the same
* code point.
*
* We also don't process UTF-16 or UCS-2 differently - we don't
* handle surrogate pairs, and don't handle 2-byte values that
* aren't valid in UTF-16 or UCS-2 strings.
*
* For display, perhaps we should also map control characters to the
* Unicode glyphs showing the name of the control character in small
* caps, diagonally. (Unfortunately, those only exist for C0, not C1.)
@ -283,31 +279,31 @@ WS_DLL_PUBLIC WS_MSVC_NORETURN void proto_report_dissector_bug(const char *messa
#define ENC_UTF_8 0x00000002
#define ENC_UTF_16 0x00000004
#define ENC_UCS_2 0x00000006
#define ENC_EBCDIC 0x00000008
#define ENC_WINDOWS_1250 0x0000000A
#define ENC_ISO_8859_1 0x0000000C
#define ENC_ISO_8859_2 0x0000000E
#define ENC_ISO_8859_3 0x00000010
#define ENC_ISO_8859_4 0x00000012
#define ENC_ISO_8859_5 0x00000014
#define ENC_ISO_8859_6 0x00000016
#define ENC_ISO_8859_7 0x00000018
#define ENC_ISO_8859_8 0x0000001A
#define ENC_ISO_8859_9 0x0000001C
#define ENC_ISO_8859_10 0x0000001E
#define ENC_ISO_8859_11 0x00000020
/* #define ENC_ISO_8859_12 0x00000022 ISO 8859-12 was abandoned */
#define ENC_ISO_8859_13 0x00000024
#define ENC_ISO_8859_14 0x00000026
#define ENC_ISO_8859_15 0x00000028
#define ENC_ISO_8859_16 0x0000002A
#define ENC_UCS_4 0x00000008
#define ENC_ISO_8859_1 0x0000000A
#define ENC_ISO_8859_2 0x0000000C
#define ENC_ISO_8859_3 0x0000000E
#define ENC_ISO_8859_4 0x00000010
#define ENC_ISO_8859_5 0x00000012
#define ENC_ISO_8859_6 0x00000014
#define ENC_ISO_8859_7 0x00000016
#define ENC_ISO_8859_8 0x00000018
#define ENC_ISO_8859_9 0x0000001A
#define ENC_ISO_8859_10 0x0000001C
#define ENC_ISO_8859_11 0x0000001E
/* #define ENC_ISO_8859_12 0x00000020 ISO 8859-12 was abandoned */
#define ENC_ISO_8859_13 0x00000022
#define ENC_ISO_8859_14 0x00000024
#define ENC_ISO_8859_15 0x00000026
#define ENC_ISO_8859_16 0x00000028
#define ENC_WINDOWS_1250 0x0000002A
#define ENC_EBCDIC 0x0000002C
/*
* TODO:
*
* These could probably be used by existing code:
*
* ENC_UCS_4 - UCS-4
* - "IBM MS DBCS"
* - JIS C 6226
* 7-bit encodings such as ETSI 03.38 (GSM SMS character set

View File

@ -1924,7 +1924,7 @@ tvb_get_string_unichar2(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gin
* Basic Multilingual Plane (plane 0) of Unicode, return a UTF-8
* string with the same characters.
*
* Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Specify length in bytes
*
@ -1967,7 +1967,7 @@ tvb_get_ucs_2_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
* Given a UTF-16 encoded Unicode string, return a UTF-8 string with the
* same characters.
*
* Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Specify length in bytes
*
@ -2073,6 +2073,50 @@ tvb_get_utf_16_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
 * Given a UCS-4-encoded Unicode string, return a UTF-8 string with the
 * same characters.
 *
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
 *
 * Specify length in bytes
 *
 * If scope is NULL, memory is allocated with g_malloc() and user must
 * explicitly free it with g_free().
 * If scope is not NULL, memory is allocated with the corresponding pool
 * lifetime.
 *
 * XXX - should map lead and trail surrogate values, and code points beyond
 * the maximum Unicode character, to a "substitute" UTF-8 character?
 */
static gchar *
tvb_get_ucs_4_string(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint length, const guint encoding)
{
	gunichar       uchar;
	gint           i;       /* Byte counter for tvbuff */
	wmem_strbuf_t *strbuf;

	tvb_ensure_bytes_exist(tvb, offset, length);

	strbuf = wmem_strbuf_new(scope, NULL);

	/*
	 * Each UCS-4 code point occupies 4 octets, so step by 4; the
	 * "i + 3 < length" test stops before a partial trailing unit.
	 */
	for(i = 0; i + 3 < length; i += 4) {
		if (encoding == ENC_BIG_ENDIAN)
			uchar = tvb_get_ntohl(tvb, offset + i);
		else
			uchar = tvb_get_letohl(tvb, offset + i);

		wmem_strbuf_append_unichar(strbuf, uchar);
	}

	/*
	 * XXX - if i < length, this means we were handed a number
	 * of bytes that's not a multiple of 4, so we're not a valid
	 * UCS-4 string.
	 */
	return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
* Given a tvbuff, an offset, a length, and an encoding, allocate a
* buffer big enough to hold a non-null-terminated string of that length
@ -2131,24 +2175,9 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
encoding & ENC_LITTLE_ENDIAN);
break;
case ENC_EBCDIC:
/*
* XXX - do the copy and conversion in one pass.
*
* XXX - multiple "dialects" of EBCDIC?
*/
tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */
strbuf = (guint8 *)wmem_alloc(scope, length + 1);
if (length != 0) {
ptr = ensure_contiguous(tvb, offset, length);
memcpy(strbuf, ptr, length);
EBCDIC_to_ASCII(strbuf, length);
}
strbuf[length] = '\0';
break;
case ENC_WINDOWS_1250:
strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_cp1250);
case ENC_UCS_4:
strbuf = tvb_get_ucs_4_string(scope, tvb, offset, length,
encoding & ENC_LITTLE_ENDIAN);
break;
case ENC_ISO_8859_1:
@ -2215,6 +2244,26 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
case ENC_ISO_8859_16:
strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_iso_8859_16);
break;
case ENC_WINDOWS_1250:
strbuf = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_cp1250);
break;
case ENC_EBCDIC:
/*
* XXX - do the copy and conversion in one pass.
*
* XXX - multiple "dialects" of EBCDIC?
*/
tvb_ensure_bytes_exist(tvb, offset, length); /* make sure length = -1 fails */
strbuf = (guint8 *)wmem_alloc(scope, length + 1);
if (length != 0) {
ptr = ensure_contiguous(tvb, offset, length);
memcpy(strbuf, ptr, length);
EBCDIC_to_ASCII(strbuf, length);
}
strbuf[length] = '\0';
break;
}
return strbuf;
}
@ -2298,7 +2347,7 @@ tvb_get_const_stringz(tvbuff_t *tvb, const gint offset, gint *lengthp)
* Version of tvb_get_stringz() that handles the Basic Multilingual Plane
* (plane 0) of Unicode, with each code point encoded in 16 bits.
*
* Encoding paramter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
*
* Returns an allocated UTF-8 string and updates lengthp pointer with length of string (in bytes)
*
@ -2310,7 +2359,7 @@ static gchar *
tvb_get_ucs_2_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
gunichar2 uchar;
gint size; /* Number of UTF-16 characters */
gint size; /* Number of bytes in string */
gint i; /* Byte counter for tvbuff */
wmem_strbuf_t *strbuf;
@ -2357,6 +2406,52 @@ tvb_get_utf_16_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset
return (gchar*)wmem_strbuf_get_str(strbuf);
}
/*
 * UCS-4 flavour of tvb_get_stringz(): reads 32-bit code points starting
 * at 'offset' up to and including the first all-zero one, and returns
 * them as a UTF-8 string.
 *
 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
 *
 * If lengthp is non-NULL it is set to the number of bytes consumed from
 * the tvbuff, which includes the 4-byte terminator.
 *
 * XXX - needs to map values that are not valid Unicode characters (such as,
 * I think, values used as the components of a UTF-16 surrogate pair) to a
 * "substitute" UTF-8 character.
 */
static gchar *
tvb_get_ucs_4_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
	gint           nbytes = 0;  /* Bytes in string, terminator included */
	gint           pos;         /* Byte position within the string */
	gunichar       code_point;
	wmem_strbuf_t *result;

	DISSECTOR_ASSERT(tvb && tvb->initialized);

	/*
	 * First pass: find the terminator.  A zero word is zero in either
	 * byte order, so a big-endian fetch is good enough here.
	 */
	for (;;) {
		code_point = tvb_get_ntohl(tvb, offset + nbytes);
		nbytes += 4;
		if (code_point == 0)
			break;
	}

	result = wmem_strbuf_new(scope, NULL);

	/* Second pass: decode every code point, terminator included. */
	for (pos = 0; pos < nbytes; pos += 4) {
		code_point = (encoding == ENC_BIG_ENDIAN)
		    ? tvb_get_ntohl(tvb, offset + pos)
		    : tvb_get_letohl(tvb, offset + pos);

		wmem_strbuf_append_unichar(result, code_point);
	}

	if (lengthp)
		*lengthp = pos; /* Number of *bytes* processed */

	return (gchar*)wmem_strbuf_get_str(result);
}
guint8 *
tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp, const guint encoding)
{
@ -2400,22 +2495,9 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
encoding & ENC_LITTLE_ENDIAN);
break;
case ENC_EBCDIC:
/*
* XXX - do the copy and conversion in one pass.
*
* XXX - multiple "dialects" of EBCDIC?
*/
size = tvb_strsize(tvb, offset);
strptr = (guint8 *)wmem_alloc(scope, size);
tvb_memcpy(tvb, strptr, offset, size);
EBCDIC_to_ASCII(strptr, size);
if (lengthp)
*lengthp = size;
break;
case ENC_WINDOWS_1250:
strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_cp1250);
case ENC_UCS_4:
strptr = tvb_get_ucs_4_stringz(scope, tvb, offset, lengthp,
encoding & ENC_LITTLE_ENDIAN);
break;
case ENC_ISO_8859_1:
@ -2482,6 +2564,24 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
case ENC_ISO_8859_16:
strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_iso_8859_16);
break;
case ENC_WINDOWS_1250:
strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_cp1250);
break;
case ENC_EBCDIC:
/*
* XXX - do the copy and conversion in one pass.
*
* XXX - multiple "dialects" of EBCDIC?
*/
size = tvb_strsize(tvb, offset);
strptr = (guint8 *)wmem_alloc(scope, size);
tvb_memcpy(tvb, strptr, offset, size);
EBCDIC_to_ASCII(strptr, size);
if (lengthp)
*lengthp = size;
break;
}
return strptr;