Add support for the ISO 646 "Basic code table" encoding.

The "Basic code table" in ISO 646 is mostly ASCII, but some code points
either 1) have more than one glyph that can be assigned to them or 2)
have no glyph assigned to them.  National versions choose one of the two
glyphs for the code points in group 1) and assign specific glyphs to the
code points in group 2); the International Reference Version assigns the
same glyphs to those code points as does ASCII.

For the "Basic code table" encoding, we map the code points in groups 1)
and 2) to a REPLACEMENT CHARACTER; additional encodings can be added for
the national versions.

Add ENC_ISO_646_IRV (International Reference Version) as an alias for
ENC_ASCII.

Expand some comments, and add some comments, while we're at it.

Change-Id: I4f1b5e426ec193775e919731c5cae1224dc65115
Reviewed-on: https://code.wireshark.org/review/33941
Petri-Dish: Guy Harris <guy@alum.mit.edu>
Tested-by: Petri Dish Buildbot
Reviewed-by: Guy Harris <guy@alum.mit.edu>
This commit is contained in:
Guy Harris 2019-07-14 20:18:14 -07:00
parent 258a5f6a17
commit e26e0b4de0
5 changed files with 197 additions and 9 deletions

View File

@ -812,6 +812,7 @@ libwireshark.so.0 libwireshark0 #MINVER#
get_ipv4_hash_table@Base 1.12.0~rc1
get_ipv6_hash_table@Base 1.12.0~rc1
get_ipxnet_hash_table@Base 1.12.0~rc1
get_iso_646_string@Base 3.1.0
get_key_string@Base 1.9.1
get_mac_lte_proto_data@Base 1.9.1
get_mac_nr_proto_data@Base 2.5.2

View File

@ -80,6 +80,59 @@ get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
return (guint8 *) wmem_strbuf_finalize(str);
}
/*
* ISO 646 "Basic code table".
*
* Indexed by a 7-bit octet value (0x00 - 0x7F).  Every entry holds the
* identically-numbered code point, except for the positions set to
* UNREPL: 0x23, 0x24, 0x40, 0x5B - 0x5E, 0x60 and 0x7B - 0x7E.
*
* Those are the code points for which the basic code table either offers
* a choice between two glyphs or assigns no glyph at all; national
* versions pin them down, so a separate table can be added for each
* national version.
*/
const gunichar2 charset_table_iso_646_basic[0x80] = {
0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, /* 0x00 - */
0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* - 0x0F */
0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, /* 0x10 - */
0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* - 0x1F */
0x0020, 0x0021, 0x0022, UNREPL, UNREPL, 0x0025, 0x0026, 0x0027, /* 0x20 - */
0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* - 0x2F */
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0x30 - */
0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* - 0x3F */
UNREPL, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0x40 - */
0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* - 0x4F */
0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 0x50 - */
0x0058, 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, 0x005f, /* - 0x5F */
UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x60 - */
0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* - 0x6F */
0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 0x70 - */
0x0078, 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, 0x007f, /* - 0x7F */
};
/*
* Given a wmem scope, a pointer, a length, and a translation table,
* treat the string of bytes referred to by the pointer and length as a
* string encoded using one octet per character, with octets with the
* high-order bit clear being mapped by the translation table to 2-byte
* Unicode Basic Multilingual Plane characters (including REPLACEMENT
* CHARACTER) and octets with the high-order bit set being mapped to
* REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
* allocated using the wmem scope.
*/
guint8 *
get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80])
{
    /* Output buffer, created in the caller's scope and sized from the input length. */
    wmem_strbuf_t *utf8_buf = wmem_strbuf_sized_new(scope, length+1, 0);
    const guint8 *cursor = ptr;
    gint remaining;

    /* Consume the input one octet at a time; a non-positive length yields an empty string. */
    for (remaining = length; remaining > 0; remaining--, cursor++) {
        const guint8 octet = *cursor;

        /*
         * Octets below 0x80 are looked up in the caller-supplied table;
         * octets with the high-order bit set have no entry there and
         * are emitted as UNREPL instead.
         */
        wmem_strbuf_append_unichar(utf8_buf, (octet < 0x80) ? table[octet] : UNREPL);
    }

    return (guint8 *) wmem_strbuf_finalize(utf8_buf);
}
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as an ISO 8859/1 string, and
@ -577,11 +630,11 @@ const gunichar2 charset_table_cp866[0x80] = {
};
/*
* Given a wmem scope, a pointer, a length, and a translation table,
* treat the string of bytes referred to by the pointer and length as a
* string encoded using one octet per character, with octets with the
* high-order bit clear being ASCII and octets with the high-order bit
* set being mapped by the translation table to 2-byte Unicode Basic
* Given a wmem scope, a pointer, a length, and a translation table with
* 128 entries, treat the string of bytes referred to by the pointer and
* length as a string encoded using one octet per character, with octets
* with the high-order bit clear being ASCII and octets with the high-order
* bit set being mapped by the translation table to 2-byte Unicode Basic
* Multilingual Plane characters (including REPLACEMENT CHARACTER), and
* return a pointer to a UTF-8 string, allocated using the wmem scope.
*/
@ -1196,10 +1249,10 @@ const gunichar2 charset_table_ebcdic_cp037[256] = {
};
/*
* Given a wmem scope, a pointer, a length, and a translation table,
* treat the string of bytes referred to by the pointer and length as a
* string encoded using one octet per character, with octets being
* mapped by the translation table to 2-byte Unicode Basic Multilingual
* Given a wmem scope, a pointer, a length, and a translation table with
* 256 entries, treat the string of bytes referred to by the pointer and
* length as a string encoded using one octet per character, with octets
* being mapped by the translation table to 2-byte Unicode Basic Multilingual
* Plane characters (including REPLACEMENT CHARACTER), and return a
* pointer to a UTF-8 string, allocated using the wmem scope.
*/

View File

@ -53,6 +53,12 @@ extern const gunichar2 charset_table_cp437[0x80];
extern const gunichar2 charset_table_cp855[0x80];
extern const gunichar2 charset_table_cp866[0x80];
/*
* Translation tables that map the lower 128 code points in single-byte
* ISO 646-based character encodings to Unicode code points in the
* Basic Multilingual Plane.
*/
extern const gunichar2 charset_table_iso_646_basic[0x80];
/* Tables for EBCDIC code pages */
extern const gunichar2 charset_table_ebcdic[256];
@ -70,18 +76,92 @@ extern const gunichar2 charset_table_ebcdic_cp037[256];
WS_DLL_PUBLIC guint8 *
get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
/*
* Given a wmem scope, a pointer, a length, and a translation table,
* treat the string of bytes referred to by the pointer and length as a
* string encoded using one octet per character, with octets with the
* high-order bit clear being mapped by the translation table to 2-byte
* Unicode Basic Multilingual Plane characters (including REPLACEMENT
* CHARACTER) and octets with the high-order bit set being mapped to
* REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
* allocated using the wmem scope.
*/
WS_DLL_PUBLIC guint8 *
get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as an ISO 8859/1 string, and
* return a pointer to a UTF-8 string, allocated using the wmem scope.
*/
WS_DLL_PUBLIC guint8 *
get_8859_1_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
/*
* Given a wmem scope, a pointer, a length, and a translation table with
* 128 entries, treat the string of bytes referred to by the pointer and
* length as a string encoded using one octet per character, with octets
* with the high-order bit clear being ASCII and octets with the high-order
* bit set being mapped by the translation table to 2-byte Unicode Basic
* Multilingual Plane characters (including REPLACEMENT CHARACTER), and
* return a pointer to a UTF-8 string, allocated using the wmem scope.
*/
WS_DLL_PUBLIC guint8 *
get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as a UCS-2 encoded string
* containing characters from the Basic Multilingual Plane (plane 0) of
* Unicode, and return a pointer to a UTF-8 string, allocated with the
* wmem scope.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes.
*
* XXX - should map lead and trail surrogate values to REPLACEMENT
* CHARACTERs (0xFFFD)?
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
WS_DLL_PUBLIC guint8 *
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as a UTF-16 encoded string, and
* return a pointer to a UTF-8 string, allocated with the wmem scope.
*
* See RFC 2781 section 2.2.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes.
*
* XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
* XXX - if there are an odd number of bytes, should put a
* REPLACEMENT CHARACTER at the end.
*/
WS_DLL_PUBLIC guint8 *
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as a UCS-4 encoded string, and
* return a pointer to a UTF-8 string, allocated with the wmem scope.
*
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
*
* Specify length in bytes.
*
* XXX - should map lead and trail surrogate values to a "substitute"
* UTF-8 character?
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
* XXX - if the number of bytes isn't a multiple of 4, should put a
* REPLACEMENT CHARACTER at the end.
*/
WS_DLL_PUBLIC guint8 *
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
@ -93,6 +173,14 @@ WS_DLL_PUBLIC guint8 *
get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
const gint bit_offset, gint no_of_chars);
/*
* Given a wmem scope, a pointer, a length, and a translation table with
* 256 entries, treat the string of bytes referred to by the pointer and
* length as a string encoded using one octet per character, with octets
* being mapped by the translation table to 2-byte Unicode Basic Multilingual
* Plane characters (including REPLACEMENT CHARACTER), and return a
* pointer to a UTF-8 string, allocated using the wmem scope.
*/
WS_DLL_PUBLIC guint8 *
get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256]);

View File

@ -455,6 +455,7 @@ void proto_report_dissector_bug(const char *format, ...)
*/
#define ENC_CHARENCODING_MASK 0x3FFFFFFE /* mask out byte-order bits and Zigbee bits */
#define ENC_ASCII 0x00000000
#define ENC_ISO_646_IRV ENC_ASCII /* ISO 646 International Reference Version = ASCII */
#define ENC_UTF_8 0x00000002
#define ENC_UTF_16 0x00000004
#define ENC_UCS_2 0x00000006
@ -487,6 +488,7 @@ void proto_report_dissector_bug(const char *format, ...)
#define ENC_WINDOWS_1251 0x0000003C
#define ENC_CP855 0x0000003E
#define ENC_CP866 0x00000040
#define ENC_ISO_646_BASIC 0x00000042
/*
* TODO:
*

View File

@ -2502,6 +2502,28 @@ tvb_get_ascii_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint l
return get_ascii_string(scope, ptr, length);
}
/*
* Given a wmem scope, a tvbuff, an offset, a length, and a translation table,
* treat the string of bytes referred to by the tvbuff, offset, and length
* as a string encoded using one octet per character, with octets with the
* high-order bit clear being mapped by the translation table to 2-byte
* Unicode Basic Multilingual Plane characters (including REPLACEMENT
* CHARACTER) and octets with the high-order bit set being mapped to
* REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
* allocated using the wmem scope.
*/
static guint8 *
tvb_get_iso_646_string(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint length, const gunichar2 table[0x80])
{
	/*
	 * Ask ensure_contiguous() for a pointer to the length bytes at
	 * offset in the tvbuff, then let the table-driven converter build
	 * the UTF-8 result in the given scope.
	 */
	const guint8 *octets = ensure_contiguous(tvb, offset, length);

	return get_iso_646_string(scope, octets, length, table);
}
/*
* Given a wmem scope, a tvbuff, an offset, and a length, treat the string
* of bytes referred to by the tvbuff, the offset, and the length as a UTF-8
@ -2870,6 +2892,10 @@ tvb_get_string_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset,
strptr = tvb_get_string_unichar2(scope, tvb, offset, length, charset_table_cp866);
break;
case ENC_ISO_646_BASIC:
strptr = tvb_get_iso_646_string(scope, tvb, offset, length, charset_table_iso_646_basic);
break;
case ENC_3GPP_TS_23_038_7BITS:
{
gint bit_offset = offset << 3;
@ -2950,6 +2976,20 @@ tvb_get_ascii_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint
return get_ascii_string(scope, ptr, size);
}
/*
 * Variant of the ISO 646 string fetch for strings whose extent is not
 * given by the caller: the size comes from tvb_strsize() at the given
 * offset.  If lengthp is non-null, that size is stored through it.
 * The bytes are converted with the supplied 128-entry table and the
 * result is returned as a UTF-8 string allocated in the wmem scope.
 */
static guint8 *
tvb_get_iso_646_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, gint offset, gint *lengthp, const gunichar2 table[0x80])
{
	/* Determine the size first; the byte pointer is fetched for exactly that size. */
	const guint   nbytes = tvb_strsize(tvb, offset);
	const guint8 *octets = ensure_contiguous(tvb, offset, nbytes);

	/* The caller may pass a null lengthp when it does not need the size. */
	if (lengthp) {
		/* XXX, conversion between signed/unsigned integer */
		*lengthp = nbytes;
	}

	return get_iso_646_string(scope, octets, nbytes, table);
}
static guint8 *
tvb_get_utf_8_stringz(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, gint *lengthp)
{
@ -3236,6 +3276,10 @@ tvb_get_stringz_enc(wmem_allocator_t *scope, tvbuff_t *tvb, const gint offset, g
strptr = tvb_get_stringz_unichar2(scope, tvb, offset, lengthp, charset_table_cp866);
break;
case ENC_ISO_646_BASIC:
strptr = tvb_get_iso_646_stringz(scope, tvb, offset, lengthp, charset_table_iso_646_basic);
break;
case ENC_3GPP_TS_23_038_7BITS:
REPORT_DISSECTOR_BUG("TS 23.038 7bits has no null character and doesn't support null-terminated strings");
break;