2021-11-26 03:31:05 +00:00
|
|
|
/** @file
|
2004-09-10 22:59:37 +00:00
|
|
|
* Routines for handling character sets
|
|
|
|
*
|
2006-05-21 05:12:17 +00:00
|
|
|
* Wireshark - Network traffic analyzer
|
|
|
|
* By Gerald Combs <gerald@wireshark.org>
|
2004-09-10 22:59:37 +00:00
|
|
|
* Copyright 1998 Gerald Combs
|
|
|
|
*
|
2018-02-08 16:59:17 +00:00
|
|
|
* SPDX-License-Identifier: GPL-2.0-or-later
|
2004-09-10 22:59:37 +00:00
|
|
|
*/
|
|
|
|
#ifndef __CHARSETS_H__
|
|
|
|
#define __CHARSETS_H__
|
|
|
|
|
2013-03-01 23:53:11 +00:00
|
|
|
#include "ws_symbol_export.h"
|
|
|
|
|
2004-09-10 22:59:37 +00:00
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif /* __cplusplus */
|
|
|
|
|
2013-12-07 22:45:37 +00:00
|
|
|
/*
|
|
|
|
* Translation tables that map the upper 128 code points in single-byte
|
|
|
|
* "extended ASCII" character encodings to Unicode code points in the
|
|
|
|
* Basic Multilingual Plane.
|
|
|
|
*/
|
|
|
|
|
2013-12-07 10:10:03 +00:00
|
|
|
/* Table for windows-1250 */
|
|
|
|
extern const gunichar2 charset_table_cp1250[0x80];
|
2019-01-03 07:07:02 +00:00
|
|
|
/* Table for windows-1251 */
|
|
|
|
extern const gunichar2 charset_table_cp1251[0x80];
|
2019-05-25 00:39:54 +00:00
|
|
|
/* Table for windows-1252 */
|
|
|
|
extern const gunichar2 charset_table_cp1252[0x80];
|
2013-12-07 10:10:03 +00:00
|
|
|
|
2013-12-07 15:02:55 +00:00
|
|
|
/* Tables for ISO-8859-X */
|
|
|
|
extern const gunichar2 charset_table_iso_8859_2[0x80];
|
2013-12-21 21:55:46 +00:00
|
|
|
extern const gunichar2 charset_table_iso_8859_3[0x80];
|
|
|
|
extern const gunichar2 charset_table_iso_8859_4[0x80];
|
2013-12-15 19:13:31 +00:00
|
|
|
extern const gunichar2 charset_table_iso_8859_5[0x80];
|
2013-12-21 21:55:46 +00:00
|
|
|
extern const gunichar2 charset_table_iso_8859_6[0x80];
|
|
|
|
extern const gunichar2 charset_table_iso_8859_7[0x80];
|
|
|
|
extern const gunichar2 charset_table_iso_8859_8[0x80];
|
2013-12-18 23:32:06 +00:00
|
|
|
extern const gunichar2 charset_table_iso_8859_9[0x80];
|
2013-12-21 21:55:46 +00:00
|
|
|
extern const gunichar2 charset_table_iso_8859_10[0x80];
|
|
|
|
extern const gunichar2 charset_table_iso_8859_11[0x80];
|
|
|
|
extern const gunichar2 charset_table_iso_8859_13[0x80];
|
|
|
|
extern const gunichar2 charset_table_iso_8859_14[0x80];
|
|
|
|
extern const gunichar2 charset_table_iso_8859_15[0x80];
|
|
|
|
extern const gunichar2 charset_table_iso_8859_16[0x80];
|
2013-12-07 15:02:55 +00:00
|
|
|
|
2014-04-12 08:53:33 +00:00
|
|
|
/* Tables for Mac character sets */
|
|
|
|
extern const gunichar2 charset_table_mac_roman[0x80];
|
|
|
|
|
|
|
|
/* Tables for DOS code pages */
|
|
|
|
extern const gunichar2 charset_table_cp437[0x80];
|
2019-01-03 07:07:02 +00:00
|
|
|
extern const gunichar2 charset_table_cp855[0x80];
|
|
|
|
extern const gunichar2 charset_table_cp866[0x80];
|
|
|
|
|
2019-07-15 03:18:14 +00:00
|
|
|
/*
|
|
|
|
* Translation tables that map the lower 128 code points in single-byte
|
|
|
|
* ISO 646-based character encodings to Unicode code points in the
|
|
|
|
* Basic Multilingual Plane.
|
|
|
|
*/
|
|
|
|
extern const gunichar2 charset_table_iso_646_basic[0x80];
|
2014-04-12 08:53:33 +00:00
|
|
|
|
2016-12-12 05:49:14 +00:00
|
|
|
/* Tables for EBCDIC code pages */
|
|
|
|
extern const gunichar2 charset_table_ebcdic[256];
|
|
|
|
extern const gunichar2 charset_table_ebcdic_cp037[256];
|
|
|
|
|
2014-04-25 08:31:08 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, and a length, treat the string of bytes
|
|
|
|
* referred to by the pointer and length as an ASCII string, with all bytes
|
|
|
|
* with the high-order bit set being invalid, and return a pointer to a
|
|
|
|
* UTF-8 string, allocated using the wmem scope.
|
|
|
|
*
|
|
|
|
* Octets with the highest bit set will be converted to the Unicode
|
|
|
|
* REPLACEMENT CHARACTER.
|
|
|
|
*/
|
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
|
|
|
|
|
2020-09-17 19:27:26 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, and a length, treat the string of bytes
|
|
|
|
* referred to by the pointer and length as a UTF-8 string, and return a
|
|
|
|
* pointer to a UTF-8 string, allocated using the wmem scope, with all
|
|
|
|
* ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
|
|
|
|
* according to the recommended "best practices" given in the Unicode
|
|
|
|
* Standard and specified by W3C/WHATWG.
|
|
|
|
*/
|
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
|
|
|
|
|
2019-07-15 03:18:14 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, a length, and a translation table,
|
|
|
|
* treat the string of bytes referred to by the pointer and length as a
|
|
|
|
* string encoded using one octet per character, with octets with the
|
|
|
|
* high-order bit clear being mapped by the translation table to 2-byte
|
|
|
|
* Unicode Basic Multilingual Plane characters (including REPLACEMENT
|
|
|
|
* CHARACTER) and octets with the high-order bit set being mapped to
|
|
|
|
* REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
|
|
|
|
* allocated using the wmem scope.
|
|
|
|
*/
|
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, and a length, treat the string of bytes
|
|
|
|
* referred to by the pointer and length as an ISO 8859/1 string, and
|
|
|
|
* return a pointer to a UTF-8 string, allocated using the wmem scope.
|
|
|
|
*/
|
2014-04-25 08:31:08 +00:00
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_8859_1_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
|
|
|
|
|
2019-07-15 03:18:14 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, a length, and a translation table with
|
|
|
|
* 128 entries, treat the string of bytes referred to by the pointer and
|
|
|
|
* length as a string encoded using one octet per character, with octets
|
|
|
|
* with the high-order bit clear being ASCII and octets with the high-order
|
|
|
|
* bit set being mapped by the translation table to 2-byte Unicode Basic
|
|
|
|
* Multilingual Plane characters (including REPLACEMENT CHARACTER), and
|
|
|
|
* return a pointer to a UTF-8 string, allocated using the wmem scope.
|
|
|
|
*/
|
2014-04-25 08:31:08 +00:00
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
|
|
|
|
|
2019-07-15 03:18:14 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, and a length, treat the string of bytes
|
|
|
|
* referred to by the pointer and length as a UCS-2 encoded string
|
|
|
|
* containing characters from the Basic Multilingual Plane (plane 0) of
|
|
|
|
* Unicode, and return a pointer to a UTF-8 string, allocated with the
|
|
|
|
* wmem scope.
|
|
|
|
*
|
|
|
|
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
|
|
|
*
|
|
|
|
* Specify length in bytes.
|
|
|
|
*
|
|
|
|
* XXX - should map lead and trail surrogate values to REPLACEMENT
|
|
|
|
* CHARACTERs (0xFFFD)?
|
|
|
|
* XXX - if there are an odd number of bytes, should put a
|
|
|
|
* REPLACEMENT CHARACTER at the end.
|
|
|
|
*/
|
2014-04-25 08:31:08 +00:00
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
|
|
|
|
2019-07-15 03:18:14 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, and a length, treat the string of bytes
|
|
|
|
* referred to by the pointer and length as a UTF-16 encoded string, and
|
|
|
|
* return a pointer to a UTF-8 string, allocated with the wmem scope.
|
|
|
|
*
|
|
|
|
* See RFC 2781 section 2.2.
|
|
|
|
*
|
|
|
|
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
|
|
|
*
|
|
|
|
* Specify length in bytes.
|
|
|
|
*
|
|
|
|
* XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
|
|
|
|
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
|
|
|
|
* XXX - if there are an odd number of bytes, should put a
|
|
|
|
* REPLACEMENT CHARACTER at the end.
|
|
|
|
*/
|
2014-04-25 08:31:08 +00:00
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
|
|
|
|
2019-07-15 03:18:14 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, and a length, treat the string of bytes
|
|
|
|
* referred to by the pointer and length as a UCS-4 encoded string, and
|
|
|
|
* return a pointer to a UTF-8 string, allocated with the wmem scope.
|
|
|
|
*
|
|
|
|
* Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
|
|
|
|
*
|
|
|
|
* Specify length in bytes.
|
|
|
|
*
|
|
|
|
* XXX - should map lead and trail surrogate values to a "substitute"
|
|
|
|
* UTF-8 character?
|
|
|
|
* XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
|
|
|
|
* XXX - if the number of bytes isn't a multiple of 4, should put a
|
|
|
|
* REPLACEMENT CHARACTER at the end.
|
|
|
|
*/
|
2014-04-25 08:31:08 +00:00
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
|
|
|
|
|
|
|
|
/*
* Given a wmem scope, a pointer, a bit offset, and a character count,
* return a pointer to a guint8 string.
*
* NOTE(review): judging by the name and by the other get_*_string()
* routines declared in this header, this presumably decodes no_of_chars
* packed 7-bit characters in the 3GPP TS 23.038 default alphabet,
* starting bit_offset bits into the data at ptr, into a UTF-8 string
* allocated using the wmem scope - confirm against the implementation.
*/
WS_DLL_PUBLIC guint8 *
|
2020-09-28 20:16:17 +00:00
|
|
|
get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const guint8 *ptr,
|
2014-08-05 20:10:44 +00:00
|
|
|
const gint bit_offset, gint no_of_chars);
|
2014-04-25 08:43:24 +00:00
|
|
|
|
2020-09-28 20:16:17 +00:00
|
|
|
/*
* Given a wmem scope, a pointer, and a length, return a pointer to a
* guint8 string.
*
* NOTE(review): judging by the name and by the other get_*_string()
* routines declared in this header, this presumably treats the bytes as
* unpacked (one character per octet) 3GPP TS 23.038 default alphabet
* characters and returns a UTF-8 string allocated using the wmem
* scope - confirm against the implementation.
*/
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const guint8 *ptr,
|
|
|
|
gint length);
|
|
|
|
|
|
|
|
/*
* Given a wmem scope, a pointer, and a length, return a pointer to a
* guint8 string.
*
* NOTE(review): judging by the name and by the other get_*_string()
* routines declared in this header, this presumably decodes a string
* coded as described in ETSI TS 102 221 Annex A into a UTF-8 string
* allocated using the wmem scope - confirm against the implementation.
*/
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
|
|
|
|
gint length);
|
|
|
|
|
2014-04-25 08:31:08 +00:00
|
|
|
/*
* Given a wmem scope, a pointer, a bit offset, and a character count,
* return a pointer to a guint8 string.
*
* NOTE(review): judging by the name and by the other get_*_string()
* routines declared in this header, this presumably decodes no_of_chars
* packed 7-bit ASCII characters, starting bit_offset bits into the data
* at ptr, into a UTF-8 string allocated using the wmem scope - confirm
* against the implementation.
*/
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
|
2014-08-05 20:10:44 +00:00
|
|
|
const gint bit_offset, gint no_of_chars);
|
2014-04-25 08:31:08 +00:00
|
|
|
|
2019-07-15 03:18:14 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, a length, and a translation table with
|
|
|
|
* 256 entries, treat the string of bytes referred to by the pointer and
|
|
|
|
* length as a string encoded using one octet per character, with octets
|
|
|
|
* being mapped by the translation table to 2-byte Unicode Basic Multilingual
|
|
|
|
* Plane characters (including REPLACEMENT CHARACTER), and return a
|
|
|
|
* pointer to a UTF-8 string, allocated using the wmem scope.
|
|
|
|
*/
|
2014-04-25 09:29:42 +00:00
|
|
|
WS_DLL_PUBLIC guint8 *
|
2016-12-12 08:19:44 +00:00
|
|
|
get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256]);
|
2014-04-25 09:29:42 +00:00
|
|
|
|
2020-10-18 23:28:01 +00:00
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, and a length, treat the bytes referred to
|
|
|
|
* by the pointer and length as a GB18030 encoded string, and return a pointer
|
|
|
|
* to a UTF-8 string, allocated using the wmem scope, with ill-formed
|
|
|
|
* sequences replaced by REPLACEMENT CHARACTER according to the Unicode
|
|
|
|
* Standard, section 5.22 "U+FFFD Substitution in Conversion".
|
|
|
|
* ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
|
|
|
|
*
|
|
|
|
* As expected, this will also decode GBK and GB2312 strings.
|
|
|
|
*/
|
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Given a wmem scope, a pointer, and a length, treat the bytes referred to
|
|
|
|
* by the pointer and length as an EUC-KR encoded string, and return a pointer
|
|
|
|
* to a UTF-8 string, allocated using the wmem scope, with ill-formed
|
|
|
|
* sequences replaced by REPLACEMENT CHARACTER according to the Unicode
|
|
|
|
* Standard, section 5.22 "U+FFFD Substitution in Conversion".
|
|
|
|
* ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
|
|
|
|
*/
|
|
|
|
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
|
|
|
|
|
2016-10-21 08:10:06 +00:00
|
|
|
/*
* Given a wmem scope, a pointer, and a length, return a pointer to a
* guint8 string.
*
* NOTE(review): judging by the name and by the other get_*_string()
* routines declared in this header, this presumably treats the bytes as
* a T.61 encoded string and returns a UTF-8 string allocated using the
* wmem scope - confirm against the implementation.
*/
WS_DLL_PUBLIC guint8 *
|
|
|
|
get_t61_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
|
|
|
|
|
2014-04-25 09:29:42 +00:00
|
|
|
/*
* The ASCII-to-EBCDIC declarations below are excluded from compilation
* by the enclosing "#if 0", so they are not part of this header's
* interface as it stands.
*/
#if 0
|
|
|
|
void ASCII_to_EBCDIC(guint8 *buf, guint bytes);
|
|
|
|
guint8 ASCII_to_EBCDIC1(guint8 c);
|
|
|
|
#endif
|
|
|
|
/*
* Given a writable buffer and a byte count, operate on the buffer;
* nothing is returned.
*
* NOTE(review): judging by the name and the non-const buffer, this
* presumably converts the first "bytes" octets of buf from EBCDIC to
* ASCII in place - confirm against the implementation.
*/
WS_DLL_PUBLIC
|
|
|
|
void EBCDIC_to_ASCII(guint8 *buf, guint bytes);
|
|
|
|
/*
* Given a single octet, return a single octet.
*
* NOTE(review): judging by the name, this presumably returns the ASCII
* equivalent of the EBCDIC octet c - confirm against the implementation.
*/
WS_DLL_PUBLIC
|
|
|
|
guint8 EBCDIC_to_ASCII1(guint8 c);
|
|
|
|
|
2004-09-10 22:59:37 +00:00
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif /* __cplusplus */
|
|
|
|
|
|
|
|
#endif /* __CHARSETS_H__ */
|
2013-12-09 20:58:57 +00:00
|
|
|
|
|
|
|
/*
|
2019-07-26 18:43:17 +00:00
|
|
|
* Editor modelines - https://www.wireshark.org/tools/modelines.html
|
2013-12-09 20:58:57 +00:00
|
|
|
*
|
|
|
|
* Local variables:
|
|
|
|
* c-basic-offset: 4
|
|
|
|
* tab-width: 8
|
|
|
|
* indent-tabs-mode: nil
|
|
|
|
* End:
|
|
|
|
*
|
|
|
|
* vi: set shiftwidth=4 tabstop=8 expandtab:
|
|
|
|
* :indentSize=4:tabSize=8:noTabs=true:
|
|
|
|
*/
|