forked from osmocom/wireshark
wsutil: Rewrite ws_utf8_char_len() using a lookup table
Rewrite for speed and correctness. This implementation is more strict with invalid first bytes (continuation bytes, invalid codepoints and some overlong sequences). Returns 0 instead of -1 for invalid bytes.
This commit is contained in:
parent
eea68c7721
commit
3de62e588f
|
@ -12,18 +12,24 @@
|
|||
|
||||
#include "unicode-utils.h"
|
||||
|
||||
int
|
||||
ws_utf8_char_len(guint8 ch)
|
||||
{
|
||||
if (ch >= 0xfe) return -1;
|
||||
if (ch >= 0xfc) return 6;
|
||||
if (ch >= 0xf8) return 5;
|
||||
if (ch >= 0xf0) return 4;
|
||||
if (ch >= 0xe0) return 3;
|
||||
if (ch >= 0xc0) return 2;
|
||||
else return 1;
|
||||
}
|
||||
|
||||
/*
 * UTF-8 sequence length keyed by the value of the first byte.
 * An entry of 0 marks a byte that is not accepted as a first byte:
 * 0x80..0xbf, 0xc0, 0xc1 and 0xf5..0xff.
 */
#define WS_UTF8_SEQLEN_X16(len) \
    len,len,len,len,len,len,len,len,len,len,len,len,len,len,len,len

int ws_utf8_seqlen[256] = {
    /* 0x00...0x7f: single-byte sequences */
    WS_UTF8_SEQLEN_X16(1),
    WS_UTF8_SEQLEN_X16(1),
    WS_UTF8_SEQLEN_X16(1),
    WS_UTF8_SEQLEN_X16(1),
    WS_UTF8_SEQLEN_X16(1),
    WS_UTF8_SEQLEN_X16(1),
    WS_UTF8_SEQLEN_X16(1),
    WS_UTF8_SEQLEN_X16(1),
    /* 0x80...0xbf: never a first byte */
    WS_UTF8_SEQLEN_X16(0),
    WS_UTF8_SEQLEN_X16(0),
    WS_UTF8_SEQLEN_X16(0),
    WS_UTF8_SEQLEN_X16(0),
    /* 0xc0...0xcf: 0xc0 and 0xc1 are rejected, the rest start 2-byte sequences */
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xd0...0xdf */
    WS_UTF8_SEQLEN_X16(2),
    /* 0xe0...0xef */
    WS_UTF8_SEQLEN_X16(3),
    /* 0xf0...0xff: 0xf0..0xf4 start 4-byte sequences, 0xf5..0xff are rejected */
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
};

#undef WS_UTF8_SEQLEN_X16
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
|
|
|
@ -59,8 +59,16 @@ extern "C" {
|
|||
} while (0)
|
||||
|
||||
|
||||
WS_DLL_PUBLIC
|
||||
int ws_utf8_char_len(guint8 ch);
|
||||
WSUTIL_EXPORT
|
||||
int ws_utf8_seqlen[256];
|
||||
|
||||
/** Given the first byte in an UTF-8 encoded code point,
 * return the length of the multibyte sequence, or *ZERO*
 * if the byte is invalid as the first byte in a multibyte
 * sequence.
 *
 * The argument is converted to unsigned char before it is used as the
 * table index, so a plain (possibly signed) char holding a byte >= 0x80
 * selects entries 0x80..0xff instead of producing a negative index.
 * The argument is evaluated exactly once.
 */
#define ws_utf8_char_len(ch)  (ws_utf8_seqlen[(unsigned char)(ch)])
|
||||
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
|
|
Loading…
Reference in New Issue