wsutil: Rewrite ws_utf8_char_len() using a lookup table

Rewrite for speed and correctness.

This implementation is stricter about invalid first bytes: it rejects
continuation bytes (0x80-0xBF), bytes that can only start invalid
codepoints (0xF5-0xFF) and some overlong sequences (0xC0, 0xC1).

Returns 0 instead of -1 for invalid bytes.
This commit is contained in:
João Valverde 2022-10-14 10:25:00 +01:00
parent eea68c7721
commit 3de62e588f
2 changed files with 28 additions and 14 deletions

View File

@ -12,18 +12,24 @@
#include "unicode-utils.h"
/*
 * Return the length in bytes of a UTF-8 sequence, judged only from its
 * first byte.
 *
 * Bytes 0xfe and 0xff yield -1. Every byte below 0xc0 yields 1; that
 * includes the range 0x80..0xbf as well as 0x00..0x7f.
 */
int
ws_utf8_char_len(guint8 ch)
{
    int len;

    if (ch < 0xc0) {
        len = 1;
    } else if (ch < 0xe0) {
        len = 2;
    } else if (ch < 0xf0) {
        len = 3;
    } else if (ch < 0xf8) {
        len = 4;
    } else if (ch < 0xfc) {
        len = 5;
    } else if (ch < 0xfe) {
        len = 6;
    } else {
        len = -1;
    }

    return len;
}
/*
 * UTF-8 sequence length, indexed by the value of the first byte of the
 * sequence. An entry of 0 marks a byte value that the table does not
 * accept as the first byte of a sequence.
 */

/* Sixteen identical entries: one full 0xN0...0xNf row of the table. */
#define UTF8_SEQLEN_ROW(len) \
    len,len,len,len,len,len,len,len,len,len,len,len,len,len,len,len

int ws_utf8_seqlen[256] = {
    UTF8_SEQLEN_ROW(1),                 /* 0x00...0x0f */
    UTF8_SEQLEN_ROW(1),                 /* 0x10...0x1f */
    UTF8_SEQLEN_ROW(1),                 /* 0x20...0x2f */
    UTF8_SEQLEN_ROW(1),                 /* 0x30...0x3f */
    UTF8_SEQLEN_ROW(1),                 /* 0x40...0x4f */
    UTF8_SEQLEN_ROW(1),                 /* 0x50...0x5f */
    UTF8_SEQLEN_ROW(1),                 /* 0x60...0x6f */
    UTF8_SEQLEN_ROW(1),                 /* 0x70...0x7f */
    UTF8_SEQLEN_ROW(0),                 /* 0x80...0x8f */
    UTF8_SEQLEN_ROW(0),                 /* 0x90...0x9f */
    UTF8_SEQLEN_ROW(0),                 /* 0xa0...0xaf */
    UTF8_SEQLEN_ROW(0),                 /* 0xb0...0xbf */
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    /* 0xc0...0xcf (0xc0, 0xc1 rejected) */
    UTF8_SEQLEN_ROW(2),                 /* 0xd0...0xdf */
    UTF8_SEQLEN_ROW(3),                 /* 0xe0...0xef */
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,    /* 0xf0...0xff (0xf5 and up rejected) */
};

#undef UTF8_SEQLEN_ROW
#ifdef _WIN32

View File

@ -59,8 +59,16 @@ extern "C" {
} while (0)
/*
 * Prototype for ws_utf8_char_len(): takes a single byte and returns an int.
 * NOTE(review): a function-like macro with the same name is defined further
 * down in this header -- confirm whether this prototype is still wanted.
 */
WS_DLL_PUBLIC
int ws_utf8_char_len(guint8 ch);
/*
 * Table of 256 ints that the ws_utf8_char_len() macro in this header
 * indexes by byte value.
 * NOTE(review): whether this line is a declaration or a tentative
 * definition depends on how WSUTIL_EXPORT expands (not visible here) --
 * confirm that it supplies 'extern'.
 */
WSUTIL_EXPORT
int ws_utf8_seqlen[256];
/** Given the first byte in a UTF-8 encoded code point,
 * return the length of the multibyte sequence, or *ZERO*
 * if the byte is invalid as the first byte in a multibyte
 * sequence.
 *
 * The argument is converted to unsigned char before it is used as an
 * index, so a plain or signed char holding a byte >= 0x80 selects
 * entries 0x80..0xff instead of producing a negative, out-of-bounds
 * index. The argument is evaluated exactly once.
 */
#define ws_utf8_char_len(ch)    (ws_utf8_seqlen[(unsigned char)(ch)])
#ifdef _WIN32