Move get_utf_8_string() to wsutil

This commit is contained in:
João Valverde 2023-02-06 04:27:05 +00:00
parent 9b797e97a2
commit 9feb85ce4d
3 changed files with 198 additions and 182 deletions

View File

@ -99,191 +99,10 @@ get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
return (guint8 *) wmem_strbuf_finalize(str);
}
/*
 * Scan at most `length` bytes beginning at `start` and measure the leading
 * run of well-formed UTF-8.
 *
 * Scanning stops at the first ill-formed sequence or at the end of the
 * input. On return, *end points immediately past the region that was
 * examined: past the lead byte when the lead byte itself is unacceptable,
 * or at the first byte that failed to continue a multi-byte character
 * (that byte is not consumed). The return value is the number of bytes,
 * counted from `start`, that make up complete, valid characters.
 *
 * Differs from GLib's g_utf8_validate_len() in that NUL bytes are
 * considered valid UTF-8, and in that maximal subparts are treated as a
 * unit: a 2 or 3 byte truncated prefix of a 3 or 4 byte character that is
 * not continued by the next byte is reported as one ill-formed region, so
 * the caller can substitute a single REPLACEMENT CHARACTER for it.
 */
static inline size_t
utf_8_validate(const guint8 *start, gint length, const guint8 **end)
{
    const guint8 *cur = start;
    size_t good = 0;

    while (length > 0) {
        const guint8 lead = *cur;
        size_t seq_len;
        size_t i;
        /* Accepted range for the next trail byte. Only the first trail
         * byte of a sequence can be narrower than 0x80..0xbf. */
        guint8 lo = 0x80;
        guint8 hi = 0xbf;

        if (lead < 0x80) {
            /* 0xxxxxxx: single byte character (NUL included). */
            good++;
            cur++;
            length--;
            continue;
        }

        /* The lead byte is always consumed, acceptable or not. */
        cur++;
        length--;

        if (lead < 0xc2 || lead > 0xf4) {
            /* Not a usable lead byte: report it alone as ill-formed. */
            *end = cur;
            return good;
        }

        if (lead < 0xe0) {
            /* 110xxxxx, 2 byte char */
            seq_len = 2;
        } else if (lead < 0xf0) {
            /* 1110xxxx, 3 byte char */
            seq_len = 3;
            if (lead == 0xe0) {
                lo = 0xa0;
            } else if (lead == 0xed) {
                hi = 0x9f;
            }
        } else {
            /* 11110xxx, 4 byte char - > 0xf4 excluded above */
            seq_len = 4;
            if (lead == 0xf0) {
                lo = 0x90;
            } else if (lead == 0xf4) {
                hi = 0x8f;
            }
        }

        for (i = 1; i < seq_len; i++) {
            /* Out of input, or the next byte does not continue the
             * character: stop without consuming that byte. */
            if (length < 1 || *cur < lo || *cur > hi) {
                *end = cur;
                return good;
            }
            cur++;
            length--;
            lo = 0x80;
            hi = 0xbf;
        }

        good += seq_len;
    }

    *end = cur;
    return good;
}
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as a UTF-8 string, and return a
* pointer to a UTF-8 string, allocated using the wmem scope, with all
* ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
* according to the recommended "best practices" given in the Unicode
* Standard and specified by W3C/WHATWG.
*
* Note that in conformance with the Unicode Standard, this treats three
* byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
* and two byte overlong encodings of 7-bit ASCII characters as invalid and
* substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
* derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
* be added later.
*
* Compared with g_utf8_make_valid(), this function does not consider
* internal NUL bytes as invalid and replace them with replacement characters.
* It also replaces maximal subparts as a unit; i.e., a sequence of 2 or 3
* bytes which are a truncated version of a valid 3 or 4 byte character (but
* the next byte does not continue the character) are replaced with a single
* REPLACEMENT CHARACTER, whereas the Glib function replaces each byte of the
* sequence with its own (3 octet) REPLACEMENT CHARACTER.
*
* XXX: length should probably be a size_t instead of a gint in all
* these encoding functions
* XXX: the buffer returned can be of different length than the input,
* and can have internal NULs as well (so that strlen doesn't give its
* length). As with the other encoding functions, we should return the
* length of the output buffer (or a wmem_strbuf_t directly) and an
* indication of whether there was an invalid character (i.e.
* REPLACEMENT CHARACTER was used.)
*/
guint8 *
get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
{
wmem_strbuf_t *str;
str = wmem_strbuf_new_sized(scope, length+1);
/* See the Unicode Standard conformance chapter at
* https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf especially
* Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
* U+FFFD Substitution of Maximal Subparts. */
while (length > 0) {
const guint8 *prev = ptr;
/* utf_8_validate() returns how many bytes at prev are well-formed and
* moves ptr past the region it examined. */
size_t valid_bytes = utf_8_validate(prev, length, &ptr);
if (valid_bytes) {
/* Copy the well-formed prefix through unchanged. */
wmem_strbuf_append_len(str, prev, valid_bytes);
}
/* Account for everything examined, valid or not. */
length -= (gint)(ptr - prev);
prev += valid_bytes;
/* Bytes examined beyond the valid prefix form one ill-formed
* sequence; a single replacement is appended for all of them. */
if (ptr - prev) {
wmem_strbuf_append_unichar_repl(str);
}
}
return (guint8 *) wmem_strbuf_finalize(str);
/* NOTE(review): as displayed, the return below is unreachable because the
* function has already returned on the line above. This listing looks like
* a diff rendered without +/- markers, so the loop above is presumably the
* removed body and the call below its replacement -- confirm against the
* actual file. */
return ws_utf8_make_valid(scope, ptr, length);
}
/*

View File

@ -31,6 +31,193 @@ int ws_utf8_seqlen[256] = {
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xf0...0xff */
};
/*
 * Scan at most `length` bytes beginning at `start` and measure the leading
 * run of well-formed UTF-8.
 *
 * Scanning stops at the first ill-formed sequence or at the end of the
 * input. On return, *end points immediately past the region that was
 * examined: past the lead byte when the lead byte itself is unacceptable,
 * or at the first byte that failed to continue a multi-byte character
 * (that byte is not consumed). The return value is the number of bytes,
 * counted from `start`, that make up complete, valid characters.
 *
 * Differs from GLib's g_utf8_validate_len() in that NUL bytes are
 * considered valid UTF-8, and in that maximal subparts are treated as a
 * unit: a 2 or 3 byte truncated prefix of a 3 or 4 byte character that is
 * not continued by the next byte is reported as one ill-formed region, so
 * the caller can substitute a single REPLACEMENT CHARACTER for it.
 */
static inline size_t
utf_8_validate(const guint8 *start, ssize_t length, const guint8 **end)
{
    const guint8 *cur = start;
    size_t good = 0;

    while (length > 0) {
        const guint8 lead = *cur;
        size_t seq_len;
        size_t i;
        /* Accepted range for the next trail byte. Only the first trail
         * byte of a sequence can be narrower than 0x80..0xbf. */
        guint8 lo = 0x80;
        guint8 hi = 0xbf;

        if (lead < 0x80) {
            /* 0xxxxxxx: single byte character (NUL included). */
            good++;
            cur++;
            length--;
            continue;
        }

        /* The lead byte is always consumed, acceptable or not. */
        cur++;
        length--;

        if (lead < 0xc2 || lead > 0xf4) {
            /* Not a usable lead byte: report it alone as ill-formed. */
            *end = cur;
            return good;
        }

        if (lead < 0xe0) {
            /* 110xxxxx, 2 byte char */
            seq_len = 2;
        } else if (lead < 0xf0) {
            /* 1110xxxx, 3 byte char */
            seq_len = 3;
            if (lead == 0xe0) {
                lo = 0xa0;
            } else if (lead == 0xed) {
                hi = 0x9f;
            }
        } else {
            /* 11110xxx, 4 byte char - > 0xf4 excluded above */
            seq_len = 4;
            if (lead == 0xf0) {
                lo = 0x90;
            } else if (lead == 0xf4) {
                hi = 0x8f;
            }
        }

        for (i = 1; i < seq_len; i++) {
            /* Out of input, or the next byte does not continue the
             * character: stop without consuming that byte. */
            if (length < 1 || *cur < lo || *cur > hi) {
                *end = cur;
                return good;
            }
            cur++;
            length--;
            lo = 0x80;
            hi = 0xbf;
        }

        good += seq_len;
    }

    *end = cur;
    return good;
}
/*
 * Given a wmem scope, a pointer, and a length, treat the string of bytes
 * referred to by the pointer and length as a UTF-8 string, and return a
 * pointer to a UTF-8 string, allocated using the wmem scope, with all
 * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
 * according to the recommended "best practices" given in the Unicode
 * Standard and specified by W3C/WHATWG.
 *
 * A length of zero or less yields an empty string.
 *
 * Note that in conformance with the Unicode Standard, this treats three
 * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
 * and two byte overlong encodings of 7-bit ASCII characters as invalid and
 * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
 * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
 * be added later.
 *
 * Compared with g_utf8_make_valid(), this function does not consider
 * internal NUL bytes as invalid and replace them with replacement characters.
 * It also replaces maximal subparts as a unit; i.e., a sequence of 2 or 3
 * bytes which are a truncated version of a valid 3 or 4 byte character (but
 * the next byte does not continue the character) are replaced with a single
 * REPLACEMENT CHARACTER, whereas the Glib function replaces each byte of the
 * sequence with its own (3 octet) REPLACEMENT CHARACTER.
 *
 * XXX: length should probably be a size_t instead of a signed type.
 * XXX: the buffer returned can be of different length than the input,
 * and can have internal NULs as well (so that strlen doesn't give its
 * length). As with the other encoding functions, we should return the
 * length of the output buffer (or a wmem_strbuf_t directly) and an
 * indication of whether there was an invalid character (i.e.
 * REPLACEMENT CHARACTER was used.)
 */
guint8 *
ws_utf8_make_valid(wmem_allocator_t *scope, const guint8 *ptr, ssize_t length)
{
    wmem_strbuf_t *str;

    /* The loop below already treats a negative length as "nothing to do".
     * Clamp it before computing the size hint so that length+1 can never
     * turn into an enormous unsigned allocation request. */
    if (length < 0) {
        length = 0;
    }
    str = wmem_strbuf_new_sized(scope, (size_t)length + 1);

    /* See the Unicode Standard conformance chapter at
     * https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf especially
     * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
     * U+FFFD Substitution of Maximal Subparts. */
    while (length > 0) {
        const guint8 *prev = ptr;
        /* Number of well-formed bytes at prev; ptr is moved past the
         * region that was examined. */
        size_t valid_bytes = utf_8_validate(prev, length, &ptr);

        if (valid_bytes) {
            /* Copy the well-formed prefix through unchanged. */
            wmem_strbuf_append_len(str, prev, valid_bytes);
        }
        /* Account for everything examined, valid or not. */
        length -= ptr - prev;
        prev += valid_bytes;
        /* Bytes examined beyond the valid prefix form one ill-formed
         * sequence; a single replacement is appended for all of them. */
        if (ptr - prev) {
            wmem_strbuf_append_unichar_repl(str);
        }
    }

    return (guint8 *) wmem_strbuf_finalize(str);
}
#ifdef _WIN32
#include <strsafe.h>

View File

@ -59,6 +59,16 @@ int ws_utf8_seqlen[256];
*/
#define ws_utf8_char_len(ch) (ws_utf8_seqlen[(ch)])
/*
* Given a wmem scope, a pointer, and a length, treat the string of bytes
* referred to by the pointer and length as a UTF-8 string, and return a
* pointer to a UTF-8 string, allocated using the wmem scope, with all
* ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
* according to the recommended "best practices" given in the Unicode
* Standard and specified by W3C/WHATWG.
*
* scope:  wmem scope the returned string is allocated from
* ptr:    start of the bytes to be treated as UTF-8
* length: number of bytes at ptr to process
*
* Returns a pointer to the resulting UTF-8 string. The result can differ
* in length from the input and can contain internal NUL bytes, so strlen()
* does not necessarily give its length.
*/
WS_DLL_PUBLIC guint8 *
ws_utf8_make_valid(wmem_allocator_t *scope, const guint8 *ptr, ssize_t length);
#ifdef _WIN32