Added class to handle Unicode characters and their UTF-8 conversions.
Made String handling of UTF-8 and Unicode easier. git-svn-id: http://voip.null.ro/svn/yate@5507 acf43c95-373e-0410-b603-e72c3f656dc1
This commit is contained in:
parent
fef8255f07
commit
5c91367b63
|
@ -139,6 +139,122 @@ static inline char hexEncode(char nib)
|
|||
}
|
||||
|
||||
|
||||
void UChar::encode()
|
||||
{
|
||||
if (m_chr < 0x80) {
|
||||
m_str[0] = (char)m_chr;
|
||||
m_str[1] = '\0';
|
||||
}
|
||||
else if (m_chr < 0x800) {
|
||||
m_str[0] = (char)(0xc0 | ((m_chr >> 6) & 0x1f));
|
||||
m_str[1] = (char)(0x80 | (m_chr & 0x3f));
|
||||
m_str[2] = '\0';
|
||||
}
|
||||
else if (m_chr < 0xffff) {
|
||||
m_str[0] = (char)(0xe0 | ((m_chr >> 12) & 0x0f));
|
||||
m_str[1] = (char)(0x80 | ((m_chr >> 6) & 0x3f));
|
||||
m_str[2] = (char)(0x80 | (m_chr & 0x3f));
|
||||
m_str[3] = '\0';
|
||||
}
|
||||
else if (m_chr < 0x1fffff) {
|
||||
m_str[0] = (char)(0xf0 | ((m_chr >> 18) & 0x07));
|
||||
m_str[1] = (char)(0x80 | ((m_chr >> 12) & 0x3f));
|
||||
m_str[2] = (char)(0x80 | ((m_chr >> 6) & 0x3f));
|
||||
m_str[3] = (char)(0x80 | (m_chr & 0x3f));
|
||||
m_str[4] = '\0';
|
||||
}
|
||||
else if (m_chr < 0x3ffffff) {
|
||||
m_str[0] = (char)(0xf8 | ((m_chr >> 24) & 0x03));
|
||||
m_str[1] = (char)(0x80 | ((m_chr >> 18) & 0x3f));
|
||||
m_str[2] = (char)(0x80 | ((m_chr >> 12) & 0x3f));
|
||||
m_str[3] = (char)(0x80 | ((m_chr >> 6) & 0x3f));
|
||||
m_str[4] = (char)(0x80 | (m_chr & 0x3f));
|
||||
m_str[5] = '\0';
|
||||
}
|
||||
else if (m_chr < 0x7fffffff) {
|
||||
m_str[0] = (char)(0xfc | ((m_chr >> 30) & 0x01));
|
||||
m_str[1] = (char)(0x80 | ((m_chr >> 24) & 0x3f));
|
||||
m_str[2] = (char)(0x80 | ((m_chr >> 18) & 0x3f));
|
||||
m_str[3] = (char)(0x80 | ((m_chr >> 12) & 0x3f));
|
||||
m_str[4] = (char)(0x80 | ((m_chr >> 6) & 0x3f));
|
||||
m_str[5] = (char)(0x80 | (m_chr & 0x3f));
|
||||
m_str[6] = '\0';
|
||||
}
|
||||
else
|
||||
m_str[0] = '\0';
|
||||
}
|
||||
|
||||
bool UChar::decode(const char*& str, unsigned int maxChar, bool overlong)
|
||||
{
|
||||
operator=('\0');
|
||||
if (!str)
|
||||
return false;
|
||||
if (maxChar < 128)
|
||||
maxChar = 0x10ffff; // RFC 3629 default limit
|
||||
|
||||
unsigned int more = 0;
|
||||
u_int32_t min = 0;
|
||||
u_int32_t val = 0;
|
||||
|
||||
unsigned char c = (unsigned char)*str++;
|
||||
// from 1st byte we find out how many are supposed to follow
|
||||
if (!c) // don't advance past NUL
|
||||
--str;
|
||||
else if (c < 0x80) // 1 byte, 0...0x7F, ASCII characters
|
||||
val = c & 0x7f;
|
||||
else if (c < 0xc0) // invalid as first UFT-8 byte
|
||||
return false;
|
||||
else if (c < 0xe0) {
|
||||
// 2 bytes, 0x80...0x7FF
|
||||
min = 0x80;
|
||||
val = c & 0x1f;
|
||||
more = 1;
|
||||
}
|
||||
else if (c < 0xf0) {
|
||||
// 3 bytes, 0x800...0xFFFF, Basic Multilingual Plane
|
||||
min = 0x800;
|
||||
val = c & 0x0f;
|
||||
more = 2;
|
||||
}
|
||||
else if (c < 0xf8) {
|
||||
// 4 bytes, 0x10000...0x1FFFFF, RFC 3629 limit (10FFFF)
|
||||
min = 0x10000;
|
||||
val = c & 0x07;
|
||||
more = 3;
|
||||
}
|
||||
else if (c < 0xfc) {
|
||||
// 5 bytes, 0x200000...0x3FFFFFF
|
||||
min = 0x200000;
|
||||
val = c & 0x03;
|
||||
more = 4;
|
||||
}
|
||||
else if (c < 0xfe) {
|
||||
// 6 bytes, 0x4000000...0x7FFFFFFF
|
||||
min = 0x4000000;
|
||||
val = c & 0x01;
|
||||
more = 5;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
|
||||
while (more--) {
|
||||
c = (unsigned char)*str;
|
||||
// all continuation bytes are in range [128..191]
|
||||
if ((c & 0xc0) != 0x80)
|
||||
return false;
|
||||
val = (val << 6) | (c & 0x3f);
|
||||
++str;
|
||||
}
|
||||
operator=(val);
|
||||
// got full value, check for overlongs and out of range
|
||||
if (val > maxChar)
|
||||
return false;
|
||||
if (val < min && !overlong)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
StringMatchPrivate::StringMatchPrivate()
|
||||
{
|
||||
XDebug(DebugAll,"StringMatchPrivate::StringMatchPrivate() [%p]",this);
|
||||
|
@ -649,6 +765,14 @@ String& String::operator>>(char& store)
|
|||
return *this;
|
||||
}
|
||||
|
||||
// Extract one UTF-8 encoded Unicode character from the start of the string.
// Whatever bytes the decoder consumed are removed from this string, the
//  result of the decoding itself is not checked here.
String& String::operator>>(UChar& store)
{
    // decode() moves the cursor past the bytes it accepted
    const char* remaining = m_string;
    store.decode(remaining);
    // keep only what the decoder left untouched
    assign(remaining);
    return *this;
}
|
||||
|
||||
String& String::operator>>(int& store)
|
||||
{
|
||||
if (m_string) {
|
||||
|
@ -1166,17 +1290,17 @@ unsigned int String::hash(const char* value)
|
|||
return h;
|
||||
}
|
||||
|
||||
int String::lenUtf8(const char* value, unsigned int maxSeq, bool overlong)
|
||||
int String::lenUtf8(const char* value, unsigned int maxChar, bool overlong)
|
||||
{
|
||||
if (!value)
|
||||
return 0;
|
||||
if (maxSeq < 1)
|
||||
maxSeq = 4; // RFC 3629 default limit
|
||||
if (maxChar < 128)
|
||||
maxChar = 0x10ffff; // RFC 3629 default limit
|
||||
|
||||
int count = 0;
|
||||
unsigned int more = 0;
|
||||
int32_t min = 0;
|
||||
int32_t val = 0;
|
||||
u_int32_t min = 0;
|
||||
u_int32_t val = 0;
|
||||
|
||||
while (unsigned char c = (unsigned char) *value++) {
|
||||
if (more) {
|
||||
|
@ -1185,9 +1309,11 @@ int String::lenUtf8(const char* value, unsigned int maxSeq, bool overlong)
|
|||
return -1;
|
||||
val = (val << 6) | (c & 0x3f);
|
||||
if (!--more) {
|
||||
// got full value, check for overlongs and out of range
|
||||
if (val > maxChar)
|
||||
return -1;
|
||||
if (overlong)
|
||||
continue;
|
||||
// got full value, check for overlongs
|
||||
if (val < min)
|
||||
return -1;
|
||||
}
|
||||
|
@ -1195,35 +1321,35 @@ int String::lenUtf8(const char* value, unsigned int maxSeq, bool overlong)
|
|||
}
|
||||
count++;
|
||||
// from 1st byte we find out how many are supposed to follow
|
||||
if (c < 128) // 1 byte, 0...0x7F, ASCII characters, no check
|
||||
if (c < 0x80) // 1 byte, 0...0x7F, ASCII characters, no check
|
||||
;
|
||||
else if (c < 192) // invalid as first UFT-8 byte
|
||||
else if (c < 0xc0) // invalid as first UFT-8 byte
|
||||
return -1;
|
||||
else if (c < 224) {
|
||||
else if (c < 0xe0) {
|
||||
// 2 bytes, 0x80...0x7FF
|
||||
min = 0x80;
|
||||
val = c & 0x1f;
|
||||
more = 1;
|
||||
}
|
||||
else if (c < 240) {
|
||||
else if (c < 0xf0) {
|
||||
// 3 bytes, 0x800...0xFFFF, Basic Multilingual Plane
|
||||
min = 0x800;
|
||||
val = c & 0x0f;
|
||||
more = 2;
|
||||
}
|
||||
else if (c < 248) {
|
||||
else if (c < 0xf8) {
|
||||
// 4 bytes, 0x10000...0x1FFFFF, RFC 3629 limit (10FFFF)
|
||||
min = 0x10000;
|
||||
val = c & 0x07;
|
||||
more = 3;
|
||||
}
|
||||
else if (c < 252) {
|
||||
else if (c < 0xfc) {
|
||||
// 5 bytes, 0x200000...0x3FFFFFF
|
||||
min = 0x200000;
|
||||
val = c & 0x03;
|
||||
more = 4;
|
||||
}
|
||||
else if (c < 254) {
|
||||
else if (c < 0xfe) {
|
||||
// 6 bytes, 0x4000000...0x7FFFFFFF
|
||||
min = 0x4000000;
|
||||
val = c & 0x01;
|
||||
|
@ -1231,28 +1357,25 @@ int String::lenUtf8(const char* value, unsigned int maxSeq, bool overlong)
|
|||
}
|
||||
else
|
||||
return -1;
|
||||
// check if we accept a character with such sequence length
|
||||
if (more >= maxSeq)
|
||||
return -1;
|
||||
}
|
||||
if (more)
|
||||
return -1;
|
||||
return count;
|
||||
}
|
||||
|
||||
int String::fixUtf8(const char* replace, unsigned int maxSeq, bool overlong)
|
||||
int String::fixUtf8(const char* replace, unsigned int maxChar, bool overlong)
|
||||
{
|
||||
if (null())
|
||||
return 0;
|
||||
if (maxSeq < 1)
|
||||
maxSeq = 4; // RFC 3629 default limit
|
||||
if (maxChar < 128)
|
||||
maxChar = 0x10ffff; // RFC 3629 default limit
|
||||
if (!replace)
|
||||
replace = "\xEF\xBF\xBD";
|
||||
|
||||
int count = 0;
|
||||
unsigned int more = 0;
|
||||
int32_t min = 0;
|
||||
int32_t val = 0;
|
||||
u_int32_t min = 0;
|
||||
u_int32_t val = 0;
|
||||
unsigned int pos = 0;
|
||||
bool bad = false;
|
||||
String tmp;
|
||||
|
@ -1260,9 +1383,6 @@ int String::fixUtf8(const char* replace, unsigned int maxSeq, bool overlong)
|
|||
for (unsigned int i = 0; i < m_length; i++) {
|
||||
unsigned char c = (unsigned char) at(i);
|
||||
if (more) {
|
||||
// remember to reject a character with a too long sequence
|
||||
if (more >= maxSeq)
|
||||
bad = true;
|
||||
// all continuation bytes are in range [128..191]
|
||||
if ((c & 0xc0) != 0x80) {
|
||||
// truncated sequence, must search for 1st byte again
|
||||
|
@ -1273,8 +1393,8 @@ int String::fixUtf8(const char* replace, unsigned int maxSeq, bool overlong)
|
|||
else {
|
||||
val = (val << 6) | (c & 0x3f);
|
||||
if (!--more) {
|
||||
// got full value, check for overlongs
|
||||
if ((val < min) && !overlong)
|
||||
// got full value, check for overlongs and out of range
|
||||
if ((val > maxChar) || ((val < min) && !overlong))
|
||||
bad = true;
|
||||
// finished multibyte, add it to temporary
|
||||
if (bad) {
|
||||
|
@ -1290,35 +1410,35 @@ int String::fixUtf8(const char* replace, unsigned int maxSeq, bool overlong)
|
|||
pos = i;
|
||||
bad = false;
|
||||
// from 1st byte we find out how many are supposed to follow
|
||||
if (c < 128) // 1 byte, 0...0x7F, ASCII characters, good
|
||||
if (c < 0x80) // 1 byte, 0...0x7F, ASCII characters, good
|
||||
;
|
||||
else if (c < 192) // invalid as first UFT-8 byte
|
||||
else if (c < 0xc0) // invalid as first UFT-8 byte
|
||||
bad = true;
|
||||
else if (c < 224) {
|
||||
else if (c < 0xe0) {
|
||||
// 2 bytes, 0x80...0x7FF
|
||||
min = 0x80;
|
||||
val = c & 0x1f;
|
||||
more = 1;
|
||||
}
|
||||
else if (c < 240) {
|
||||
else if (c < 0xf0) {
|
||||
// 3 bytes, 0x800...0xFFFF, Basic Multilingual Plane
|
||||
min = 0x800;
|
||||
val = c & 0x0f;
|
||||
more = 2;
|
||||
}
|
||||
else if (c < 248) {
|
||||
else if (c < 0xf8) {
|
||||
// 4 bytes, 0x10000...0x1FFFFF, RFC 3629 limit (10FFFF)
|
||||
min = 0x10000;
|
||||
val = c & 0x07;
|
||||
more = 3;
|
||||
}
|
||||
else if (c < 252) {
|
||||
else if (c < 0xfc) {
|
||||
// 5 bytes, 0x200000...0x3FFFFFF
|
||||
min = 0x200000;
|
||||
val = c & 0x03;
|
||||
more = 4;
|
||||
}
|
||||
else if (c < 254) {
|
||||
else if (c < 0xfe) {
|
||||
// 6 bytes, 0x4000000...0x7FFFFFFF
|
||||
min = 0x4000000;
|
||||
val = c & 0x01;
|
||||
|
|
110
yateclass.h
110
yateclass.h
|
@ -1558,6 +1558,97 @@ private:
|
|||
class Regexp;
|
||||
class StringMatchPrivate;
|
||||
|
||||
/**
|
||||
* A simple class to hold a single Unicode character and convert it to / from UTF-8
|
||||
* @short A single Unicode character
|
||||
*/
|
||||
class YATE_API UChar
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Constructor from unsigned numeric code
|
||||
* @param code Code of the Unicode character
|
||||
*/
|
||||
inline explicit UChar(unsigned int code = 0)
|
||||
: m_chr(code)
|
||||
{ encode(); }
|
||||
|
||||
/**
|
||||
* Constructor from signed numeric code
|
||||
* @param code Code of the Unicode character
|
||||
*/
|
||||
inline explicit UChar(signed int code)
|
||||
: m_chr((code < 0) ? 0 : code)
|
||||
{ encode(); }
|
||||
|
||||
/**
|
||||
* Constructor from signed character
|
||||
* @param code Character to construct from
|
||||
*/
|
||||
inline explicit UChar(signed char code)
|
||||
: m_chr((unsigned char)code)
|
||||
{ encode(); }
|
||||
|
||||
/**
|
||||
* Constructor from unsigned character
|
||||
* @param code Character to construct from
|
||||
*/
|
||||
inline explicit UChar(unsigned char code)
|
||||
: m_chr(code)
|
||||
{ encode(); }
|
||||
|
||||
/**
|
||||
* Assignment operator from a character code
|
||||
* @param code Character code to assign
|
||||
* @return Reference to this object
|
||||
*/
|
||||
inline UChar& operator=(unsigned int code)
|
||||
{ m_chr = code; encode(); return *this; }
|
||||
|
||||
/**
|
||||
* Assignment operator from a character
|
||||
* @param code Character to assign
|
||||
* @return Reference to this object
|
||||
*/
|
||||
inline UChar& operator=(char code)
|
||||
{ m_chr = (unsigned char)code; encode(); return *this; }
|
||||
|
||||
/**
|
||||
* Get the Unicode value of the character
|
||||
* @return Code of the character as defined by Unicode
|
||||
*/
|
||||
inline unsigned int code() const
|
||||
{ return m_chr; }
|
||||
|
||||
/**
|
||||
* Get the value of the character as UTF-8 string.
|
||||
* @return The character as UTF-8 C string
|
||||
*/
|
||||
inline const char* c_str() const
|
||||
{ return m_str; }
|
||||
|
||||
/**
|
||||
* Conversion to "const char *" operator.
|
||||
* @return Pointer to the internally stored UTF-8 string
|
||||
*/
|
||||
inline operator const char*() const
|
||||
{ return m_str; };
|
||||
|
||||
/**
|
||||
* Decode the first Unicode character from an UTF-8 C string
|
||||
* @param str String to extract from, will be advanced past the character
|
||||
* @param maxChar Maximum accepted Unicode character code
|
||||
* @param overlong Accept overlong UTF-8 sequences (dangerous!)
|
||||
* @return True if an Unicode character was decoded from string
|
||||
*/
|
||||
bool decode(const char*& str, unsigned int maxChar = 0x10ffff, bool overlong = false);
|
||||
|
||||
private:
|
||||
void encode();
|
||||
u_int32_t m_chr;
|
||||
char m_str[8];
|
||||
};
|
||||
|
||||
/**
|
||||
* A simple string handling class for C style (one byte) strings.
|
||||
* For simplicity and read speed no copy-on-write is performed.
|
||||
|
@ -1682,30 +1773,30 @@ public:
|
|||
/**
|
||||
* Get the number of characters in a string assuming UTF-8 encoding
|
||||
* @param value C string to compute Unicode length
|
||||
* @param maxSeq Maximum accepted UTF-8 sequence length
|
||||
* @param maxChar Maximum accepted Unicode character code
|
||||
* @param overlong Accept overlong UTF-8 sequences (dangerous!)
|
||||
* @return Count of Unicode characters, -1 if not valid UTF-8
|
||||
*/
|
||||
static int lenUtf8(const char* value, unsigned int maxSeq = 4, bool overlong = false);
|
||||
static int lenUtf8(const char* value, unsigned int maxChar = 0x10ffff, bool overlong = false);
|
||||
|
||||
/**
|
||||
* Get the number of characters in the string assuming UTF-8 encoding
|
||||
* @param maxSeq Maximum accepted UTF-8 sequence length
|
||||
* @param maxChar Maximum accepted Unicode character code
|
||||
* @param overlong Accept overlong UTF-8 sequences (dangerous!)
|
||||
* @return Count of Unicode characters, -1 if not valid UTF-8
|
||||
*/
|
||||
inline int lenUtf8(unsigned int maxSeq = 4, bool overlong = false) const
|
||||
{ return lenUtf8(m_string,maxSeq,overlong); }
|
||||
inline int lenUtf8(unsigned int maxChar = 0x10ffff, bool overlong = false) const
|
||||
{ return lenUtf8(m_string,maxChar,overlong); }
|
||||
|
||||
|
||||
/**
|
||||
* Fix an UTF-8 encoded string by replacing invalid sequences
|
||||
* @param replace String to replace invalid sequences, use U+FFFD if null
|
||||
* @param maxSeq Maximum accepted UTF-8 sequence length
|
||||
* @param maxChar Maximum accepted Unicode character code
|
||||
* @param overlong Accept overlong UTF-8 sequences (dangerous!)
|
||||
* @return Count of invalid UTF-8 sequences that were replaced
|
||||
*/
|
||||
int fixUtf8(const char* replace = 0, unsigned int maxSeq = 4, bool overlong = false);
|
||||
int fixUtf8(const char* replace = 0, unsigned int maxChar = 0x10ffff, bool overlong = false);
|
||||
|
||||
/**
|
||||
* Check if a string starts with UTF-8 Byte Order Mark
|
||||
|
@ -2057,6 +2148,11 @@ public:
|
|||
*/
|
||||
String& operator>>(char& store);
|
||||
|
||||
/**
|
||||
* Stream style extraction operator for single Unicode characters
|
||||
*/
|
||||
String& operator>>(UChar& store);
|
||||
|
||||
/**
|
||||
* Stream style extraction operator for integers
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue