Added class to handle Unicode characters and their UTF-8 conversions.

Made String handling of UTF-8 and Unicode easier.


git-svn-id: http://voip.null.ro/svn/yate@5507 acf43c95-373e-0410-b603-e72c3f656dc1
This commit is contained in:
paulc 2013-05-20 15:00:26 +00:00
parent fef8255f07
commit 5c91367b63
2 changed files with 256 additions and 40 deletions

View File

@ -139,6 +139,122 @@ static inline char hexEncode(char nib)
}
void UChar::encode()
{
if (m_chr < 0x80) {
m_str[0] = (char)m_chr;
m_str[1] = '\0';
}
else if (m_chr < 0x800) {
m_str[0] = (char)(0xc0 | ((m_chr >> 6) & 0x1f));
m_str[1] = (char)(0x80 | (m_chr & 0x3f));
m_str[2] = '\0';
}
else if (m_chr < 0xffff) {
m_str[0] = (char)(0xe0 | ((m_chr >> 12) & 0x0f));
m_str[1] = (char)(0x80 | ((m_chr >> 6) & 0x3f));
m_str[2] = (char)(0x80 | (m_chr & 0x3f));
m_str[3] = '\0';
}
else if (m_chr < 0x1fffff) {
m_str[0] = (char)(0xf0 | ((m_chr >> 18) & 0x07));
m_str[1] = (char)(0x80 | ((m_chr >> 12) & 0x3f));
m_str[2] = (char)(0x80 | ((m_chr >> 6) & 0x3f));
m_str[3] = (char)(0x80 | (m_chr & 0x3f));
m_str[4] = '\0';
}
else if (m_chr < 0x3ffffff) {
m_str[0] = (char)(0xf8 | ((m_chr >> 24) & 0x03));
m_str[1] = (char)(0x80 | ((m_chr >> 18) & 0x3f));
m_str[2] = (char)(0x80 | ((m_chr >> 12) & 0x3f));
m_str[3] = (char)(0x80 | ((m_chr >> 6) & 0x3f));
m_str[4] = (char)(0x80 | (m_chr & 0x3f));
m_str[5] = '\0';
}
else if (m_chr < 0x7fffffff) {
m_str[0] = (char)(0xfc | ((m_chr >> 30) & 0x01));
m_str[1] = (char)(0x80 | ((m_chr >> 24) & 0x3f));
m_str[2] = (char)(0x80 | ((m_chr >> 18) & 0x3f));
m_str[3] = (char)(0x80 | ((m_chr >> 12) & 0x3f));
m_str[4] = (char)(0x80 | ((m_chr >> 6) & 0x3f));
m_str[5] = (char)(0x80 | (m_chr & 0x3f));
m_str[6] = '\0';
}
else
m_str[0] = '\0';
}
bool UChar::decode(const char*& str, unsigned int maxChar, bool overlong)
{
operator=('\0');
if (!str)
return false;
if (maxChar < 128)
maxChar = 0x10ffff; // RFC 3629 default limit
unsigned int more = 0;
u_int32_t min = 0;
u_int32_t val = 0;
unsigned char c = (unsigned char)*str++;
// from 1st byte we find out how many are supposed to follow
if (!c) // don't advance past NUL
--str;
else if (c < 0x80) // 1 byte, 0...0x7F, ASCII characters
val = c & 0x7f;
else if (c < 0xc0) // invalid as first UFT-8 byte
return false;
else if (c < 0xe0) {
// 2 bytes, 0x80...0x7FF
min = 0x80;
val = c & 0x1f;
more = 1;
}
else if (c < 0xf0) {
// 3 bytes, 0x800...0xFFFF, Basic Multilingual Plane
min = 0x800;
val = c & 0x0f;
more = 2;
}
else if (c < 0xf8) {
// 4 bytes, 0x10000...0x1FFFFF, RFC 3629 limit (10FFFF)
min = 0x10000;
val = c & 0x07;
more = 3;
}
else if (c < 0xfc) {
// 5 bytes, 0x200000...0x3FFFFFF
min = 0x200000;
val = c & 0x03;
more = 4;
}
else if (c < 0xfe) {
// 6 bytes, 0x4000000...0x7FFFFFFF
min = 0x4000000;
val = c & 0x01;
more = 5;
}
else
return false;
while (more--) {
c = (unsigned char)*str;
// all continuation bytes are in range [128..191]
if ((c & 0xc0) != 0x80)
return false;
val = (val << 6) | (c & 0x3f);
++str;
}
operator=(val);
// got full value, check for overlongs and out of range
if (val > maxChar)
return false;
if (val < min && !overlong)
return false;
return true;
}
StringMatchPrivate::StringMatchPrivate()
{
XDebug(DebugAll,"StringMatchPrivate::StringMatchPrivate() [%p]",this);
@ -649,6 +765,14 @@ String& String::operator>>(char& store)
return *this;
}
String& String::operator>>(UChar& store)
{
const char* str = m_string;
store.decode(str);
assign(str);
return *this;
}
String& String::operator>>(int& store)
{
if (m_string) {
@ -1166,17 +1290,17 @@ unsigned int String::hash(const char* value)
return h;
}
int String::lenUtf8(const char* value, unsigned int maxSeq, bool overlong)
int String::lenUtf8(const char* value, unsigned int maxChar, bool overlong)
{
if (!value)
return 0;
if (maxSeq < 1)
maxSeq = 4; // RFC 3629 default limit
if (maxChar < 128)
maxChar = 0x10ffff; // RFC 3629 default limit
int count = 0;
unsigned int more = 0;
int32_t min = 0;
int32_t val = 0;
u_int32_t min = 0;
u_int32_t val = 0;
while (unsigned char c = (unsigned char) *value++) {
if (more) {
@ -1185,9 +1309,11 @@ int String::lenUtf8(const char* value, unsigned int maxSeq, bool overlong)
return -1;
val = (val << 6) | (c & 0x3f);
if (!--more) {
// got full value, check for overlongs and out of range
if (val > maxChar)
return -1;
if (overlong)
continue;
// got full value, check for overlongs
if (val < min)
return -1;
}
@ -1195,35 +1321,35 @@ int String::lenUtf8(const char* value, unsigned int maxSeq, bool overlong)
}
count++;
// from 1st byte we find out how many are supposed to follow
if (c < 128) // 1 byte, 0...0x7F, ASCII characters, no check
if (c < 0x80) // 1 byte, 0...0x7F, ASCII characters, no check
;
else if (c < 192) // invalid as first UFT-8 byte
else if (c < 0xc0) // invalid as first UFT-8 byte
return -1;
else if (c < 224) {
else if (c < 0xe0) {
// 2 bytes, 0x80...0x7FF
min = 0x80;
val = c & 0x1f;
more = 1;
}
else if (c < 240) {
else if (c < 0xf0) {
// 3 bytes, 0x800...0xFFFF, Basic Multilingual Plane
min = 0x800;
val = c & 0x0f;
more = 2;
}
else if (c < 248) {
else if (c < 0xf8) {
// 4 bytes, 0x10000...0x1FFFFF, RFC 3629 limit (10FFFF)
min = 0x10000;
val = c & 0x07;
more = 3;
}
else if (c < 252) {
else if (c < 0xfc) {
// 5 bytes, 0x200000...0x3FFFFFF
min = 0x200000;
val = c & 0x03;
more = 4;
}
else if (c < 254) {
else if (c < 0xfe) {
// 6 bytes, 0x4000000...0x7FFFFFFF
min = 0x4000000;
val = c & 0x01;
@ -1231,28 +1357,25 @@ int String::lenUtf8(const char* value, unsigned int maxSeq, bool overlong)
}
else
return -1;
// check if we accept a character with such sequence length
if (more >= maxSeq)
return -1;
}
if (more)
return -1;
return count;
}
int String::fixUtf8(const char* replace, unsigned int maxSeq, bool overlong)
int String::fixUtf8(const char* replace, unsigned int maxChar, bool overlong)
{
if (null())
return 0;
if (maxSeq < 1)
maxSeq = 4; // RFC 3629 default limit
if (maxChar < 128)
maxChar = 0x10ffff; // RFC 3629 default limit
if (!replace)
replace = "\xEF\xBF\xBD";
int count = 0;
unsigned int more = 0;
int32_t min = 0;
int32_t val = 0;
u_int32_t min = 0;
u_int32_t val = 0;
unsigned int pos = 0;
bool bad = false;
String tmp;
@ -1260,9 +1383,6 @@ int String::fixUtf8(const char* replace, unsigned int maxSeq, bool overlong)
for (unsigned int i = 0; i < m_length; i++) {
unsigned char c = (unsigned char) at(i);
if (more) {
// remember to reject a character with a too long sequence
if (more >= maxSeq)
bad = true;
// all continuation bytes are in range [128..191]
if ((c & 0xc0) != 0x80) {
// truncated sequence, must search for 1st byte again
@ -1273,8 +1393,8 @@ int String::fixUtf8(const char* replace, unsigned int maxSeq, bool overlong)
else {
val = (val << 6) | (c & 0x3f);
if (!--more) {
// got full value, check for overlongs
if ((val < min) && !overlong)
// got full value, check for overlongs and out of range
if ((val > maxChar) || ((val < min) && !overlong))
bad = true;
// finished multibyte, add it to temporary
if (bad) {
@ -1290,35 +1410,35 @@ int String::fixUtf8(const char* replace, unsigned int maxSeq, bool overlong)
pos = i;
bad = false;
// from 1st byte we find out how many are supposed to follow
if (c < 128) // 1 byte, 0...0x7F, ASCII characters, good
if (c < 0x80) // 1 byte, 0...0x7F, ASCII characters, good
;
else if (c < 192) // invalid as first UFT-8 byte
else if (c < 0xc0) // invalid as first UFT-8 byte
bad = true;
else if (c < 224) {
else if (c < 0xe0) {
// 2 bytes, 0x80...0x7FF
min = 0x80;
val = c & 0x1f;
more = 1;
}
else if (c < 240) {
else if (c < 0xf0) {
// 3 bytes, 0x800...0xFFFF, Basic Multilingual Plane
min = 0x800;
val = c & 0x0f;
more = 2;
}
else if (c < 248) {
else if (c < 0xf8) {
// 4 bytes, 0x10000...0x1FFFFF, RFC 3629 limit (10FFFF)
min = 0x10000;
val = c & 0x07;
more = 3;
}
else if (c < 252) {
else if (c < 0xfc) {
// 5 bytes, 0x200000...0x3FFFFFF
min = 0x200000;
val = c & 0x03;
more = 4;
}
else if (c < 254) {
else if (c < 0xfe) {
// 6 bytes, 0x4000000...0x7FFFFFFF
min = 0x4000000;
val = c & 0x01;

View File

@ -1558,6 +1558,97 @@ private:
class Regexp;
class StringMatchPrivate;
/**
* A simple class to hold a single Unicode character and convert it to / from UTF-8
* @short A single Unicode character
*/
class YATE_API UChar
{
public:
/**
* Constructor from unsigned numeric code
* @param code Code of the Unicode character
*/
inline explicit UChar(unsigned int code = 0)
: m_chr(code)
{ encode(); }
/**
* Constructor from signed numeric code
* @param code Code of the Unicode character
*/
inline explicit UChar(signed int code)
: m_chr((code < 0) ? 0 : code)
{ encode(); }
/**
* Constructor from signed character
* @param code Character to construct from
*/
inline explicit UChar(signed char code)
: m_chr((unsigned char)code)
{ encode(); }
/**
* Constructor from unsigned character
* @param code Character to construct from
*/
inline explicit UChar(unsigned char code)
: m_chr(code)
{ encode(); }
/**
* Assignment operator from a character code
* @param code Character code to assign
* @return Reference to this object
*/
inline UChar& operator=(unsigned int code)
{ m_chr = code; encode(); return *this; }
/**
* Assignment operator from a character
* @param code Character to assign
* @return Reference to this object
*/
inline UChar& operator=(char code)
{ m_chr = (unsigned char)code; encode(); return *this; }
/**
* Get the Unicode value of the character
* @return Code of the character as defined by Unicode
*/
inline unsigned int code() const
{ return m_chr; }
/**
* Get the value of the character as UTF-8 string.
* @return The character as UTF-8 C string
*/
inline const char* c_str() const
{ return m_str; }
/**
* Conversion to "const char *" operator.
* @return Pointer to the internally stored UTF-8 string
*/
inline operator const char*() const
{ return m_str; };
/**
* Decode the first Unicode character from an UTF-8 C string
* @param str String to extract from, will be advanced past the character
* @param maxChar Maximum accepted Unicode character code
* @param overlong Accept overlong UTF-8 sequences (dangerous!)
* @return True if an Unicode character was decoded from string
*/
bool decode(const char*& str, unsigned int maxChar = 0x10ffff, bool overlong = false);
private:
void encode();
u_int32_t m_chr;
char m_str[8];
};
/**
* A simple string handling class for C style (one byte) strings.
* For simplicity and read speed no copy-on-write is performed.
@ -1682,30 +1773,30 @@ public:
/**
* Get the number of characters in a string assuming UTF-8 encoding
* @param value C string to compute Unicode length
* @param maxSeq Maximum accepted UTF-8 sequence length
* @param maxChar Maximum accepted Unicode character code
* @param overlong Accept overlong UTF-8 sequences (dangerous!)
* @return Count of Unicode characters, -1 if not valid UTF-8
*/
static int lenUtf8(const char* value, unsigned int maxSeq = 4, bool overlong = false);
static int lenUtf8(const char* value, unsigned int maxChar = 0x10ffff, bool overlong = false);
/**
* Get the number of characters in the string assuming UTF-8 encoding
* @param maxSeq Maximum accepted UTF-8 sequence length
* @param maxChar Maximum accepted Unicode character code
* @param overlong Accept overlong UTF-8 sequences (dangerous!)
* @return Count of Unicode characters, -1 if not valid UTF-8
*/
inline int lenUtf8(unsigned int maxSeq = 4, bool overlong = false) const
{ return lenUtf8(m_string,maxSeq,overlong); }
inline int lenUtf8(unsigned int maxChar = 0x10ffff, bool overlong = false) const
{ return lenUtf8(m_string,maxChar,overlong); }
/**
* Fix an UTF-8 encoded string by replacing invalid sequences
* @param replace String to replace invalid sequences, use U+FFFD if null
* @param maxSeq Maximum accepted UTF-8 sequence length
* @param maxChar Maximum accepted Unicode character code
* @param overlong Accept overlong UTF-8 sequences (dangerous!)
* @return Count of invalid UTF-8 sequences that were replaced
*/
int fixUtf8(const char* replace = 0, unsigned int maxSeq = 4, bool overlong = false);
int fixUtf8(const char* replace = 0, unsigned int maxChar = 0x10ffff, bool overlong = false);
/**
* Check if a string starts with UTF-8 Byte Order Mark
@ -2057,6 +2148,11 @@ public:
*/
String& operator>>(char& store);
/**
* Stream style extraction operator for single Unicode characters
*/
String& operator>>(UChar& store);
/**
* Stream style extraction operator for integers
*/