Add UTF-16 encoding support to UChar class.

git-svn-id: http://voip.null.ro/svn/yate@6313 acf43c95-373e-0410-b603-e72c3f656dc1
This commit is contained in:
oana 2018-04-25 12:16:54 +00:00
parent 751fcc46cd
commit c1e5a20c9e
2 changed files with 223 additions and 0 deletions

View File

@ -25,6 +25,15 @@
#include <stdio.h>
#include <regex.h>
// Map the build-time byte order onto UChar::Endianness values:
//  ENDIANNESS_NATIVE is the byte order of the host the code is built for,
//  ENDIANNESS_OPPOSITE is the other one (16-bit values need their bytes swapped)
#if (defined(WORDS_BIGENDIAN) || defined(BIGENDIAN))
#define ENDIANNESS_NATIVE (UChar::BE)
#define ENDIANNESS_OPPOSITE (UChar::LE)
#else
#define ENDIANNESS_NATIVE (UChar::LE)
#define ENDIANNESS_OPPOSITE (UChar::BE)
#endif
namespace TelEngine {
// String to regular integer conversion, takes into account overflows
@ -251,6 +260,145 @@ bool UChar::decode(const char*& str, uint32_t maxChar, bool overlong)
return true;
}
// Convert a 16-bit value between host byte order and the requested one.
// The bytes are exchanged only when the requested order is the opposite of
//  the host order; UChar::Native and the host's own order return the value as is
static inline uint16_t swap_u16(uint16_t val, UChar::Endianness order)
{
    if (order >= UChar::Native || order != ENDIANNESS_OPPOSITE)
        return val;
    return (uint16_t)((val >> 8) | (val << 8));
}
// Decode one Unicode character from a buffer of UTF-16 code units.
// The character is reset first; one code unit is always consumed from a valid
//  buffer and a second one only when a high surrogate is followed by a low one.
// An unpaired surrogate is stored as is. Returns false if the buffer is NULL
//  or empty or if the resulting code is above maxChar (the units stay consumed)
bool UChar::decode(uint16_t*& buff, unsigned int& len, Endianness order, uint32_t maxChar)
{
    operator=('\0');
    if (!buff || !len)
        return false;
    // Limits below the ASCII range fall back to the RFC 3629 default limit
    const uint32_t limit = (maxChar < 128) ? 0x10ffff : maxChar;
    uint32_t chr = swap_u16(*buff++,order);
    len--;
    const bool highSurrogate = (chr >= 0xD800) && (chr < 0xDC00);
    if (highSurrogate && len) {
        const uint16_t next = swap_u16(*buff,order);
        if (next >= 0xDC00 && next <= 0xDFFF) {
            // Valid pair: consume the low surrogate and combine both halves
            ++buff;
            --len;
            chr = 0x10000 + ((chr - 0xD800) * 0x400) + (next - 0xDC00);
        }
    }
    operator=(chr);
    return code() <= limit;
}
// Decode one Unicode character from the UTF-16 data held in a DataBlock.
// The block length must be a non-zero, even number of bytes.
// On success the bytes that were consumed are cut out of the block,
//  on failure the block is left untouched
bool UChar::decode(DataBlock& buff, Endianness order, uint32_t maxChar)
{
    operator=('\0');
    const unsigned int bytes = buff.length();
    if (!bytes || (bytes % 2))
        return false;
    uint16_t* ptr = (uint16_t*) buff.data();
    // Number of 16-bit units available, updated to what is left after decoding
    unsigned int units = bytes / 2;
    if (!decode(ptr,units,order,maxChar))
        return false;
    // Negated count of consumed bytes; presumably a negative amount cuts from
    //  the start of the block - confirm against DataBlock::cut()
    buff.cut(-(buff.length() - units * 2));
    return true;
}
// Encode this character as UTF-16 into a caller supplied buffer.
// Codes below 0x10000 take one 16-bit unit, higher ones a surrogate pair.
// The buffer pointer and the available length are updated only on success.
// Returns false for a NULL or exhausted buffer, a code above 0x10ffff or
//  when a surrogate pair does not fit in the remaining space
bool UChar::encode(uint16_t*& buff, unsigned int& len, Endianness order)
{
    if (!buff || !len || code() > 0x10ffff)
        return false;
    XDebug(DebugAll,"UChar::encode() UTF-16, char=%s (%x), order=%u",c_str(),m_chr,order);
    if (m_chr < 0x10000) {
        // Single code unit
        *buff++ = swap_u16(m_chr,order);
        len--;
        return true;
    }
    // Surrogate pair needed
    if (len < 2)
        return false; // not enough space to encode
    const uint32_t offs = m_chr - 0x10000;
    buff[0] = swap_u16((offs >> 10) + 0xD800,order);
    buff[1] = swap_u16((offs & 0x3ff) + 0xDC00,order);
    buff += 2;
    len -= 2;
    return true;
}
// Encode this character as UTF-16 and append the result to a DataBlock.
// The character is first encoded in a 2 units scratch buffer, then only the
//  units that were actually produced are appended.
// Returns false, appending nothing, if the character could not be encoded
bool UChar::encode(DataBlock& buff, Endianness order)
{
    uint16_t units[2] = {0};
    unsigned int avail = 2;
    uint16_t* ptr = units;
    if (encode(ptr,avail,order)) {
        // (2 - avail) is the number of units written in the scratch buffer
        buff.append(units,sizeof(uint16_t) * (2 - avail));
        return true;
    }
    return false;
}
// Decode a buffer of UTF-16 code units, appending each character to a String.
// If checkBOM is set and the first unit is a byte order mark it is consumed
//  and overrides the requested order: 0xfeff read in host order selects the
//  native order, 0xfffe the opposite one.
// Decoding stops with false at the first character that fails to decode;
//  the characters decoded until then remain appended to the output
bool UChar::decode(String& out, uint16_t*& buff, unsigned int& len, Endianness order, bool checkBOM, uint32_t maxChar)
{
    if (!buff || !len)
        return false;
    XDebug(DebugAll,"UChar::decode() UTF-16, out=%s, buff=%p, len=%u, order=%u, maxChar=%x",
        out.c_str(),buff,len,order,maxChar);
    if (checkBOM) {
        const uint16_t mark = *buff;
        if (mark == 0xfeff || mark == 0xfffe) {
            // 0xfeff means the data has the same endianness as the host
            order = (mark == 0xfeff) ? ENDIANNESS_NATIVE : ENDIANNESS_OPPOSITE;
            buff++;
            len--;
        }
    }
    while (buff && len) {
        UChar chr;
        if (chr.decode(buff,len,order,maxChar)) {
            out << chr;
            continue;
        }
        return false;
    }
    return true;
}
// Encode an UTF-8 string as UTF-16, appending the result to a DataBlock.
// A byte order mark is appended first if addBOM is set.
// The string pointer is advanced as characters are consumed.
// Returns false if the string is rejected by TelEngine::null(), if it holds
//  a sequence that fails UTF-8 decoding or if a character can not be encoded;
//  data appended before the failure is left in the output block
bool UChar::encode(DataBlock& out, const char*& str, Endianness order, bool addBOM)
{
    // Validate the input before tracing so the %s format is never handed
    //  a string the check would reject (same order as the buffer overload)
    if (TelEngine::null(str))
        return false;
    XDebug(DebugAll,"UChar::encode() UTF-16, str=%s, order=%u, addBOM=%s",str,order,String::boolText(addBOM));
    if (addBOM) {
        uint16_t bom = swap_u16(0xfeff,order);
        out.append(&bom,2);
    }
    UChar c;
    while (*str) {
        // Malformed UTF-8 is a failure, not a silent end of the string:
        //  reporting success here would hide a truncated output
        if (!c.decode(str))
            return false;
        if (!c.encode(out,order))
            return false;
    }
    return true;
}
// Encode an UTF-8 string as UTF-16 into a caller supplied buffer.
// A byte order mark is written first if addBOM is set.
// The buffer pointer, the available length and the string pointer are
//  updated as characters are processed.
// Returns false if the string is rejected by TelEngine::null(), if the buffer
//  is NULL or has no space, if the string holds a sequence that fails UTF-8
//  decoding or if the buffer runs out of space; units written before the
//  failure are left in the buffer
bool UChar::encode(uint16_t*& buff, unsigned int& len, const char*& str, Endianness order, bool addBOM)
{
    if (TelEngine::null(str))
        return false;
    if (!(buff && len))
        return false;
    XDebug(DebugAll,"UChar::encode() UTF-16, buff=%p, len=%u, str=%s, order=%u, addBOM=%s",
        buff,len,str,order,String::boolText(addBOM));
    if (addBOM) {
        // At least one unit is available, checked above
        uint16_t bom = swap_u16(0xfeff,order);
        *buff = bom;
        ++buff;
        len--;
    }
    UChar c;
    while (*str) {
        // Malformed UTF-8 is a failure, not a silent end of the string:
        //  reporting success here would hide a truncated output
        if (!c.decode(str))
            return false;
        if (!c.encode(buff,len,order))
            return false;
    }
    return true;
}
StringMatchPrivate::StringMatchPrivate()
{

View File

@ -693,6 +693,7 @@ struct TokenDict {
};
class String;
class DataBlock;
class Mutex;
class ObjList;
class NamedCounter;
@ -1738,6 +1739,11 @@ class StringMatchPrivate;
class YATE_API UChar
{
public:
/**
 * Byte order of the 16-bit units handled by the UTF-16 encode / decode methods
 */
enum Endianness {
LE = 0, // Little endian
BE = 1, // Big endian
Native = 2, // Byte order of the host, values are used without byte swapping
};
/**
* Constructor from unsigned numeric code
* @param code Code of the Unicode character
@ -1816,6 +1822,75 @@ public:
*/
bool decode(const char*& str, uint32_t maxChar = 0x10ffff, bool overlong = false);
/**
* Decode the first Unicode character from an UTF-16 string
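* @param buff Input buffer, advanced past the 16-bit units that were read, even if the decoded character is then rejected for exceeding maxChar
* @param len Length of input buffer in 16-bit units, decremented by the number of units that were read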
* @param order Endianness to use for decoding the character
* @param maxChar Maximum accepted Unicode character code
* @return True if decoding succeeded, false otherwise
*/
bool decode(uint16_t*& buff, unsigned int& len, Endianness order, uint32_t maxChar = 0x10ffff);
/**
* Decode the first Unicode character from an UTF-16 string
* @param buff Input buffer from which to decode the character, the decoded data is cut from it if decoding succeeds
* @param order Endianness to use for decoding the character
* @param maxChar Maximum accepted Unicode character code
* @return True if decoding succeeded, false otherwise
*/
bool decode(DataBlock& buff, Endianness order, uint32_t maxChar = 0x10ffff);
/**
* Encode the Unicode character to UTF-16 into a given buffer
* @param buff Buffer where to put encoded character, advanced after encoding
* @param len Available space in given buffer, updated after encoding
* @param order Endianness to use for encoding the character
* @return True if encoding succeeded, false otherwise
*/
bool encode(uint16_t*& buff, unsigned int& len, Endianness order);
/**
* Encode the Unicode character to UTF-16 into a DataBlock
* @param buff DataBlock to which the encoded character is to be appended
* @param order Endianness to use for encoding the character
* @return True if encoding succeeded, false otherwise
*/
bool encode(DataBlock& buff, Endianness order);
/**
* Decode a UTF-16 encoded string
* @param out String to append the decoded characters to
* @param buff Input buffer to decode, advanced as decoding occurs
* @param len Length of input buffer, decremented as decoding occurs
* @param order Endianness to use for decoding
* @param checkBOM Check for presence of BOM and interpret accordingly if present
* @param maxChar Maximum accepted Unicode character code
* @return True if decoding succeeded, false otherwise
*/
static bool decode(String& out, uint16_t*& buff, unsigned int& len, Endianness order, bool checkBOM = false, uint32_t maxChar = 0x10ffff);
/**
* Encode a string to UTF-16
* @param out DataBlock to which encoded data is to be appended
* @param str String to be encoded
* @param order Endianness to use for encoding the character
* @param addBOM True to add BOM to the resulting encoding
* @return True if encoding succeeded, false otherwise
*/
static bool encode(DataBlock& out, const char*& str, Endianness order, bool addBOM = false);
/**
* Encode a string to UTF-16 into a given buffer
* @param buff Buffer where to put encoded character, advanced after encoding
* @param len Available space in given buffer, updated after encoding
* @param str String to be encoded
* @param order Endianness to use for encoding the character
* @param addBOM True to add BOM to the resulting encoding
* @return True if encoding succeeded, false otherwise
*/
static bool encode(uint16_t*& buff, unsigned int& len, const char*& str, Endianness order, bool addBOM = false);
private:
void encode();
uint32_t m_chr;