using namespace std;
namespace Msp {
+namespace Codecs {
-void Utf8::Encoder::encode_char(wchar_t c)
+// Appends the UTF-8 encoding of ch to buf.
+// Values that UTF-8 cannot represent (negative, above 0x10FFFF, or in the
+// surrogate range 0xD800..0xDFFF) are handed to error(ch, buf, message)
+// instead; what error() does with them is defined outside this function.
+void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf)
{
- unsigned code=c;
- if(code>0x10FFFF)
- {
- error("Can't express character in UTF-8");
- return;
- }
+ // Surrogates are rejected as well: they have no well-formed UTF-8 form and
+ // Utf8::Decoder::decode_char refuses them, so emitting them would produce
+ // output this codec cannot read back.
+ if(ch<0 || ch>0x10FFFF || (ch>=0xD800 && ch<=0xDFFF))
+ return error(ch, buf, "Can't express character in UTF-8");
unsigned bytes=1;
- if(code>0xFFFF)
+ if(ch>0xFFFF)
bytes=4;
- else if(code>0x7FF)
+ else if(ch>0x7FF)
bytes=3;
- else if(code>0x7F)
+ else if(ch>0x7F)
bytes=2;
if(bytes==1)
- append(code);
+ buf+=ch; // 0..0x7F is stored as a single byte
else
{
- char buf[4];
-
- buf[0]=0xFF<<(8-bytes) | code>>(bytes*6-6);
+ char utf[4];
+
+ // Head byte: after conversion to char only the low eight bits of
+ // 0xFF<<(8-bytes) remain, i.e. the top `bytes` bits set (0xC0, 0xE0 or
+ // 0xF0), combined with the most significant payload bits of ch.
+ utf[0]=0xFF<<(8-bytes) | ch>>(bytes*6-6);
+ // Tail bytes are filled from last to first, six payload bits each.
for(unsigned j=bytes-1; j>0; --j)
{
- buf[j]=0x80 | code&0x3F;
- code>>=6;
+ utf[j]=0x80 | (ch&0x3F);
+ ch>>=6;
}
- append(buf, bytes);
+ buf.append(utf, bytes);
}
}
-
-void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
+void Utf8::Encoder::transliterate(UnicodeChar, string &buf) // Ignores the (unnamed) character argument and appends a fixed three-byte sequence to buf.
{
- while(i!=str.end())
- {
- if(bytes==0)
- {
- if((*i&0xC0)==0x80)
- {
- error("Invalid UTF-8 string (tail byte when expecting head)");
- ++i;
- break;
- }
- else if(*i&0x80)
- {
- unsigned mask=0x40;
- for(; *i&mask; mask>>=1)
- ++bytes;
-
- if(bytes>3)
- {
- error("Invalid UTF-8 string (overlong multibyte sequence)");
- ++i;
- break;
- }
- else
- {
- code=(*i++)&(mask-1);
- if(!code)
- {
- error("Invalid UTF-8 string (denormalized multibyte sequence)");
- break;
- }
- }
- }
- else
- {
- append(*i++);
- break;
- }
- }
- else
- {
- if((*i&0xC0)!=0x80)
- {
- error("Invalid UTF-8 string (head byte when expecting tail)");
- ++i;
- break;
- }
-
- code=code<<6 | (*i++)&0x3F;
- --bytes;
-
- if(!bytes)
- {
- if(code>0x10FFFF)
- error("Invalid UTF-8 string (character out of range)");
- else
- append(code);
- break;
- }
- }
- }
+ buf.append("\357\277\275", 3); // Octal escapes for bytes EF BF BD, the UTF-8 form of U+FFFD REPLACEMENT CHARACTER (�)
}
-void Utf8::Decoder::sync()
+
+// Decodes one UTF-8 encoded character from str, starting at i, and advances
+// i past the bytes that were consumed.
+// Returns the decoded code point, or whatever error() returns when the input
+// is exhausted, malformed, denormalized (overlong) or not a valid Unicode
+// code point.  The behaviour of error() is defined outside this function.
+UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
{
- if(bytes)
+ if(i==str.end())
+ return error("No input");
+
+ if((*i&0xC0)==0x80)
{
- error("Sync in the middle of multibyte UTF-8 sequence");
- bytes=0;
+ // A tail byte (10xxxxxx) cannot start a character; skip just this byte.
+ UnicodeChar result=error("UTF-8 tail byte found when expecting head");
+ ++i;
+ return result;
}
-}
+ else if(*i&0x80)
+ {
+ // Count the leading one bits of the head byte: the top two are already
+ // known to be set, and every further set bit adds one byte to the
+ // sequence.  Afterwards mask marks the first zero bit, so mask-1
+ // selects the payload bits below it.
+ unsigned bytes=2;
+ unsigned mask=0x20;
+ for(; *i&mask; mask>>=1)
+ ++bytes;
-void Utf8::Decoder::reset()
-{
- bytes=0;
+
+ // Head bytes 0xF8..0xFF announce five or more bytes, which UTF-8 never
+ // uses.  Reject them before decoding: for 0xFF the denormalization
+ // check below would shift by bytes*5-4==36 bits, which is undefined if
+ // UnicodeChar is 32 bits wide (its definition is not visible here), and
+ // the payload would not fit in 32 bits either.  The tail bytes that
+ // follow are still skipped so the sequence yields a single error.
+ if(bytes>4)
+ {
+ string::const_iterator j=i;
+ ++j;
+ for(unsigned k=1; (k<bytes && j!=str.end() && (*j&0xC0)==0x80); ++k)
+ ++j;
+
+ UnicodeChar result=error("Invalid UTF-8 head byte");
+ i=j;
+ return result;
+ }
+
+ string::const_iterator j=i;
+
+ UnicodeChar result=(*j++)&(mask-1);
+
+ // Accumulate six payload bits from each tail byte; stop early at the
+ // end of input or at the first byte that is not a tail byte.
+ unsigned k;
+ for(k=1; (k<bytes && j!=str.end() && (*j&0xC0)==0x80); ++k)
+ result=(result<<6) | ((*j++)&0x3F);
+
+ if(k<bytes)
+ result=error("Incomplete UTF-8 character");
+ // The value must actually need this many bytes: at least 0x80 for any
+ // multibyte form, and bit bytes*5-4 or above must be set (0x800 for
+ // three bytes, 0x10000 for four).
+ else if(!(result>>(bytes*5-4)) || !(result>>7))
+ result=error("Denormalized UTF-8 multibyte sequence");
+ else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF))
+ result=error("Invalid Unicode code point");
+
+ i=j;
+ return result;
+ }
+ else
+ return *i++; // plain 7-bit byte: the code point is the byte itself
}
+} // namespace Codecs
} // namespace Msp