X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Futf8.cpp;h=dd01150803186dc68a4687668026a94e23b3dee4;hp=030406d3abcfc84dc32fd3d73a8812746d36d41d;hb=a0a5c796a6ec11a13c49912672a82bf1857bbc71;hpb=d2118ac101602cfe2d62fb7deb6ef3fcb0fe137b diff --git a/source/utf8.cpp b/source/utf8.cpp index 030406d..dd01150 100644 --- a/source/utf8.cpp +++ b/source/utf8.cpp @@ -1,120 +1,92 @@ +/* $Id$ + +This file is part of libmspstrings +Copyright © 2006-2007 Mikko Rasa +Distributed under the LGPL +*/ + #include "utf8.h" using namespace std; namespace Msp { +namespace Codecs { -void Utf8::Encoder::encode_char(wchar_t c) +void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf) { - unsigned code=c; - if(code>0x10FFFF) - { - error("Can't express character in UTF-8"); - return; - } + if(ch<0 || ch>0x10FFFF) + return error(ch, buf, "Can't express character in UTF-8"); unsigned bytes=1; - if(code>0xFFFF) + if(ch>0xFFFF) bytes=4; - else if(code>0x7FF) + else if(ch>0x7FF) bytes=3; - else if(code>0x7F) + else if(ch>0x7F) bytes=2; if(bytes==1) - append(code); + buf+=ch; else { - char buf[4]; - - buf[0]=0xFF<<(8-bytes) | code>>(bytes*6-6); + char utf[4]; + + utf[0]=0xFF<<(8-bytes) | ch>>(bytes*6-6); for(unsigned j=bytes-1; j>0; --j) { - buf[j]=0x80 | code&0x3F; - code>>=6; + utf[j]=0x80 | (ch&0x3F); + ch>>=6; } - append(buf, bytes); + buf.append(utf, bytes); } } - -void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) +void Utf8::Encoder::transliterate(UnicodeChar, string &buf) { - while(i!=str.end()) - { - if(bytes==0) - { - if((*i&0xC0)==0x80) - { - error("Invalid UTF-8 string (tail byte when expecting head)"); - ++i; - break; - } - else if(*i&0x80) - { - unsigned mask=0x40; - for(; *i&mask; mask>>=1) - ++bytes; - - if(bytes>3) - { - error("Invalid UTF-8 string (overlong multibyte sequence)"); - ++i; - break; - } - else - { - code=(*i++)&(mask-1); - if(!code) - { - error("Invalid UTF-8 string (denormalized multibyte sequence)"); - break; 
- } - } - } - else - { - append(*i++); - break; - } - } - else - { - if((*i&0xC0)!=0x80) - { - error("Invalid UTF-8 string (head byte when expecting tail)"); - ++i; - break; - } - - code=code<<6 | (*i++)&0x3F; - --bytes; - - if(!bytes) - { - if(code>0x10FFFF) - error("Invalid UTF-8 string (character out of range)"); - else - append(code); - break; - } - } - } + buf.append("\357\277\275", 3); // � U+FFFD Replacement Character } -void Utf8::Decoder::sync() + +UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) { - if(bytes) + if(i==str.end()) + return error("No input"); + + if((*i&0xC0)==0x80) { - error("Sync in the middle of multibyte UTF-8 sequence"); - bytes=0; + UnicodeChar result=error("UTF-8 tail byte found when expecting head"); + ++i; + return result; } -} + else if(*i&0x80) + { + unsigned bytes=2; + unsigned mask=0x20; + for(; *i&mask; mask>>=1) + ++bytes; -void Utf8::Decoder::reset() -{ - bytes=0; + string::const_iterator j=i; + + UnicodeChar result=(*j++)&(mask-1); + + unsigned k; + for(k=1; (k>(bytes*5-4)) || !(result>>7)) + result=error("Denormalized UTF-8 multibyte sequence"); + else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF)) + result=error("Invalid Unicode code point"); + + i=j; + return result; + } + else + return *i++; } +} // namespace Codecs } // namespace Msp