X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fstringcodec%2Futf8.cpp;fp=source%2Fstringcodec%2Futf8.cpp;h=c7e1705f814e2d099d3b0180730c57e3aa210d85;hp=0000000000000000000000000000000000000000;hb=b42ed73a1b241c0e93ee03c43c4584b41c549bac;hpb=5b1368cb791cab043f0435628cacbaff36e39b7b diff --git a/source/stringcodec/utf8.cpp b/source/stringcodec/utf8.cpp new file mode 100644 index 0000000..c7e1705 --- /dev/null +++ b/source/stringcodec/utf8.cpp @@ -0,0 +1,92 @@ +/* $Id$ + +This file is part of libmspstrings +Copyright © 2006-2007 Mikko Rasa +Distributed under the LGPL +*/ + +#include "utf8.h" + +using namespace std; + +namespace Msp { +namespace Codecs { + +void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf) +{ + if(ch<0 || ch>0x10FFFF) + return error(ch, buf, "Can't express character in UTF-8"); + + unsigned bytes = 1; + if(ch>0xFFFF) + bytes = 4; + else if(ch>0x7FF) + bytes = 3; + else if(ch>0x7F) + bytes = 2; + + if(bytes==1) + buf += ch; + else + { + char utf[4]; + + utf[0] = 0xFF<<(8-bytes) | ch>>(bytes*6-6); + for(unsigned j=bytes-1; j>0; --j) + { + utf[j] = 0x80 | (ch&0x3F); + ch >>= 6; + } + + buf.append(utf, bytes); + } +} + +void Utf8::Encoder::transliterate(UnicodeChar, string &buf) +{ + buf.append("\357\277\275", 3); // � U+FFFE Replacement Character +} + + +UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) +{ + if(i==str.end()) + return error("No input"); + + if((*i&0xC0)==0x80) + { + UnicodeChar result = error("UTF-8 tail byte found when expecting head"); + ++i; + return result; + } + else if(*i&0x80) + { + unsigned bytes = 2; + unsigned mask = 0x20; + for(; *i&mask; mask>>=1) + ++bytes; + + string::const_iterator j = i; + + UnicodeChar result = (*j++)&(mask-1); + + unsigned k; + for(k=1; (k>(bytes*5-4)) || !(result>>7)) + result = error("Denormalized UTF-8 multibyte sequence"); + else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF)) + result = error("Invalid Unicode code point"); + + i = j; + return result; + } + else + return *i++; +} + +} // namespace Codecs +} // namespace Msp