X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fstringcodec%2Futf8.cpp;h=b75b39780edfa6269ad09fa780ca9a565f574872;hp=c7e1705f814e2d099d3b0180730c57e3aa210d85;hb=8245b8036c8bdc51625616ca6248b0f2b0271dc0;hpb=b42ed73a1b241c0e93ee03c43c4584b41c549bac diff --git a/source/stringcodec/utf8.cpp b/source/stringcodec/utf8.cpp index c7e1705..b75b397 100644 --- a/source/stringcodec/utf8.cpp +++ b/source/stringcodec/utf8.cpp @@ -1,21 +1,14 @@ -/* $Id$ - -This file is part of libmspstrings -Copyright © 2006-2007 Mikko Rasa -Distributed under the LGPL -*/ - #include "utf8.h" using namespace std; namespace Msp { -namespace Codecs { +namespace StringCodec { -void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf) +void Utf8::Encoder::encode_char(unichar ch, string &buf) { - if(ch<0 || ch>0x10FFFF) - return error(ch, buf, "Can't express character in UTF-8"); + if(!is_valid_unichar(ch)) + return error(ch, buf, invalid_character(ch, "UTF-8")); unsigned bytes = 1; if(ch>0xFFFF) @@ -42,20 +35,20 @@ void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf) } } -void Utf8::Encoder::transliterate(UnicodeChar, string &buf) +void Utf8::Encoder::transliterate(unichar, string &buf) { - buf.append("\357\277\275", 3); // � U+FFFE Replacement Character + buf.append("\357\277\275", 3); // � U+FFFD Replacement Character } -UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) +unichar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) { if(i==str.end()) - return error("No input"); + return -1; if((*i&0xC0)==0x80) { - UnicodeChar result = error("UTF-8 tail byte found when expecting head"); + unichar result = error(invalid_sequence(i, i+1, "stray UTF-8 head byte")); ++i; return result; } @@ -68,18 +61,18 @@ UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator string::const_iterator j = i; - UnicodeChar result = (*j++)&(mask-1); + unichar result = (*j++)&(mask-1); unsigned k; for(k=1; (k>(bytes*5-4)) || !(result>>7)) - result = error("Denormalized UTF-8 multibyte sequence"); - else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF)) - result = error("Invalid Unicode code point"); + result = error(invalid_sequence(i, j, "denormalized UTF-8 sequence")); + else if(!is_valid_unichar(result)) + result = error(invalid_sequence(i, j, "undefined UTF-8 character")); i = j; return result; @@ -88,5 +81,5 @@ UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator return *i++; } -} // namespace Codecs +} // namespace StringCodec } // namespace Msp