git.tdb.fi Git - libs/core.git/blobdiff - source/stringcodec/utf8.cpp
Refuse to encode invalid Unicode characters in UTF-8
[libs/core.git] / source / stringcodec / utf8.cpp
index c7e1705f814e2d099d3b0180730c57e3aa210d85..b75b39780edfa6269ad09fa780ca9a565f574872 100644 (file)
@@ -1,21 +1,14 @@
-/* $Id$
-
-This file is part of libmspstrings
-Copyright © 2006-2007 Mikko Rasa
-Distributed under the LGPL
-*/
-
 #include "utf8.h"
 
 using namespace std;
 
 namespace Msp {
-namespace Codecs {
+namespace StringCodec {
 
-void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf)
+void Utf8::Encoder::encode_char(unichar ch, string &buf)  // presumably appends the UTF-8 form of ch to buf; most of the body is elided between hunks
 {
-       if(ch<0 || ch>0x10FFFF)
-               return error(ch, buf, "Can't express character in UTF-8");
+       if(!is_valid_unichar(ch))  // validity check is delegated to is_valid_unichar(); the removed lines only range-checked ch
+               return error(ch, buf, invalid_character(ch, "UTF-8"));  // rejected characters go to error() instead of being encoded
 
        unsigned bytes = 1;
        if(ch>0xFFFF)
@@ -42,20 +35,20 @@ void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf)
        }
 }
 
-void Utf8::Encoder::transliterate(UnicodeChar, string &buf)
+void Utf8::Encoder::transliterate(unichar, string &buf)  // the character argument is unnamed and unused; the same bytes are appended for every input
 {
-       buf.append("\357\277\275", 3);  // � U+FFFE Replacement Character
+       buf.append("\357\277\275", 3);  // bytes EF BF BD, the UTF-8 encoding of � U+FFFD Replacement Character
 }
 
 
-UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
+unichar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
 {
        if(i==str.end())
-               return error("No input");
+               return -1;
 
        if((*i&0xC0)==0x80)
        {
-               UnicodeChar result = error("UTF-8 tail byte found when expecting head");
+               unichar result = error(invalid_sequence(i, i+1, "stray UTF-8 head byte"));
                ++i;
                return result;
        }
@@ -68,18 +61,18 @@ UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator
 
                string::const_iterator j = i;
 
-               UnicodeChar result = (*j++)&(mask-1);
+               unichar result = (*j++)&(mask-1);
 
                unsigned k;
                for(k=1; (k<bytes && j!=str.end() && (*j&0xC0)==0x80); ++k)
                        result = (result<<6) | ((*j++)&0x3F);
 
                if(k<bytes)
-                       result = error("Incomplete UTF-8 character");
+                       result = error(invalid_sequence(i, j, "incomplete UTF-8 character"));
                else if(!(result>>(bytes*5-4)) || !(result>>7))
-                       result = error("Denormalized UTF-8 multibyte sequence");
-               else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF))
-                       result = error("Invalid Unicode code point");
+                       result = error(invalid_sequence(i, j, "denormalized UTF-8 sequence"));
+               else if(!is_valid_unichar(result))
+                       result = error(invalid_sequence(i, j, "undefined UTF-8 character"));
 
                i = j;
                return result;
@@ -88,5 +81,5 @@ UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator
                return *i++;
 }
 
-} // namespace Codecs
+} // namespace StringCodec
 } // namespace Msp