git.tdb.fi Git - libs/core.git/blobdiff - source/stringcodec/utf8.cpp
Refuse to encode invalid Unicode characters in UTF-8
[libs/core.git] / source / stringcodec / utf8.cpp
index 19fe488282561dafbe1d4b788ec3bee0b8cb1b18..b75b39780edfa6269ad09fa780ca9a565f574872 100644 (file)
@@ -7,8 +7,8 @@ namespace StringCodec {
 
 void Utf8::Encoder::encode_char(unichar ch, string &buf)
 {
-       if(ch<0 || ch>0x10FFFF)
-               return error(ch, buf, "Can't express character in UTF-8");
+       if(!is_valid_unichar(ch))
+               return error(ch, buf, invalid_character(ch, "UTF-8"));
 
        unsigned bytes = 1;
        if(ch>0xFFFF)
@@ -48,7 +48,7 @@ unichar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
 
        if((*i&0xC0)==0x80)
        {
-               unichar result = error("UTF-8 tail byte found when expecting head");
+               unichar result = error(invalid_sequence(i, i+1, "stray UTF-8 head byte"));
                ++i;
                return result;
        }
@@ -68,11 +68,11 @@ unichar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
                        result = (result<<6) | ((*j++)&0x3F);
 
                if(k<bytes)
-                       result = error("Incomplete UTF-8 character");
+                       result = error(invalid_sequence(i, j, "incomplete UTF-8 character"));
                else if(!(result>>(bytes*5-4)) || !(result>>7))
-                       result = error("Denormalized UTF-8 multibyte sequence");
-               else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF))
-                       result = error("Invalid Unicode code point");
+                       result = error(invalid_sequence(i, j, "denormalized UTF-8 sequence"));
+               else if(!is_valid_unichar(result))
+                       result = error(invalid_sequence(i, j, "undefined UTF-8 character"));
 
                i = j;
                return result;