Refuse to encode invalid Unicode characters in UTF-8

[libs/core.git] / source / stringcodec / utf8.cpp
diff --git a/source/stringcodec/utf8.cpp b/source/stringcodec/utf8.cpp

index c7e1705f814e2d099d3b0180730c57e3aa210d85..b75b39780edfa6269ad09fa780ca9a565f574872 100644 (file)
--- a/source/stringcodec/utf8.cpp
+++ b/source/stringcodec/utf8.cpp
@@ -1,21 +1,14 @@
-/* $Id$
-
-This file is part of libmspstrings
-Copyright © 2006-2007 Mikko Rasa
-Distributed under the LGPL
-*/
-
  #include "utf8.h"
  
  using namespace std;
  
  namespace Msp {
-namespace Codecs {
+namespace StringCodec {
  
-void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf)
+void Utf8::Encoder::encode_char(unichar ch, string &buf)
  {
-       if(ch<0 || ch>0x10FFFF)
-               return error(ch, buf, "Can't express character in UTF-8");
+       if(!is_valid_unichar(ch))
+               return error(ch, buf, invalid_character(ch, "UTF-8"));
  
         unsigned bytes = 1;
         if(ch>0xFFFF)
@@ -42,20 +35,20 @@ void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf)
         }
  }
  
-void Utf8::Encoder::transliterate(UnicodeChar, string &buf)
+void Utf8::Encoder::transliterate(unichar, string &buf)
  {
-       buf.append("\357\277\275", 3);  // � U+FFFE Replacement Character
+       buf.append("\357\277\275", 3);  // � U+FFFD Replacement Character
  }
  
  
-UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
+unichar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
  {
         if(i==str.end())
-               return error("No input");
+               return -1;
  
         if((*i&0xC0)==0x80)
         {
-               UnicodeChar result = error("UTF-8 tail byte found when expecting head");
+               unichar result = error(invalid_sequence(i, i+1, "stray UTF-8 tail byte"));
                 ++i;
                 return result;
         }
@@ -68,18 +61,18 @@ UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator
  
                 string::const_iterator j = i;
  
-               UnicodeChar result = (*j++)&(mask-1);
+               unichar result = (*j++)&(mask-1);
  
                 unsigned k;
                 for(k=1; (k<bytes && j!=str.end() && (*j&0xC0)==0x80); ++k)
                         result = (result<<6) | ((*j++)&0x3F);
  
                 if(k<bytes)
-                       result = error("Incomplete UTF-8 character");
+                       result = error(invalid_sequence(i, j, "incomplete UTF-8 character"));
                 else if(!(result>>(bytes*5-4)) || !(result>>7))
-                       result = error("Denormalized UTF-8 multibyte sequence");
-               else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF))
-                       result = error("Invalid Unicode code point");
+                       result = error(invalid_sequence(i, j, "denormalized UTF-8 sequence"));
+               else if(!is_valid_unichar(result))
+                       result = error(invalid_sequence(i, j, "undefined UTF-8 character"));
  
                 i = j;
                 return result;
@@ -88,5 +81,5 @@ UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator
                 return *i++;
  }
  
-} // namespace Codecs
+} // namespace StringCodec
  } // namespace Msp