]> git.tdb.fi Git - libs/core.git/blobdiff - source/utf8.cpp
Further style and comment adjustments
[libs/core.git] / source / utf8.cpp
index 0217790277a2a1b14112668552cc1675125028d6..c7e1705f814e2d099d3b0180730c57e3aa210d85 100644 (file)
@@ -1,92 +1,92 @@
+/* $Id$
+
+This file is part of libmspstrings
+Copyright © 2006-2007 Mikko Rasa
+Distributed under the LGPL
+*/
+
 #include "utf8.h"
 
 using namespace std;
 
 namespace Msp {
+namespace Codecs {
 
-void Utf8::Encoder::encode_char(wchar_t c)
+void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf)
 {
-       unsigned code=c;
-       if(code>0x10FFFF)
-               throw CodecError("Can't express character in UTF-8");
-
-       unsigned bytes=1;
-       if(code>0xFFFF)
-               bytes=4;
-       else if(code>0x7FF)
-               bytes=3;
-       else if(code>0x7F)
-               bytes=2;
+       if(ch<0 || ch>0x10FFFF)
+               return error(ch, buf, "Can't express character in UTF-8");
+
+       unsigned bytes = 1;
+       if(ch>0xFFFF)
+               bytes = 4;
+       else if(ch>0x7FF)
+               bytes = 3;
+       else if(ch>0x7F)
+               bytes = 2;
 
        if(bytes==1)
-               append(code);
+               buf += ch;
        else
        {
-               char buf[4];
-               
-               buf[0]=0xFF<<(8-bytes) | code>>(bytes*6-6);
+               char utf[4];
+
+               utf[0] = 0xFF<<(8-bytes) | ch>>(bytes*6-6);
                for(unsigned j=bytes-1; j>0; --j)
                {
-                       buf[j]=0x80 | code&0x3F;
-                       code>>=6;
+                       utf[j] = 0x80 | (ch&0x3F);
+                       ch >>= 6;
                }
 
-               append(buf, bytes);
+               buf.append(utf, bytes);
        }
 }
 
+void Utf8::Encoder::transliterate(UnicodeChar, string &buf)
+{
+       buf.append("\357\277\275", 3);  // � U+FFFD Replacement Character
+}
 
-void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
+
+UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
 {
-       while(i!=str.end())
+       if(i==str.end())
+               return error("No input");
+
+       if((*i&0xC0)==0x80)
        {
-               if(bytes==0)
-               {
-                       if((*i&0xC0)==0x80)
-                               throw CodecError("Invalid UTF-8 string (tail byte when expecting head)");
-
-                       else if(*i&0x80)
-                       {
-                               unsigned mask=0x40;
-                               for(; *i&mask; mask>>=1)
-                                       ++bytes;
-
-                               if(bytes>3)
-                                       throw CodecError("Invalid UTF-8 string (overlong multibyte sequence)");
-
-                               code=(*i++)&(mask-1);
-                               if(!code)
-                                       throw CodecError("Invalid UTF-8 string (denormalized multibyte sequence)");
-                       }
-                       else
-                       {
-                               append(*i++);
-                               break;
-                       }
-               }
-               else
-               {
-                       if((*i&0xC0)!=0x80)
-                               throw CodecError("Invalid UTF-8 string (head byte when expecting tail)");
-
-                       code=code<<6 | (*i++)&0x3F;
-                       --bytes;
-
-                       if(!bytes)
-                       {
-                               if(code>0x10FFFF)
-                                       throw CodecError("Invalid UTF-8 string (character out of range)");
-                               append(code);
-                               break;
-                       }
-               }
+               UnicodeChar result = error("UTF-8 tail byte found when expecting head");
+               ++i;
+               return result;
        }
-}
+       else if(*i&0x80)
+       {
+               unsigned bytes = 2;
+               unsigned mask = 0x20;
+               for(; *i&mask; mask>>=1)
+                       ++bytes;
 
-void Utf8::Decoder::sync()
-{
-       if(bytes)
-               throw CodecError("Sync in the middle of multibyte UTF-8 sequence");
+               string::const_iterator j = i;
+
+               UnicodeChar result = (*j++)&(mask-1);
+
+               unsigned k;
+               for(k=1; (k<bytes && j!=str.end() && (*j&0xC0)==0x80); ++k)
+                       result = (result<<6) | ((*j++)&0x3F);
+
+               if(k<bytes)
+                       result = error("Incomplete UTF-8 character");
+               else if(!(result>>(bytes*5-4)) || !(result>>7))
+                       result = error("Denormalized UTF-8 multibyte sequence");
+               else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF))
+                       result = error("Invalid Unicode code point");
+
+               i = j;
+               return result;
+       }
+       else
+               return *i++;
 }
 
+} // namespace Codecs
 } // namespace Msp