Rework the codec API completely to remove the internal buffering

[libs/core.git] / source / utf8.cpp
diff --git a/source/utf8.cpp b/source/utf8.cpp

index ce3b984edcf7c8062d6a9db6643be9c20c5557e8..2d0458f3254b96c69ff827c0f2e86f5a44ab220b 100644 (file)
--- a/source/utf8.cpp
+++ b/source/utf8.cpp
@@ -10,118 +10,91 @@ Distributed under the LGPL
  using namespace std;
  
  namespace Msp {
+namespace Codecs {
  
-void Utf8::Encoder::encode_char(wchar_t c)
+void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf)  // appends the UTF-8 encoding of ch to buf
  {
-       unsigned code=c;
-       if(code>0x10FFFF)
-       {
-               error("Can't express character in UTF-8");
-               return;
-       }
+       if(ch<0 || ch>0x10FFFF)  // only values in 0..0x10FFFF are encoded
+               return error(ch, buf, "Can't express character in UTF-8");  // error() is declared outside this file; what it does with ch/buf is not visible here
  
         unsigned bytes=1;
-       if(code>0xFFFF)
+       if(ch>0xFFFF)  // more than 16 bits of payload: four-byte sequence
                 bytes=4;
-       else if(code>0x7FF)
+       else if(ch>0x7FF)  // more than 11 bits of payload: three-byte sequence
                 bytes=3;
-       else if(code>0x7F)
+       else if(ch>0x7F)  // more than 7 bits of payload: two-byte sequence
                 bytes=2;
  
         if(bytes==1)
-               append(code);
+               buf+=ch;  // 0..0x7F is stored as a single byte
         else
         {
-               char buf[4];
-               
-               buf[0]=0xFF<<(8-bytes) | code>>(bytes*6-6);
+               char utf[4];  // scratch space; only the first 'bytes' elements are filled
+
+               utf[0]=0xFF<<(8-bytes) | ch>>(bytes*6-6);  // head byte: 'bytes' leading one-bits, then the topmost payload bits
                 for(unsigned j=bytes-1; j>0; --j)
                 {
-                       buf[j]=0x80 | code&0x3F;
-                       code>>=6;
+                       utf[j]=0x80 | ch&0x3F;  // tail byte: 10xxxxxx carrying the current low six bits
+                       ch>>=6;  // drop the six bits just stored; tails are filled last-to-first
                 }
  
-               append(buf, bytes);
+               buf.append(utf, bytes);  // append exactly 'bytes' bytes
         }
  }
  
+void Utf8::Encoder::transliterate(UnicodeChar, string &buf)  // the character argument is unused; the same three bytes are always appended
+{
+       buf.append("\357\277\275", 3);  // EF BF BD = U+FFFD REPLACEMENT CHARACTER
+}
+
  
-void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
+UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
  {
-       while(i!=str.end())
+       if(i==str.end())
+               return error("No input");
+
+       if((*i&0xC0)==0x80)
         {
-               if(bytes==0)
-               {
-                       if((*i&0xC0)==0x80)
-                       {
-                               error("Invalid UTF-8 string (tail byte when expecting head)");
-                               ++i;
-                               break;
-                       }
-                       else if(*i&0x80)
-                       {
-                               unsigned mask=0x40;
-                               for(; *i&mask; mask>>=1)
-                                       ++bytes;
-
-                               if(bytes>3)
-                               {
-                                       error("Invalid UTF-8 string (overlong multibyte sequence)");
-                                       ++i;
-                                       break;
-                               }
-                               else
-                               {
-                                       code=(*i++)&(mask-1);
-                                       if(!code)
-                                       {
-                                               error("Invalid UTF-8 string (denormalized multibyte sequence)");
-                                               break;
-                                       }
-                               }
-                       }
-                       else
-                       {
-                               append(*i++);
-                               break;
-                       }
-               }
-               else
-               {
-                       if((*i&0xC0)!=0x80)
-                       {
-                               error("Invalid UTF-8 string (head byte when expecting tail)");
-                               ++i;
-                               break;
-                       }
+               UnicodeChar result=error("UTF-8 tail byte found when expecting head");
+               ++i;
+               return result;
+       }
+       else if(*i&0x80)
+       {
+               unsigned bytes=2;
+               unsigned mask=0x20;
+               for(; *i&mask; mask>>=1)
+                       ++bytes;
  
-                       code=code<<6 | (*i++)&0x3F;
-                       --bytes;
+               string::const_iterator j=i;
  
-                       if(!bytes)
+               UnicodeChar result=(*j++)&(mask-1);
+
+               unsigned k;
+               for(k=1; (k<bytes && j!=str.end()); ++k)
+               {
+                       if((*j&0xC0)!=0x80)
                         {
-                               if(code>0x10FFFF)
-                                       error("Invalid UTF-8 string (character out of range)");
-                               else
-                                       append(code);
-                               break;
+                               result=error("Incomplete UTF-8 character");
+                               i=j;
+                               return result;
                         }
+                       result=result<<6 | (*j++)&0x3F;
                 }
-       }
-}
  
-void Utf8::Decoder::sync()
-{
-       if(bytes)
-       {
-               error("Sync in the middle of multibyte UTF-8 sequence");
-               bytes=0;
-       }
-}
+               if(k<bytes)
+                       result=error("Incomplete UTF-8 character at end of input");
+               else if(!result>>(bytes*6-6))
+                       result=error("Denormalized UTF-8 multibyte sequence");
+               else if(result>0x10FFFF)
+                       result=error("Invalid Unicode code point");
  
-void Utf8::Decoder::reset()
-{
-       bytes=0;
+               i=j;
+               return result;
+       }
+       else
+               return *i++;
  }
  
+} // namespace Codecs
  } // namespace Msp