]> git.tdb.fi Git - libs/core.git/blobdiff - source/codec.cpp
Add convenience functions to Encoder and Decoder to process a string and return the...
[libs/core.git] / source / codec.cpp
index 96d18c7c922ac51aaa14bbec8138b0b8090a98bd..42315e52e9f7179540bcb5836021328f470cee2b 100644 (file)
@@ -10,6 +10,7 @@ Distributed under the LGPL
 #include "iso2022jp.h"
 #include "iso646fi.h"
 #include "iso88591.h"
+#include "iso885915.h"
 #include "jisx0201.h"
 #include "jisx0208.h"
 #include "utf8.h"
@@ -18,14 +19,9 @@ Distributed under the LGPL
 using namespace std;
 
 namespace Msp {
+namespace Codecs {
 
-/**
-Determines whether the given string can be successfully decoded with this
-codec.  Note that this function returning true does not guarantee that the
-string was actually encoded with this codec.  In particular, many 8-bit
-encodings are indistinguishable.
-*/
-bool StringCodec::detect(const string &str) const
+bool Codec::detect(const string &str) const
 {
        Decoder *dec=create_decoder();
        bool result=true;
@@ -33,7 +29,6 @@ bool StringCodec::detect(const string &str) const
        {
                for(string::const_iterator i=str.begin(); i!=str.end(); )
                        dec->decode_char(str, i);
-               dec->sync();
        }
        catch(const CodecError &)
        {
@@ -45,31 +40,65 @@ bool StringCodec::detect(const string &str) const
        return result;
 }
 
-void StringCodec::Encoder::error(const string &msg)
+/**
+Encodes every character of str in order, appending the output to buf.  This
+overload does not call sync(); finishing the stream is left to the caller.
+*/
+void Codec::Encoder::encode(const ustring &str, string &buf)
+{
+       ustring::const_iterator pos=str.begin();
+       const ustring::const_iterator last=str.end();
+       for(; pos!=last; ++pos)
+               encode_char(*pos, buf);
+}
+
+/**
+Convenience overload: encodes the whole of str into a fresh string, calls
+sync() on it once all characters have been processed and returns the result.
+*/
+string Codec::Encoder::encode(const ustring &str)
+{
+       string encoded;
+       encode(str, encoded);
+       // sync() runs after the last character, before the buffer is handed out
+       sync(encoded);
+       return encoded;
+}
+
+/**
+Handles a character that could not be encoded, according to err_mode:
+TRANSLITERATE passes ch and buf to transliterate(), IGNORE_ERRORS does nothing
+and any other mode throws a CodecError carrying msg.
+*/
+void Codec::Encoder::error(UnicodeChar ch, string &buf, const string &msg)
 {
-       switch(err_mode_)
+       switch(err_mode)
        {
-       case IGNORE_ERRORS: break;
-       case REPLACE_ERRORS: append_replacement(); break;
-       default: throw CodecError(msg);
+       case TRANSLITERATE:
+               transliterate(ch, buf);
+               // Explicit break: do not rely on falling through into the next case
+               break;
+       case IGNORE_ERRORS:
+               break;
+       default:
+               throw CodecError(msg);
        }
 }
 
-void StringCodec::Decoder::error(const string &msg)
+
+/**
+Decodes str from beginning to end, appending each decoded character to buf.
+A value of -1 returned by decode_char is left out of the result.
+*/
+void Codec::Decoder::decode(const string &str, ustring &buf)
 {
-       switch(err_mode_)
+       string::const_iterator pos=str.begin();
+       while(pos!=str.end())
        {
-       case IGNORE_ERRORS: break;
-       case REPLACE_ERRORS: append(0xFFFD); break;
-       default: throw CodecError(msg);
+               // The loop never increments pos itself, so decode_char must advance it
+               const UnicodeChar decoded=decode_char(str, pos);
+               if(decoded==-1)
+                       continue;
+               buf+=decoded;
        }
 }
 
-/**
-Creates a codec for the given encoding.  The caller is responsible for deleting
-the codec when it's no longer needed.
-*/
-StringCodec *create_codec(const string &n)
+/**
+Convenience overload: decodes the whole of str into a fresh ustring and
+returns it.
+*/
+ustring Codec::Decoder::decode(const string &str)
+{
+       ustring decoded;
+       decode(str, decoded);
+       return decoded;
+}
+
+/**
+Handles input that could not be decoded, according to err_mode.  Returns the
+value to use in place of the undecodable input: U+FFFD for TRANSLITERATE, or
+-1 for IGNORE_ERRORS.  Any other mode throws a CodecError carrying msg.
+*/
+UnicodeChar Codec::Decoder::error(const string &msg)
+{
+       switch(err_mode)
+       {
+       case TRANSLITERATE:
+               // U+FFFD is REPLACEMENT CHARACTER.  U+FFFE is a noncharacter and must
+               // not end up in decoded text.
+               return 0xFFFD;
+       case IGNORE_ERRORS:
+               return -1;
+       default:
+               throw CodecError(msg);
+       }
+}
+
+Codec *create_codec(const string &n)
 {
        string name;
        for(string::const_iterator i=n.begin(); i!=n.end(); ++i)
@@ -84,11 +113,65 @@ StringCodec *create_codec(const string &n)
        if(name=="iso2022jp") return new Iso2022Jp;
        if(name=="iso646fi") return new Iso646Fi;
        if(name=="iso88591" || name=="latin1") return new Iso88591;
+       if(name=="iso885915" || name=="latin9") return new Iso885915;
        if(name=="jisx0201") return new JisX0201;
        if(name=="jisx0208") return new JisX0208;
        if(name=="utf8") return new Utf8;
-       if(name=="windows1252") return new Windows1252;
+       if(name=="windows1252" || name=="cp1252") return new Windows1252;
        throw InvalidParameterValue("Unknown string codec");
 }
 
+/**
+Guesses the encoding of str by inspecting its bytes and returns a codec for
+it, allocated with new (it is not freed here).  The candidates are tried in
+this order:
+
+- Ascii: no byte has the high bit set
+- Utf8: every high-bit byte belongs to a complete multi-byte sequence
+- Iso88591: no byte lies in the range 0x80-0x9F
+- Windows1252: everything else
+
+Only the structure of lead and continuation bytes is checked for UTF-8; the
+decoded values are not validated.
+*/
+Codec *detect_codec(const string &str)
+{
+       bool is_utf8=true;
+       bool is_ascii=true;
+       bool is_latin1=true;
+       // Number of continuation bytes still expected for the current sequence
+       unsigned utf8_mb=0;
+
+       for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
+       {
+               unsigned char c=*i;
+               if(c&0x80)
+               {
+                       is_ascii=false;
+                       if((c&0xC0)==0x80)
+                       {
+                               // 10xxxxxx: a UTF-8 continuation byte
+                               if((c&0xE0)==0x80)
+                                       is_latin1=false;
+                               if(utf8_mb)
+                                       --utf8_mb;
+                               else
+                                       is_utf8=false;
+                       }
+                       else if((c&0xC0)==0xC0)
+                       {
+                               // 11xxxxxx: a UTF-8 lead byte
+                               if(utf8_mb)
+                               {
+                                       is_utf8=false;
+                                       utf8_mb=0;
+                               }
+                               else if(c>=0xFE)
+                               {
+                                       // 0xFE and 0xFF never occur in UTF-8.  Rejecting them here
+                                       // also keeps the shift count below from going negative.
+                                       is_utf8=false;
+                               }
+                               else
+                               {
+                                       // Each extra 1 bit after the leading 11 adds a continuation byte
+                                       for(utf8_mb=1; (c>>(6-utf8_mb))&1; ++utf8_mb) ;
+                               }
+                       }
+               }
+               else if(utf8_mb)
+               {
+                       is_utf8=false;
+                       utf8_mb=0;
+               }
+       }
+
+       // A sequence cut short by the end of the string is not valid UTF-8
+       if(utf8_mb)
+               is_utf8=false;
+
+       if(is_ascii)
+               return new Ascii;
+       else if(is_utf8)
+               return new Utf8;
+       else if(is_latin1)
+               return new Iso88591;
+       else
+               return new Windows1252;
+}
+
+} // namespace Codecs
 } // namespace Msp