X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fcodec.cpp;h=42315e52e9f7179540bcb5836021328f470cee2b;hp=96d18c7c922ac51aaa14bbec8138b0b8090a98bd;hb=0ac97daad8d24f6762e1b870fec782c7aace695a;hpb=6cbf9d2160a9f8e1ef98acb63ead3a14c88e2703 diff --git a/source/codec.cpp b/source/codec.cpp index 96d18c7..42315e5 100644 --- a/source/codec.cpp +++ b/source/codec.cpp @@ -10,6 +10,7 @@ Distributed under the LGPL #include "iso2022jp.h" #include "iso646fi.h" #include "iso88591.h" +#include "iso885915.h" #include "jisx0201.h" #include "jisx0208.h" #include "utf8.h" @@ -18,14 +19,9 @@ Distributed under the LGPL using namespace std; namespace Msp { +namespace Codecs { -/** -Determines whether the given string can be successfully decoded with this -codec. Note that this function returning true does not guarantee that the -string was actually encoded with this codec. In particular, many 8-bit -encodings are indistinguishable. -*/ -bool StringCodec::detect(const string &str) const +bool Codec::detect(const string &str) const { Decoder *dec=create_decoder(); bool result=true; @@ -33,7 +29,6 @@ bool StringCodec::detect(const string &str) const { for(string::const_iterator i=str.begin(); i!=str.end(); ) dec->decode_char(str, i); - dec->sync(); } catch(const CodecError &) { @@ -45,31 +40,65 @@ bool StringCodec::detect(const string &str) const return result; } -void StringCodec::Encoder::error(const string &msg) +void Codec::Encoder::encode(const ustring &str, string &buf) +{ + for(ustring::const_iterator i=str.begin(); i!=str.end(); ++i) + encode_char(*i, buf); +} + +string Codec::Encoder::encode(const ustring &str) +{ + string buf; + encode(str, buf); + sync(buf); + return buf; +} + +void Codec::Encoder::error(UnicodeChar ch, string &buf, const string &msg) { - switch(err_mode_) + switch(err_mode) { - case IGNORE_ERRORS: break; - case REPLACE_ERRORS: append_replacement(); break; - default: throw CodecError(msg); + case TRANSLITERATE: + transliterate(ch, buf); + case IGNORE_ERRORS: + break; + default: + throw CodecError(msg); } } -void StringCodec::Decoder::error(const string &msg) + +void Codec::Decoder::decode(const string &str, ustring &buf) { - switch(err_mode_) + for(string::const_iterator i=str.begin(); i!=str.end();) { - case IGNORE_ERRORS: break; - case REPLACE_ERRORS: append(0xFFFD); break; - default: throw CodecError(msg); + UnicodeChar c=decode_char(str, i); + if(c!=-1) + buf+=c; } } -/** -Creates a codec for the given encoding. The caller is responsible for deleting -the codec when it's no longer needed. -*/ -StringCodec *create_codec(const string &n) +ustring Codec::Decoder::decode(const string &str) +{ + ustring buf; + decode(str, buf); + return buf; +} + +UnicodeChar Codec::Decoder::error(const string &msg) +{ + switch(err_mode) + { + case TRANSLITERATE: + return 0xFFFE; + case IGNORE_ERRORS: + return -1; + default: + throw CodecError(msg); + } +} + +Codec *create_codec(const string &n) { string name; for(string::const_iterator i=n.begin(); i!=n.end(); ++i) @@ -84,11 +113,65 @@ StringCodec *create_codec(const string &n) if(name=="iso2022jp") return new Iso2022Jp; if(name=="iso646fi") return new Iso646Fi; if(name=="iso88591" || name=="latin1") return new Iso88591; + if(name=="iso885915" || name=="latin9") return new Iso885915; if(name=="jisx0201") return new JisX0201; if(name=="jisx0208") return new JisX0208; if(name=="utf8") return new Utf8; - if(name=="windows1252") return new Windows1252; + if(name=="windows1252" || name=="cp1252") return new Windows1252; throw InvalidParameterValue("Unknown string codec"); } +Codec *detect_codec(const string &str) +{ + bool is_utf8=true; + bool is_ascii=true; + bool is_latin1=true; + unsigned utf8_mb=0; + + for(string::const_iterator i=str.begin(); i!=str.end(); ++i) + { + unsigned char c=*i; + if(c&0x80) + { + is_ascii=false; + if((c&0xC0)==0x80) + { + if((c&0xE0)==0x80) + is_latin1=false; + if(utf8_mb) + --utf8_mb; + else + is_utf8=false; + } + else if((c&0xC0)==0xC0) + { + if(utf8_mb) + { + is_utf8=false; + utf8_mb=0; + } + else + { + for(utf8_mb=1; (c>>(6-utf8_mb))&1; ++utf8_mb) ; + } + } + } + else if(utf8_mb) + { + is_utf8=false; + utf8_mb=0; + } + } + + if(is_ascii) + return new Ascii; + else if(is_utf8) + return new Utf8; + else if(is_latin1) + return new Iso88591; + else + return new Windows1252; +} + +} // namespace Codecs } // namespace Msp