From: Mikko Rasa
Date: Thu, 25 Dec 2008 08:57:28 +0000 (+0000)
Subject: Make codecs able to tell their name
X-Git-Tag: strings-1.1~7
X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=commitdiff_plain;h=271ffa9434c8d9397bb5170cf1ee670c5265ec60

Make codecs able to tell their name

Support autodetecting the correct codec for a string
---

diff --git a/source/ascii.h b/source/ascii.h
index 7dc8762..ecff0db 100644
--- a/source/ascii.h
+++ b/source/ascii.h
@@ -32,8 +32,10 @@ public:
 		virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "ASCII"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Codecs
diff --git a/source/codec.cpp b/source/codec.cpp
index 865176e..521dfea 100644
--- a/source/codec.cpp
+++ b/source/codec.cpp
@@ -102,9 +102,69 @@ Codec *create_codec(const string &n)
 	if(name=="jisx0201") return new JisX0201;
 	if(name=="jisx0208") return new JisX0208;
 	if(name=="utf8") return new Utf8;
-	if(name=="windows1252") return new Windows1252;
+	if(name=="windows1252" || name=="cp1252") return new Windows1252;
 	throw InvalidParameterValue("Unknown string codec");
 }
 
+// Guesses the encoding from the byte values in the string.  Only ASCII, UTF-8,
+// ISO-8859-1 and Windows-1252 are considered, in that order of preference.
+Codec *detect_codec(const string &str)
+{
+	bool is_utf8=true;
+	bool is_ascii=true;
+	bool is_latin1=true;
+	// Number of continuation bytes still needed by the current UTF-8 sequence
+	unsigned utf8_mb=0;
+
+	for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
+	{
+		unsigned char c=*i;
+		if(c&0x80)
+		{
+			is_ascii=false;
+			if((c&0xC0)==0x80)
+			{
+				// 0x80-0x9F are C1 control codes in ISO-8859-1; Windows-1252
+				// assigns printable characters to most of them
+				if((c&0xE0)==0x80)
+					is_latin1=false;
+				if(utf8_mb)
+					--utf8_mb;
+				else
+					is_utf8=false;
+			}
+			else
+			{
+				// Lead byte.  It can't start inside an unfinished sequence, and
+				// 0xC0, 0xC1 and 0xF5-0xFF never occur in UTF-8 (RFC 3629).
+				if(utf8_mb || c<0xC2 || c>0xF4)
+					is_utf8=false;
+				else if(c>=0xF0)
+					utf8_mb=3;
+				else if(c>=0xE0)
+					utf8_mb=2;
+				else
+					utf8_mb=1;
+			}
+		}
+		else if(utf8_mb)
+			// A 7-bit byte may not interrupt a multibyte sequence
+			is_utf8=false;
+	}
+
+	// The string ended in the middle of a multibyte sequence
+	if(utf8_mb)
+		is_utf8=false;
+
+	if(is_ascii)
+		return new Ascii;
+	else if(is_utf8)
+		return new Utf8;
+	else if(is_latin1)
+		return new Iso88591;
+	else
+		return new Windows1252;
+}
+
 } // namespace Codecs
 } // namespace Msp
diff --git a/source/codec.h b/source/codec.h
index bb35b0b..d373740 100644
--- a/source/codec.h
+++ b/source/codec.h
@@ -160,6 +160,11 @@ public:
 
 	virtual ~Codec() { }
 
+	/**
+	Returns the name of the encoding handled by this codec.
+	*/
+	virtual const char *get_name() const =0;
+
 	/**
 	Creates an encoder for this codec.
 	*/
@@ -231,6 +236,12 @@ deleting the codec when it's no longer needed.
 */
 Codec *create_codec(const std::string &);
 
+/**
+Automatically detects the encoding of a string and creates a codec for it.
+The codec must be deleted when it's no longer needed.
+*/
+Codec *detect_codec(const std::string &);
+
 } // namespace Codecs
 } // namespace Msp
 
diff --git a/source/iso2022jp.h b/source/iso2022jp.h
index f3c2736..4a452d4 100644
--- a/source/iso2022jp.h
+++ b/source/iso2022jp.h
@@ -50,8 +50,10 @@ public:
 		virtual void switch_mode(Mode);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "ISO-2022-JP"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Codecs
diff --git a/source/iso646fi.h b/source/iso646fi.h
index 8f38a29..3047edf 100644
--- a/source/iso646fi.h
+++ b/source/iso646fi.h
@@ -32,8 +32,10 @@ public:
 		virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "ISO-646-FI"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Codecs
diff --git a/source/iso88591.h b/source/iso88591.h
index 1adf605..80c706b 100644
--- a/source/iso88591.h
+++ b/source/iso88591.h
@@ -32,8 +32,10 @@ public:
 		virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "ISO-8859-1"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Codecs
diff --git a/source/iso885915.h b/source/iso885915.h
index da26922..c42ad19 100644
--- a/source/iso885915.h
+++ b/source/iso885915.h
@@ -32,8 +32,10 @@ public:
 		virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "ISO-8859-15"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Codecs
diff --git a/source/jisx0201.h b/source/jisx0201.h
index 1c502f9..180f710 100644
--- a/source/jisx0201.h
+++ b/source/jisx0201.h
@@ -32,8 +32,10 @@ public:
 		virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "JIS X 0201"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Codecs
diff --git a/source/jisx0208.h b/source/jisx0208.h
index 9e9ff9b..27609e6 100644
--- a/source/jisx0208.h
+++ b/source/jisx0208.h
@@ -37,8 +37,10 @@ public:
 		virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "JIS X 0208"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 struct Kuten
diff --git a/source/utf8.h b/source/utf8.h
index 70d44b0..0e5e068 100644
--- a/source/utf8.h
+++ b/source/utf8.h
@@ -32,8 +32,10 @@ public:
 		virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "UTF-8"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Codecs
diff --git a/source/windows1252.h b/source/windows1252.h
index 9f0fe0b..f817a0d 100644
--- a/source/windows1252.h
+++ b/source/windows1252.h
@@ -32,8 +32,10 @@ public:
 		virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &);
 	};
 
-	Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
-	Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
+	virtual const char *get_name() const { return "Windows-1252"; }
+
+	virtual Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+	virtual Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Codecs