X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fcodec.h;h=bed3d93cb2c3e1f245492c4eec43e8db4ef65c58;hp=b194c3f46b56b4be503ff63e2741f8243f43c3fa;hb=f47bc86e6ce900c5323e593db003c93110538268;hpb=79d472ad3fde75de2eba2487579b047d35e56978 diff --git a/source/codec.h b/source/codec.h index b194c3f..bed3d93 100644 --- a/source/codec.h +++ b/source/codec.h @@ -12,6 +12,18 @@ Distributed under the LGPL #include namespace Msp { +namespace Codecs { + +typedef int UnicodeChar; + +typedef std::basic_string ustring; + +enum ErrorMode +{ + THROW_ON_ERROR, + IGNORE_ERRORS, + TRANSLITERATE +}; /** An exception thrown for all kinds of problems encountered while encoding or @@ -27,21 +39,14 @@ public: Base class for string codecs. Use one of the derived classes or the function create_codec to create a specific codec. -For the purposes of this class, an std::wstring is considered to contain +For the purposes of this class, an ustring is considered to contain Unicode characters and an std::string is considered to be an encoded sequence of bytes. A codec is able to determine if an encoded string could be decoded with it. */ -class StringCodec +class Codec { public: - enum ErrorMode - { - THROW_ON_ERROR, - IGNORE_ERRORS, - REPLACE_ERRORS - }; - /** Base class for string encoder. Each codec class should contain an Encoder class derived from this. @@ -49,51 +54,18 @@ public: class Encoder { public: - /** - Encodes a single character. Derived classes should use the append - function to put the result into the internal buffer. - */ - virtual void encode_char(wchar_t) =0; - - /** - Encodes a string. - */ - virtual void encode(const std::wstring &s) - { for(std::wstring::const_iterator i=s.begin(); i!=s.end(); ++i) encode_char(*i); } - - /** - Brings the encoder back to its initial state. This allows the encoded - sequence to be extracted or flushed without loss of integrity. - */ - virtual void sync() { } - - /** - Returns a reference to the encoded sequence. Call sync() first to make - sure it's a valid encoded string by itself. - */ - const std::string &get() const { return buffer_; } - - /** - Returns the number of bytes in the output buffer. - */ - unsigned size() const { return buffer_.size(); } - - /** - Clears the encoded sequence. Encoder state is left intact. - */ - void flush() { buffer_.clear(); } - virtual ~Encoder() { } + + virtual void encode_char(UnicodeChar ch, std::string &buf) =0; + virtual void encode(const ustring &str, std::string &buf); + virtual void sync(std::string &buf) { (void)buf; } + virtual void reset() { } protected: - Encoder(ErrorMode em=THROW_ON_ERROR): err_mode_(em) { } - void append(char c) { buffer_+=c; } - void append(const char *s, unsigned l) { buffer_.append(s, l); } - void append(const std::string &s) { buffer_+=s; } - void error(const std::string &); - virtual void append_replacement() { } - private: - ErrorMode err_mode_; - std::string buffer_; + ErrorMode err_mode; + + Encoder(ErrorMode em): err_mode(em) { } + void error(UnicodeChar, std::string &, const std::string &); + virtual void transliterate(UnicodeChar, std::string &) { } }; /** @@ -103,57 +75,79 @@ public: class Decoder { public: - virtual void decode_char(const std::string &, std::string::const_iterator &) =0; - virtual void decode(const std::string &s) - { for(std::string::const_iterator i=s.begin(); i!=s.end(); ) decode_char(s, i); } + virtual ~Decoder() { } /** - Ensures that all input has been processed. If this is not the case any - buffers are cleared and an error is triggered. + Decodes a single character from a string. The iterator is advanced to + the next character. For stateful codecs, -1 may be returned if a state + change sequence was decoded but no character followed it. In case a + decoding error occurs, behavior depends on the error mode specified for + the decoder: + + For THROW_ON_ERROR, an exception is thrown and the iterator is left at + the erroneous character. + + For IGNORE_ERRORS, -1 is returned and the iterator is advanced. + + For TRANSLITERATE, 0xFFFE is returned and the iterator is advanced. */ - virtual void sync() { } + virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0; - const std::wstring &get() const { return buffer_; } - unsigned size() const { return buffer_.size(); } - void flush() { buffer_.clear(); } - virtual ~Decoder() { } + virtual void decode(const std::string &str, ustring &buf); + virtual void reset() { } protected: - Decoder(ErrorMode em): err_mode_(em) { } - void append(wchar_t c) { buffer_+=c; } - void append(const std::wstring &s) { buffer_+=s; } - void error(const std::string &); - private: - ErrorMode err_mode_; - std::wstring buffer_; + ErrorMode err_mode; + + Decoder(ErrorMode em): err_mode(em) { } + UnicodeChar error(const std::string &); }; - virtual Encoder *create_encoder(ErrorMode =THROW_ON_ERROR) const =0; - virtual Decoder *create_decoder(ErrorMode =THROW_ON_ERROR) const =0; - virtual bool detect(const std::string &) const; - virtual ~StringCodec() { } + virtual ~Codec() { } + + /** + Creates an encoder for this codec. + */ + virtual Encoder *create_encoder(ErrorMode err_mode=THROW_ON_ERROR) const =0; + + /** + Creates a decoder for this codec. + */ + virtual Decoder *create_decoder(ErrorMode err_mode=THROW_ON_ERROR) const =0; + + /** + Determines whether the given string can be successfully decoded with this + codec. Note that this function returning true does not guarantee that the + string was actually encoded with this codec. In particular, many 8-bit + encodings are indistinguishable. + */ + virtual bool detect(const std::string &) const; protected: - StringCodec() { } + Codec() { } }; +typedef Codec::Encoder Encoder; +typedef Codec::Decoder Decoder; + /** Convenience function that decodes a string using the given codec. */ template -std::wstring decode(const std::string &s) +ustring decode(const std::string &s) { typename C::Decoder dec; - dec.decode(s); - dec.sync(); - return dec.get(); + ustring result; + dec.decode(s, result); + return result; } template -std::string encode(const std::wstring &s) +std::string encode(const ustring &s) { typename C::Encoder enc; - enc.encode(s); - enc.sync(); - return enc.get(); + std::string result; + enc.encode(s, result); + enc.sync(result); + return result; } template @@ -161,15 +155,21 @@ std::string transcode(const std::string &s) { typename F::Decoder from; typename T::Encoder to; - from.decode(s); - from.sync(); - to.encode(from.get()); - to.sync(); - return to.get(); + ustring temp; + from.decode(s, temp); + std::string result; + to.encode(temp, result); + to.sync(result); + return result; } -StringCodec *create_codec(const std::string &); +/** +Creates a codec for an encoding by name. The caller is responsible for +deleting the codec when it's no longer needed. +*/ +Codec *create_codec(const std::string &); +} // namespace Codecs } // namespace Msp #endif