X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fcodec.h;h=bb35b0b43528ca4eba14dc66682f43ae26bca651;hp=1f012710f636f0acc208aff0bc87ab3edad8ab5e;hb=b254c706617223da1dd1b9543a74715e42a8a5b0;hpb=9da6abdcabec59f4845da256a8ad75a810ed1589 diff --git a/source/codec.h b/source/codec.h index 1f01271..bb35b0b 100644 --- a/source/codec.h +++ b/source/codec.h @@ -9,9 +9,21 @@ Distributed under the LGPL #define MSP_STRINGS_CODEC_H_ #include -#include +#include namespace Msp { +namespace Codecs { + +typedef int UnicodeChar; + +typedef std::basic_string<UnicodeChar> ustring; + +enum ErrorMode +{ + THROW_ON_ERROR, + IGNORE_ERRORS, + TRANSLITERATE +}; /** An exception thrown for all kinds of problems encountered while encoding or @@ -27,149 +39,199 @@ public: Base class for string codecs. Use one of the derived classes or the function create_codec to create a specific codec. -For the purposes of this class, an std::wstring is considered to contain -Unicode characters and an std::string is considered to be an encoded sequence -of bytes. A codec is able to determine if an encoded string could be decoded -with it. +Unicode strings are represented as ustrings. An std::string is considered to +be an encoded sequence of bytes. A codec is able to determine if an encoded +string could be decoded with it. */ -class StringCodec +class Codec { public: - enum ErrorMode - { - THROW_ON_ERROR, - IGNORE_ERRORS, - REPLACE_ERRORS - }; - /** - Base class for string encoder. Each codec class should contain an Encoder - class derived from this. + Base class for string encoder. + + Each codec class should contain an Encoder class derived from this. The + encode_char and transliterate functions must be overridden. Some encoders + may find it useful or necessary to implement some other functions too + (particularly sync and reset for stateful codecs). */ class Encoder { public: + virtual ~Encoder() { } + /** - Encodes a single character. 
Derived classes should use the append - function to put the result into the internal buffer. + Encodes a single Unicode character. If the character can't be + represented in this encoding, behavior depends on the error mode + specified for the encoder: + + For THROW_ON_ERROR, an exception is thrown. + + For IGNORE_ERRORS, nothing is done. + + For TRANSLITERATE, the encoder attempts to select a character or a string + of characters that closely approximates the non-representable character. */ - virtual void encode_char(wchar_t) =0; + virtual void encode_char(UnicodeChar ch, std::string &buf) =0; /** - Encodes a string. + Encodes a Unicode string. This is equivalent to calling encode_char for + each character in the string with the same buffer. */ - virtual void encode(const std::wstring &s) - { for(std::wstring::const_iterator i=s.begin(); i!=s.end(); ++i) encode_char(*i); } + virtual void encode(const ustring &str, std::string &buf); /** - Brings the encoder back to its initial state. This allows the encoded - sequence to be extracted or flushed without loss of integrity. + Produces a sequence of bytes that will bring the encoder back to the + initial state. */ - virtual void sync() { } + virtual void sync(std::string &buf) { (void)buf; } /** - Returns a reference to the encoded sequence. Call sync() first to make - sure it's a valid encoded string by itself. + Resets the encoder to the initial state without producing output. */ - const std::string &get() const { return buffer_; } + virtual void reset() { } + protected: + ErrorMode err_mode; + + Encoder(ErrorMode em): err_mode(em) { } /** - Returns the number of bytes in the output buffer. + Handles an error depending on the error mode. + + For THROW_ON_ERROR, throws CodecError(msg). + + For IGNORE_ERRORS, does nothing. + + For TRANSLITERATE, calls transliterate(ch, buf). 
*/ - unsigned size() const { return buffer_.size(); } + void error(UnicodeChar ch, std::string &buf, const std::string &msg); /** - Clears the encoded sequence. Encoder state is left intact. + Attempts to produce an alternative encoding for a Unicode character. + Typically this includes dropping accent marks or romanizing letters. */ - void flush() { buffer_.clear(); } - - virtual ~Encoder() { } - protected: - Encoder(ErrorMode em=THROW_ON_ERROR): err_mode_(em) { } - void append(char c) { buffer_+=c; } - void append(const char *s, unsigned l) { buffer_.append(s, l); } - void append(const std::string &s) { buffer_+=s; } - void error(const std::string &); - virtual void append_replacement() { } - private: - ErrorMode err_mode_; - std::string buffer_; + virtual void transliterate(UnicodeChar ch, std::string &buf) =0; }; /** - Base class for string decoder. Each codec class should contain an Decoder - class derived from this. + Base class for string decoder. + + Each codec class should contain a Decoder class derived from this. */ class Decoder { public: - virtual void decode_char(const std::string &, std::string::const_iterator &) =0; - virtual void decode(const std::string &s) - { for(std::string::const_iterator i=s.begin(); i!=s.end(); ) decode_char(s, i); } + virtual ~Decoder() { } /** - Ensures that all input has been processed. If this is not the case any - buffers are cleared and an error is triggered. + Decodes a single character from a string. The iterator is advanced to + the next character. For stateful codecs, -1 may be returned if a state + change sequence was decoded but no character followed it. In case a + decoding error occurs, behavior depends on the error mode specified for + the decoder: + + For THROW_ON_ERROR, an exception is thrown and the iterator is left at + the erroneous character. + + For IGNORE_ERRORS, -1 is returned and the iterator is advanced. + + For TRANSLITERATE, 0xFFFE is returned and the iterator is advanced. 
*/ - virtual void sync() { } + virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0; - const std::wstring &get() const { return buffer_; } - unsigned size() const { return buffer_.size(); } - void flush() { buffer_.clear(); } - virtual ~Decoder() { } + /** + Decodes a string. + */ + virtual void decode(const std::string &str, ustring &buf); + + /** + Resets the decoder to the initial state. + */ + virtual void reset() { } protected: - Decoder(ErrorMode em): err_mode_(em) { } - void append(wchar_t c) { buffer_+=c; } - void append(const std::wstring &s) { buffer_+=s; } - void error(const std::string &); - private: - ErrorMode err_mode_; - std::wstring buffer_; + ErrorMode err_mode; + + Decoder(ErrorMode em): err_mode(em) { } + + /** + Handles an error depending on the error mode. + */ + UnicodeChar error(const std::string &); }; - virtual Encoder *create_encoder(ErrorMode =THROW_ON_ERROR) const =0; - virtual Decoder *create_decoder(ErrorMode =THROW_ON_ERROR) const =0; - virtual bool detect(const std::string &) const; - virtual ~StringCodec() { } + virtual ~Codec() { } + + /** + Creates an encoder for this codec. + */ + virtual Encoder *create_encoder(ErrorMode err_mode=THROW_ON_ERROR) const =0; + + /** + Creates a decoder for this codec. + */ + virtual Decoder *create_decoder(ErrorMode err_mode=THROW_ON_ERROR) const =0; + + /** + Determines whether the given string can be successfully decoded with this + codec. Note that this function returning true does not guarantee that the + string was actually encoded with this codec. In particular, many 8-bit + encodings are indistinguishable. + */ + virtual bool detect(const std::string &) const; protected: - StringCodec() { } + Codec() { } }; +typedef Codec::Encoder Encoder; +typedef Codec::Decoder Decoder; + /** -Convenience function that decodes a string using the given codec. +Convenience function that decodes a string. 
*/ template<class C> -std::wstring decode(const std::string &s) +ustring decode(const std::string &s) { typename C::Decoder dec; - dec.decode(s); - dec.sync(); - return dec.get(); + ustring result; + dec.decode(s, result); + return result; } +/** +Convenience function that encodes a string. +*/ template<class C> -std::string encode(const std::wstring &s) +std::string encode(const ustring &s) { typename C::Encoder enc; - enc.encode(s); - enc.sync(); - return enc.get(); + std::string result; + enc.encode(s, result); + enc.sync(result); + return result; } +/** +Convenience function that transcodes a string from one codec to another. +*/ template<class F, class T> std::string transcode(const std::string &s) { typename F::Decoder from; typename T::Encoder to; - from.decode(s); - from.sync(); - to.encode(from.get()); - to.sync(); - return to.get(); + ustring temp; + from.decode(s, temp); + std::string result; + to.encode(temp, result); + to.sync(result); + return result; } -StringCodec *create_codec(const std::string &); +/** +Creates a codec for an encoding by name. The caller is responsible for +deleting the codec when it's no longer needed. +*/ +Codec *create_codec(const std::string &); +} // namespace Codecs } // namespace Msp #endif