X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fcodec.h;h=e04e909fb7a1fc5c4750db7e13f33f2d44a96f92;hp=d5ba595b3b3ee765caa574479c766c92f27281c8;hb=5b1368cb791cab043f0435628cacbaff36e39b7b;hpb=ebd23ef7dde39a35e9ffdfb5be31934507cefaad diff --git a/source/codec.h b/source/codec.h index d5ba595..e04e909 100644 --- a/source/codec.h +++ b/source/codec.h @@ -1,10 +1,29 @@ +/* $Id$ + +This file is part of libmspstrings +Copyright © 2006-2007 Mikko Rasa +Distributed under the LGPL +*/ + #ifndef MSP_STRINGS_CODEC_H_ #define MSP_STRINGS_CODEC_H_ #include -#include +#include namespace Msp { +namespace Codecs { + +typedef int UnicodeChar; + +typedef std::basic_string<UnicodeChar> ustring; + +enum ErrorMode +{ + THROW_ON_ERROR, + IGNORE_ERRORS, + TRANSLITERATE +}; /** An exception thrown for all kinds of problems encountered while encoding or @@ -20,137 +39,167 @@ public: Base class for string codecs. Use one of the derived classes or the function create_codec to create a specific codec. -For the purposes of this class, an std::wstring is considered to contain -Unicode characters and an std::string is considered to be an encoded sequence -of bytes. A codec is able to determine if an encoded string could be decoded -with it. +Unicode strings are represented as ustrings. An std::string is considered to +be an encoded sequence of bytes. A codec is able to determine if an encoded +string could be decoded with it. */ -class StringCodec +class Codec { public: /** - Base class for string encoder. Each codec class should contain an Encoder - class derived from this. + Base class for string encoder. + + Each codec class should contain an Encoder class derived from this. The + encode_char and transliterate functions must be overridden. Some encoders + may find it useful or necessary to implement some other functions too + (particularly sync and reset for stateful codecs). */ class Encoder { - public: - /** - Encodes a single character. 
Derived classes should use the append - function to put the result into the internal buffer. - */ - virtual void encode_char(wchar_t) =0; - - /** - Encodes a string. - */ - virtual void encode(const std::wstring &s) - { for(std::wstring::const_iterator i=s.begin(); i!=s.end(); ++i) encode_char(*i); } - - /** - Brings the encoder back to its initial state. This allows the encoded - sequence to be extracted or flushed without loss of integrity. - */ - virtual void sync() { } - - /** - Returns a reference to the encoded sequence. Call sync() first to make - sure it's a valid encoded string by itself. - */ - const std::string &get() const { return buffer_; } - - /** - Returns the number of bytes in the output buffer. - */ - unsigned size() const { return buffer_.size(); } - - /** - Clears the encoded sequence. Encoder state is left intact. - */ - void flush() { buffer_.clear(); } + protected: + ErrorMode err_mode; + Encoder(ErrorMode em): err_mode(em) { } + public: virtual ~Encoder() { } + + /** Encodes a single unicode character. If the character can't be + represented in this encoding, error() should be called. */ + virtual void encode_char(UnicodeChar ch, std::string &buf) = 0; + + /** Encodes a unicode string. This is equivalent to calling encode_char + for each character in the string with the same buffer. */ + virtual void encode(const ustring &str, std::string &buf); + + std::string encode(const ustring &); + + /** Produces a sequence of bytes that will bring the encoder back to the + initial state. */ + virtual void sync(std::string &buf) { (void)buf; } + + /** Resets the encoder to the initial state without producing output. */ + virtual void reset() { } + protected: - Encoder() { } - void append(char c) { buffer_+=c; } - void append(const char *s, unsigned l) { buffer_.append(s, l); } - void append(const std::string &s) { buffer_+=s; } - private: - std::string buffer_; + /** Handles an error depending on the error mode. 
+ + THROW_ON_ERROR: throws CodecError(msg) + IGNORE_ERRORS: does nothing + TRANSLITERATE: calls transliterate(ch, buf) */ + void error(UnicodeChar ch, std::string &buf, const std::string &msg); + + /** Attempts to produce an alternative encoding for a unicode character. + Typically this includes dropping accent marks or romanizing letters. */ + virtual void transliterate(UnicodeChar ch, std::string &buf) = 0; }; /** - Base class for string decoder. Each codec class should contain an Decoder - class derived from this. + Base class for string decoder. + + Each codec class should contain a Decoder class derived from this. */ class Decoder { + protected: + ErrorMode err_mode; + + Decoder(ErrorMode em): err_mode(em) { } public: - virtual void decode_char(const std::string &, std::string::const_iterator &) =0; - virtual void decode(const std::string &s) - { for(std::string::const_iterator i=s.begin(); i!=s.end(); ) decode_char(s, i); } - - /** - Ensures that all input has been processed. An exception is thrown if - this is not the case. - */ - virtual void sync() { } - - const std::wstring &get() const { return buffer_; } - unsigned size() const { return buffer_.size(); } - void flush() { buffer_.clear(); } virtual ~Decoder() { } + + /** Decodes a single character from a string. The iterator is advanced + to the next character. For stateful codecs, -1 may be returned if a + state change sequence was decoded but no character followed it. If + invalid input is encountered, the error() function should be called and + the iterator advanced only if it doesn't throw. */ + virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) = 0; + + /** Decodes a string. */ + virtual void decode(const std::string &str, ustring &buf); + + ustring decode(const std::string &); + + /** Resets the decoder to the initial state. 
*/ + virtual void reset() { } + protected: - Decoder() { } - void append(wchar_t c) { buffer_+=c; } - void append(const std::wstring &s) { buffer_+=s; } - private: - std::wstring buffer_; + /** Handles an error depending on the error mode. The return value is + suitable for returning from decode_char. + + THROW_ON_ERROR: throws CodecError(msg) + IGNORE_ERRORS: returns -1 + TRANSLITERATE: returns 0xFFFE */ + UnicodeChar error(const std::string &msg); }; - virtual Encoder *create_encoder() const =0; - virtual Decoder *create_decoder() const =0; - virtual bool detect(const std::string &) const; - virtual ~StringCodec() { } protected: - StringCodec() { } + Codec() { } +public: + virtual ~Codec() { } + + /** Returns the name of the encoding handled by this codec. */ + virtual const char *get_name() const = 0; + + /** Creates an encoder for this codec. */ + virtual Encoder *create_encoder(ErrorMode err_mode = THROW_ON_ERROR) const = 0; + + /** Creates a decoder for this codec. */ + virtual Decoder *create_decoder(ErrorMode err_mode = THROW_ON_ERROR) const = 0; + + /** Determines whether the given string can be successfully decoded with + this codec. Note that this function returning true does not guarantee that + the string was actually encoded with this codec. In particular, many 8-bit + encodings are indistinguishable. */ + virtual bool detect(const std::string &) const; }; -/** -Convenience function that decodes a string using the given codec. -*/ +typedef Codec::Encoder Encoder; +typedef Codec::Decoder Decoder; + +/** Convenience function that decodes a string. */ template<class C> -std::wstring decode(const std::string &s) +ustring decode(const std::string &s) { typename C::Decoder dec; - dec.decode(s); - dec.sync(); - return dec.get(); + ustring result; + dec.decode(s, result); + return result; } +/** Convenience function that encodes a string. 
*/ template<class C> -std::string encode(const std::wstring &s) +std::string encode(const ustring &s) { typename C::Encoder enc; - enc.encode(s); - enc.sync(); - return enc.get(); + std::string result; + enc.encode(s, result); + enc.sync(result); + return result; } +/** Convenience function that transcodes a string from one codec to another. */ template<class F, class T> std::string transcode(const std::string &s) { typename F::Decoder from; typename T::Encoder to; - from.decode(s); - from.sync(); - to.encode(from.get()); - to.sync(); - return to.get(); + ustring temp; + from.decode(s, temp); + std::string result; + to.encode(temp, result); + to.sync(result); + return result; } -StringCodec *create_codec(const std::string &); +/** Creates a codec for an encoding by name. The caller is responsible for +deleting the codec when it's no longer needed. */ +Codec *create_codec(const std::string &); + +/** Automatically detects the encoding of a string and creates a codec for it. +The codec must be deleted when it's no longer needed. */ +Codec *detect_codec(const std::string &); +} // namespace Codecs } // namespace Msp #endif