#define MSP_STRINGS_CODEC_H_
#include <string>
-#include <msp/error.h>
+#include <msp/core/except.h>
namespace Msp {
+namespace Codecs {
+
+typedef int UnicodeChar;
+
+typedef std::basic_string<UnicodeChar> ustring;
+
+enum ErrorMode
+{
+ THROW_ON_ERROR,
+ IGNORE_ERRORS,
+ TRANSLITERATE
+};
/**
An exception thrown for all kinds of problems encountered while encoding or
Base class for string codecs. Use one of the derived classes or the function
create_codec to create a specific codec.
-For the purposes of this class, an std::wstring is considered to contain
-Unicode characters and an std::string is considered to be an encoded sequence
-of bytes. A codec is able to determine if an encoded string could be decoded
-with it.
+Unicode strings are represented as ustrings. An std::string is considered to
+be an encoded sequence of bytes. A codec is able to determine if an encoded
+string could be decoded with it.
*/
-class StringCodec
+class Codec
{
public:
- enum ErrorMode
- {
- THROW_ON_ERROR,
- IGNORE_ERRORS,
- REPLACE_ERRORS
- };
-
/**
- Base class for string encoder. Each codec class should contain an Encoder
- class derived from this.
+ Base class for string encoder.
+
+ Each codec class should contain an Encoder class derived from this. The
+ encode_char and transliterate functions must be overridden. Some encoders
+ may find it useful or necessary to implement some other functions too
+ (particularly sync and reset for stateful codecs).
*/
class Encoder
{
public:
+ virtual ~Encoder() { }
+
/**
- Encodes a single character. Derived classes should use the append
- function to put the result into the internal buffer.
+ Encodes a single unicode character. If the character can't be
+ represented in this encoding, behavior depends on the error mode
+ specified for the encoder:
+
+ For THROW_ON_ERROR, an exception is thrown.
+
+ For IGNORE_ERRORS, nothing is done.
+
+ For TRANSLITERATE, the encoder attempts to select a character or a string
+ of characters that closely approximates the non-representable character.
*/
- virtual void encode_char(wchar_t) =0;
+ virtual void encode_char(UnicodeChar ch, std::string &buf) =0;
/**
- Encodes a string.
+ Encodes a unicode string. This is equivalent to calling encode_char for
+ each character in the string with the same buffer.
*/
- virtual void encode(const std::wstring &s)
- { for(std::wstring::const_iterator i=s.begin(); i!=s.end(); ++i) encode_char(*i); }
+ virtual void encode(const ustring &str, std::string &buf);
/**
- Brings the encoder back to its initial state. This allows the encoded
- sequence to be extracted or flushed without loss of integrity.
+ Produces a sequence of bytes that will bring the encoder back to the
+ initial state.
*/
- virtual void sync() { }
+ virtual void sync(std::string &buf) { (void)buf; }
/**
- Returns a reference to the encoded sequence. Call sync() first to make
- sure it's a valid encoded string by itself.
+ Resets the encoder to the initial state without producing output.
*/
- const std::string &get() const { return buffer_; }
+ virtual void reset() { }
+ protected:
+ ErrorMode err_mode;
+
+ Encoder(ErrorMode em): err_mode(em) { }
/**
- Returns the number of bytes in the output buffer.
+ Handles an error depending on the error mode.
+
+ For THROW_ON_ERROR, throws CodecError(msg).
+
+ For IGNORE_ERRORS, does nothing.
+
+ For TRANSLITERATE, calls transliterate(ch, buf).
*/
- unsigned size() const { return buffer_.size(); }
+ void error(UnicodeChar ch, std::string &buf, const std::string &msg);
/**
- Clears the encoded sequence. Encoder state is left intact.
+ Attempts to produce an alternative encoding for a unicode character.
+ Typically this includes dropping accent marks or romanizing letters.
*/
- void flush() { buffer_.clear(); }
-
- virtual ~Encoder() { }
- protected:
- Encoder(ErrorMode em=THROW_ON_ERROR): err_mode_(em) { }
- void append(char c) { buffer_+=c; }
- void append(const char *s, unsigned l) { buffer_.append(s, l); }
- void append(const std::string &s) { buffer_+=s; }
- void error(const std::string &);
- virtual void append_replacement() { }
- private:
- ErrorMode err_mode_;
- std::string buffer_;
+ virtual void transliterate(UnicodeChar ch, std::string &buf) =0;
};
/**
- Base class for string decoder. Each codec class should contain an Decoder
- class derived from this.
+ Base class for string decoder.
+
+ Each codec class should contain a Decoder class derived from this.
*/
class Decoder
{
public:
- virtual void decode_char(const std::string &, std::string::const_iterator &) =0;
- virtual void decode(const std::string &s)
- { for(std::string::const_iterator i=s.begin(); i!=s.end(); ) decode_char(s, i); }
+ virtual ~Decoder() { }
/**
- Ensures that all input has been processed. If this is not the case any
- buffers are cleared and an error is triggered.
+ Decodes a single character from a string. The iterator is advanced to
+ the next character. For stateful codecs, -1 may be returned if a state
+ change sequence was decoded but no character followed it. In case a
+ decoding error occurs, behavior depends on the error mode specified for
+ the decoder:
+
+ For THROW_ON_ERROR, an exception is thrown and the iterator is left at
+ the erroneous character.
+
+ For IGNORE_ERRORS, -1 is returned and the iterator is advanced.
+
+ For TRANSLITERATE, 0xFFFE is returned and the iterator is advanced.
*/
- virtual void sync() { }
+ virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0;
- const std::wstring &get() const { return buffer_; }
- unsigned size() const { return buffer_.size(); }
- void flush() { buffer_.clear(); }
- virtual ~Decoder() { }
+ /**
+ Decodes a string.
+ */
+ virtual void decode(const std::string &str, ustring &buf);
+
+ /**
+ Resets the decoder to the initial state.
+ */
+ virtual void reset() { }
protected:
- Decoder(ErrorMode em): err_mode_(em) { }
- void append(wchar_t c) { buffer_+=c; }
- void append(const std::wstring &s) { buffer_+=s; }
- void error(const std::string &);
- private:
- ErrorMode err_mode_;
- std::wstring buffer_;
+ ErrorMode err_mode;
+
+ Decoder(ErrorMode em): err_mode(em) { }
+
+ /**
+ Handles an error depending on the error mode.
+ */
+ UnicodeChar error(const std::string &);
};
- virtual Encoder *create_encoder(ErrorMode =THROW_ON_ERROR) const =0;
- virtual Decoder *create_decoder(ErrorMode =THROW_ON_ERROR) const =0;
- virtual bool detect(const std::string &) const;
- virtual ~StringCodec() { }
+ virtual ~Codec() { }
+
+ /**
+ Creates an encoder for this codec.
+ */
+ virtual Encoder *create_encoder(ErrorMode err_mode=THROW_ON_ERROR) const =0;
+
+ /**
+ Creates a decoder for this codec.
+ */
+ virtual Decoder *create_decoder(ErrorMode err_mode=THROW_ON_ERROR) const =0;
+
+ /**
+ Determines whether the given string can be successfully decoded with this
+ codec. Note that this function returning true does not guarantee that the
+ string was actually encoded with this codec. In particular, many 8-bit
+ encodings are indistinguishable.
+ */
+ virtual bool detect(const std::string &) const;
protected:
- StringCodec() { }
+ Codec() { }
};
+typedef Codec::Encoder Encoder;
+typedef Codec::Decoder Decoder;
+
/**
-Convenience function that decodes a string using the given codec.
+Convenience function that decodes a string.
*/
template<class C>
-std::wstring decode(const std::string &s)
+ustring decode(const std::string &s)
{
typename C::Decoder dec;
- dec.decode(s);
- dec.sync();
- return dec.get();
+ ustring result;
+ dec.decode(s, result);
+ return result;
}
+/**
+Convenience function that encodes a string.
+*/
template<class C>
-std::string encode(const std::wstring &s)
+std::string encode(const ustring &s)
{
typename C::Encoder enc;
- enc.encode(s);
- enc.sync();
- return enc.get();
+ std::string result;
+ enc.encode(s, result);
+ enc.sync(result);
+ return result;
}
+/**
+Convenience function that transcodes a string from one codec to another.
+*/
template<class F, class T>
std::string transcode(const std::string &s)
{
typename F::Decoder from;
typename T::Encoder to;
- from.decode(s);
- from.sync();
- to.encode(from.get());
- to.sync();
- return to.get();
+ ustring temp;
+ from.decode(s, temp);
+ std::string result;
+ to.encode(temp, result);
+ to.sync(result);
+ return result;
}
-StringCodec *create_codec(const std::string &);
+/**
+Creates a codec for an encoding by name. The caller is responsible for
+deleting the codec when it's no longer needed.
+*/
+Codec *create_codec(const std::string &);
+} // namespace Codecs
} // namespace Msp
#endif