Base class for string codecs. Use one of the derived classes or the function
create_codec to create a specific codec.
-For the purposes of this class, an ustring is considered to contain
-Unicode characters and an std::string is considered to be an encoded sequence
-of bytes. A codec is able to determine if an encoded string could be decoded
-with it.
+Unicode strings are represented as ustrings. An std::string is considered to
+be an encoded sequence of bytes. A codec is able to determine if an encoded
+string could be decoded with it.
*/
class Codec
{
public:
/**
- Base class for string encoder. Each codec class should contain an Encoder
- class derived from this.
+ Base class for string encoder.
+
+ Each codec class should contain an Encoder class derived from this. The
+ encode_char and transliterate functions must be overridden. Some encoders
+ may find it useful or necessary to implement some other functions too
+ (particularly sync and reset for stateful codecs).
*/
class Encoder
{
public:
virtual ~Encoder() { }
+ /**
+ Encodes a single unicode character. If the character can't be
+ represented in this encoding, behavior depends on the error mode
+ specified for the encoder:
+
+ For THROW_ON_ERROR, an exception is thrown.
+
+ For IGNORE_ERRORS, nothing is done.
+
+ For TRANSLITERATE, the encoder attempts to select a character or a string
+ of characters that closely approximates the non-representable character.
+ */
virtual void encode_char(UnicodeChar ch, std::string &buf) =0;
+
+ /**
+ Encodes a unicode string. This is equivalent to calling encode_char for
+ each character in the string with the same buffer.
+ */
virtual void encode(const ustring &str, std::string &buf);
+
+ /**
+ Produces a sequence of bytes that will bring the encoder back to the
+ initial state.
+ */
virtual void sync(std::string &buf) { (void)buf; }
+
+ /**
+ Resets the encoder to the initial state without producing output.
+ */
virtual void reset() { }
protected:
ErrorMode err_mode;
Encoder(ErrorMode em): err_mode(em) { }
- void error(UnicodeChar, std::string &, const std::string &);
- virtual void transliterate(UnicodeChar, std::string &) { }
+
+ /**
+ Handles an error depending on the error mode.
+
+ For THROW_ON_ERROR, throws CodecError(msg).
+
+ For IGNORE_ERRORS, does nothing.
+
+ For TRANSLITERATE, calls transliterate(ch, buf).
+ */
+ void error(UnicodeChar ch, std::string &buf, const std::string &msg);
+
+ /**
+ Attempts to produce an alternative encoding for a unicode character.
+ Typically this includes dropping accent marks or romanizing letters.
+ */
+ virtual void transliterate(UnicodeChar ch, std::string &buf) =0;
};
/**
- Base class for string decoder. Each codec class should contain an Decoder
- class derived from this.
+ Base class for string decoder.
+
+ Each codec class should contain a Decoder class derived from this.
*/
class Decoder
{
*/
virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0;
+ /**
+ Decodes a string.
+ */
virtual void decode(const std::string &str, ustring &buf);
+
+ /**
+ Resets the decoder to the initial state.
+ */
virtual void reset() { }
protected:
ErrorMode err_mode;
Decoder(ErrorMode em): err_mode(em) { }
+
+ /**
+ Handles an error depending on the error mode.
+ */
UnicodeChar error(const std::string &);
};
typedef Codec::Decoder Decoder;
/**
-Convenience function that decodes a string using the given codec.
+Convenience function that decodes a string.
*/
template<class C>
ustring decode(const std::string &s)
return result;
}
+/**
+Convenience function that encodes a string.
+*/
template<class C>
std::string encode(const ustring &s)
{
return result;
}
+/**
+Convenience function that transcodes a string from one codec to another.
+*/
template<class F, class T>
std::string transcode(const std::string &s)
{