3 This file is part of libmspstrings
4 Copyright © 2006-2007 Mikko Rasa
5 Distributed under the LGPL
8 #ifndef MSP_STRINGS_CODEC_H_
9 #define MSP_STRINGS_CODEC_H_
12 #include <msp/core/except.h>
// Integer type used to carry a single Unicode character. It is signed, which
// lets decoders return -1 as a "no character" value.
17 typedef int UnicodeChar;
// String of UnicodeChar elements; the representation used for unencoded
// Unicode text, as opposed to std::string which holds encoded bytes.
// NOTE(review): std::basic_string<int> depends on std::char_traits<int>, which
// the C++ standard does not require implementations to provide -- confirm
// that the supported toolchains accept this instantiation.
19 typedef std::basic_string<UnicodeChar> ustring;
29 An exception thrown for all kinds of problems encountered while encoding or decoding strings.
32 class CodecError: public Exception
35 CodecError(const std::string &w_): Exception(w_) { }
39 Base class for string codecs. Use one of the derived classes or the function
40 create_codec to create a specific codec.
42 Unicode strings are represented as ustrings. An std::string is considered to
43 be an encoded sequence of bytes. A codec is able to determine if an encoded
44 string could be decoded with it.
50 Base class for string encoder.
52 Each codec class should contain an Encoder class derived from this. The
53 encode_char and transliterate functions must be overridden. Some encoders
54 may find it useful or necessary to implement some other functions too
55 (particularly sync and reset for stateful codecs).
60 virtual ~Encoder() { }
63 Encodes a single unicode character. If the character can't be
64 represented in this encoding, behavior depends on the error mode
65 specified for the encoder:
67 For THROW_ON_ERROR, an exception is thrown.
69 For IGNORE_ERRORS, nothing is done.
71 For TRANSLITERATE, the encoder attempts to select a character or a string
72 of characters that closely approximates the non-representable character.
74 virtual void encode_char(UnicodeChar ch, std::string &buf) =0;
77 Encodes a unicode string. This is equivalent to calling encode_char for
78 each character in the string with the same buffer.
80 virtual void encode(const ustring &str, std::string &buf);
82 std::string encode(const ustring &);
85 Produces a sequence of bytes that will bring the encoder back to the initial state.
88 virtual void sync(std::string &buf) { (void)buf; }
91 Resets the encoder to the initial state without producing output.
93 virtual void reset() { }
97 Encoder(ErrorMode em): err_mode(em) { }
100 Handles an error depending on the error mode.
102 For THROW_ON_ERROR, throws CodecError(msg).
104 For IGNORE_ERRORS, does nothing.
106 For TRANSLITERATE, calls transliterate(ch, buf).
108 void error(UnicodeChar ch, std::string &buf, const std::string &msg);
111 Attempts to produce an alternative encoding for a unicode character.
112 Typically this includes dropping accent marks or romanizing letters.
114 virtual void transliterate(UnicodeChar ch, std::string &buf) =0;
118 Base class for string decoder.
120 Each codec class should contain a Decoder class derived from this.
125 virtual ~Decoder() { }
128 Decodes a single character from a string. The iterator is advanced to
129 the next character. For stateful codecs, -1 may be returned if a state
130 change sequence was decoded but no character followed it. In case a
131 decoding error occurs, behavior depends on the error mode specified for the decoder:
134 For THROW_ON_ERROR, an exception is thrown and the iterator is left at
135 the erroneous character.
137 For IGNORE_ERRORS, -1 is returned and the iterator is advanced.
139 For TRANSLITERATE, 0xFFFE is returned and the iterator is advanced.
141 virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0;
146 virtual void decode(const std::string &str, ustring &buf);
148 ustring decode(const std::string &);
151 Resets the decoder to the initial state.
153 virtual void reset() { }
157 Decoder(ErrorMode em): err_mode(em) { }
160 Handles an error depending on the error mode.
162 UnicodeChar error(const std::string &);
168 Returns the name of the encoding handled by this codec.
170 virtual const char *get_name() const =0;
173 Creates an encoder for this codec.
175 virtual Encoder *create_encoder(ErrorMode err_mode=THROW_ON_ERROR) const =0;
178 Creates a decoder for this codec.
180 virtual Decoder *create_decoder(ErrorMode err_mode=THROW_ON_ERROR) const =0;
183 Determines whether the given string can be successfully decoded with this
184 codec. Note that this function returning true does not guarantee that the
185 string was actually encoded with this codec. In particular, many 8-bit
186 encodings are indistinguishable.
188 virtual bool detect(const std::string &) const;
// Shorthand aliases so that the encoder and decoder base classes can be
// referred to without the Codec:: qualifier.
193 typedef Codec::Encoder Encoder;
194 typedef Codec::Decoder Decoder;
197 Convenience function that decodes a string.
200 ustring decode(const std::string &s)
202 typename C::Decoder dec;
204 dec.decode(s, result);
209 Convenience function that encodes a string.
212 std::string encode(const ustring &s)
214 typename C::Encoder enc;
216 enc.encode(s, result);
222 Convenience function that transcodes a string from one codec to another.
224 template<class F, class T>
225 std::string transcode(const std::string &s)
227 typename F::Decoder from;
228 typename T::Encoder to;
230 from.decode(s, temp);
232 to.encode(temp, result);
238 Creates a codec for an encoding by name. The caller is responsible for
239 deleting the codec when it's no longer needed.
241 Codec *create_codec(const std::string &);
244 Automatically detects the encoding of a string and creates a codec for it.
245 The codec must be deleted when it's no longer needed.
247 Codec *detect_codec(const std::string &);
249 } // namespace Codecs