3 This file is part of libmspstrings
4 Copyright © 2006-2007 Mikko Rasa
5 Distributed under the LGPL
8 #ifndef MSP_STRINGS_CODEC_H_
9 #define MSP_STRINGS_CODEC_H_
12 #include <msp/core/except.h>
17 typedef int UnicodeChar;
19 typedef std::basic_string<UnicodeChar> ustring;
29 An exception thrown for all kinds of problems encountered while encoding or decoding strings.
32 class CodecError: public Exception
35 CodecError(const std::string &w_): Exception(w_) { }
39 Base class for string codecs. Use one of the derived classes or the function
40 create_codec to create a specific codec.
42 Unicode strings are represented as ustrings. An std::string is considered to
43 be an encoded sequence of bytes. A codec is able to determine if an encoded
44 string could be decoded with it.
50 Base class for string encoder.
52 Each codec class should contain an Encoder class derived from this. The
53 encode_char and transliterate functions must be overridden. Some encoders
54 may find it useful or necessary to implement some other functions too
55 (particularly sync and reset for stateful codecs).
60 virtual ~Encoder() { }
63 Encodes a single unicode character. If the character can't be
64 represented in this encoding, behavior depends on the error mode
65 specified for the encoder:
67 For THROW_ON_ERROR, an exception is thrown.
69 For IGNORE_ERRORS, nothing is done.
71 For TRANSLITERATE, the encoder attempts to select a character or a string
72 of characters that closely approximates the non-representable character.
74 virtual void encode_char(UnicodeChar ch, std::string &buf) =0;
77 Encodes a unicode string. This is equivalent to calling encode_char for
78 each character in the string with the same buffer.
80 virtual void encode(const ustring &str, std::string &buf);
83 Produces a sequence of bytes that will bring the encoder back to the initial state.
86 virtual void sync(std::string &buf) { (void)buf; }
89 Resets the encoder to the initial state without producing output.
91 virtual void reset() { }
95 Encoder(ErrorMode em): err_mode(em) { }
98 Handles an error depending on the error mode.
100 For THROW_ON_ERROR, throws CodecError(msg).
102 For IGNORE_ERRORS, does nothing.
104 For TRANSLITERATE, calls transliterate(ch, buf).
106 void error(UnicodeChar ch, std::string &buf, const std::string &msg);
109 Attempts to produce an alternative encoding for a unicode character.
110 Typically this includes dropping accent marks or romanizing letters.
112 virtual void transliterate(UnicodeChar ch, std::string &buf) =0;
116 Base class for string decoder.
118 Each codec class should contain a Decoder class derived from this.
123 virtual ~Decoder() { }
126 Decodes a single character from a string. The iterator is advanced to
127 the next character. For stateful codecs, -1 may be returned if a state
128 change sequence was decoded but no character followed it. In case a
129 decoding error occurs, behavior depends on the error mode specified for the decoder:
132 For THROW_ON_ERROR, an exception is thrown and the iterator is left at
133 the erroneous character.
135 For IGNORE_ERRORS, -1 is returned and the iterator is advanced.
137 For TRANSLITERATE, 0xFFFE is returned and the iterator is advanced.
139 virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0;
144 virtual void decode(const std::string &str, ustring &buf);
147 Resets the decoder to the initial state.
149 virtual void reset() { }
153 Decoder(ErrorMode em): err_mode(em) { }
156 Handles an error depending on the error mode.
158 UnicodeChar error(const std::string &);
164 Creates an encoder for this codec.
166 virtual Encoder *create_encoder(ErrorMode err_mode=THROW_ON_ERROR) const =0;
169 Creates a decoder for this codec.
171 virtual Decoder *create_decoder(ErrorMode err_mode=THROW_ON_ERROR) const =0;
174 Determines whether the given string can be successfully decoded with this
175 codec. Note that this function returning true does not guarantee that the
176 string was actually encoded with this codec. In particular, many 8-bit
177 encodings are indistinguishable.
179 virtual bool detect(const std::string &) const;
184 typedef Codec::Encoder Encoder;
185 typedef Codec::Decoder Decoder;
188 Convenience function that decodes a string.
191 ustring decode(const std::string &s)
193 typename C::Decoder dec;
195 dec.decode(s, result);
200 Convenience function that encodes a string.
203 std::string encode(const ustring &s)
205 typename C::Encoder enc;
207 enc.encode(s, result);
213 Convenience function that transcodes a string from one codec to another.
215 template<class F, class T>
216 std::string transcode(const std::string &s)
218 typename F::Decoder from;
219 typename T::Encoder to;
221 from.decode(s, temp);
223 to.encode(temp, result);
229 Creates a codec for an encoding by name. The caller is responsible for
230 deleting the codec when it's no longer needed.
232 Codec *create_codec(const std::string &);
234 } // namespace Codecs