1 #ifndef MSP_STRINGS_CODEC_H_
2 #define MSP_STRINGS_CODEC_H_
5 #include <msp/core/except.h>
// Type used to hold a single unicode character.
10 typedef int UnicodeChar;
// String of UnicodeChar values; this is the representation used for unicode strings.
12 typedef std::basic_string<UnicodeChar> ustring;
22 An exception thrown for all kinds of problems encountered while encoding or decoding strings.
// Codec-specific exception type, derived from Exception.
25 class CodecError: public Exception
// Constructs the error, passing the message w_ on to the Exception base class.
28 CodecError(const std::string &w_): Exception(w_) { }
32 Base class for string codecs. Use one of the derived classes or the function
33 create_codec to create a specific codec.
35 Unicode strings are represented as ustrings. An std::string is considered to
36 be an encoded sequence of bytes. A codec is able to determine if an encoded
37 string could be decoded with it.
43 Base class for string encoders.
45 Each codec class should contain an Encoder class derived from this. The
46 encode_char and transliterate functions must be overridden. Some encoders
47 may find it useful or necessary to implement some other functions too
48 (particularly sync and reset for stateful codecs).
// Constructs the encoder, storing the requested error mode em in err_mode.
55 Encoder(ErrorMode em): err_mode(em) { }
// Virtual destructor with an empty body, so that derived encoders can be
// destroyed through a pointer to this base class.
57 virtual ~Encoder() { }
59 /** Encodes a single unicode character. If the character can't be
represented in this encoding, error() should be called.

Pure virtual: every concrete encoder has to implement this. ch is the
character to encode and buf is the output buffer, taken by non-const
reference. NOTE(review): presumably the encoded bytes are appended to buf
rather than replacing its contents -- confirm against the implementations. */
61 virtual void encode_char(UnicodeChar ch, std::string &buf) = 0;
63 /** Encodes a unicode string. This is equivalent to calling encode_char
for each character in the string with the same buffer.

Declared virtual, so a derived encoder may replace it. str is the unicode
input and buf is the output buffer shared by all characters of the string. */
65 virtual void encode(const ustring &str, std::string &buf);
// Overload that takes only the unicode string and returns a std::string by
// value instead of filling a caller-supplied buffer. NOTE(review): presumably
// returns the encoded form of the argument -- the definition is not in this header.
67 std::string encode(const ustring &);
69 /** Produces a sequence of bytes that will bring the encoder back to the initial state. */
71 virtual void sync(std::string &buf) { (void)buf; } // Default implementation does nothing; the (void) cast only marks buf as deliberately unused.
73 /** Resets the encoder to the initial state without producing output.
The default implementation is empty; it is virtual so that derived encoders
can provide their own. */
74 virtual void reset() { }
77 /** Handles an error depending on the error mode.

THROW_ON_ERROR: throws CodecError(msg)
IGNORE_ERRORS: does nothing
TRANSLITERATE: calls transliterate(ch, buf)

ch and buf are the character and output buffer that get handed on to
transliterate; msg is the message used for the CodecError. */
82 void error(UnicodeChar ch, std::string &buf, const std::string &msg);
84 /** Attempts to produce an alternative encoding for a unicode character.
Typically this includes dropping accent marks or romanizing letters.

Pure virtual: every concrete encoder has to implement this. It is the
function error() calls when the error mode is TRANSLITERATE. */
86 virtual void transliterate(UnicodeChar ch, std::string &buf) = 0;
90 Base class for string decoders.
92 Each codec class should contain a Decoder class derived from this.
// Constructs the decoder, storing the requested error mode em in err_mode.
99 Decoder(ErrorMode em): err_mode(em) { }
// Virtual destructor with an empty body, so that derived decoders can be
// destroyed through a pointer to this base class.
101 virtual ~Decoder() { }
103 /** Decodes a single character from a string. The iterator is advanced
to the next character. For stateful codecs, -1 may be returned if a
state change sequence was decoded but no character followed it. If
invalid input is encountered, the error() function should be called and
the iterator advanced only if it doesn't throw.

Pure virtual: every concrete decoder has to implement this. The iterator i
is taken by reference, which is how the advanced position gets back to the
caller. NOTE(review): i is presumably expected to point into str -- confirm
against the callers. */
108 virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) = 0;
110 /** Decodes a string. str is the encoded input and buf receives the
decoded unicode characters. Declared virtual, so a derived decoder may
replace it. NOTE(review): whether buf is appended to or overwritten is not
visible in this header -- confirm in the implementation. */
111 virtual void decode(const std::string &str, ustring &buf);
// Overload that takes only the encoded string and returns a ustring by value
// instead of filling a caller-supplied buffer. NOTE(review): presumably
// returns the decoded form of the argument -- the definition is not in this header.
113 ustring decode(const std::string &);
115 /** Resets the decoder to the initial state. The default implementation
is empty; it is virtual so that derived decoders can provide their own. */
116 virtual void reset() { }
119 /** Handles an error depending on the error mode. The return value is
suitable for returning from decode_char.

THROW_ON_ERROR: throws CodecError(msg)
IGNORE_ERRORS: returns -1
TRANSLITERATE: returns 0xFFFE */
125 UnicodeChar error(const std::string &msg);
133 /** Returns the name of the encoding handled by this codec. Pure virtual:
every concrete codec has to implement this. */
134 virtual const char *get_name() const = 0;
136 /** Creates an encoder for this codec. */
137 virtual Encoder *create_encoder(ErrorMode err_mode = THROW_ON_ERROR) const = 0;
139 /** Creates a decoder for this codec. err_mode is the error mode for the
new decoder and defaults to THROW_ON_ERROR. Pure virtual: every concrete
codec has to implement this. NOTE(review): a raw pointer is returned;
presumably the caller owns the decoder and must delete it -- confirm. */
140 virtual Decoder *create_decoder(ErrorMode err_mode = THROW_ON_ERROR) const = 0;
142 /** Determines whether the given string can be successfully decoded with
this codec. Note that this function returning true does not guarantee that
the string was actually encoded with this codec. In particular, many 8-bit
encodings are indistinguishable.

Virtual but not pure: a base implementation is declared here, which derived
codecs may override. */
146 virtual bool detect(const std::string &) const;
// Shorthand aliases so that the classes nested in Codec can be referred to
// as plain Encoder and Decoder.
149 typedef Codec::Encoder Encoder;
150 typedef Codec::Decoder Decoder;
152 /** Convenience function that decodes a string. The decoder used is a
default-constructed object of the Decoder class nested in codec type C. */
154 ustring decode(const std::string &s)
156 typename C::Decoder dec;
158 dec.decode(s, result); // Decode the whole input s into result.
162 /** Convenience function that encodes a string. The encoder used is a
default-constructed object of the Encoder class nested in codec type C. */
164 std::string encode(const ustring &s)
166 typename C::Encoder enc;
168 enc.encode(s, result); // Encode the whole input s into result.
173 /** Convenience function that transcodes a string from one codec to another.
F is the source codec type and T the target codec type: s is first decoded
with a default-constructed F::Decoder, and the decoded text is then encoded
with a default-constructed T::Encoder. */
174 template<class F, class T>
175 std::string transcode(const std::string &s)
177 typename F::Decoder from;
178 typename T::Encoder to;
180 from.decode(s, temp); // Step 1: decode the input into the intermediate string temp.
182 to.encode(temp, result); // Step 2: encode temp with the target codec into result.
187 /** Creates a codec for an encoding by name. The argument is the name of
the encoding. The caller is responsible for deleting the codec when it's no
longer needed. */
189 Codec *create_codec(const std::string &);
191 /** Automatically detects the encoding of a string and creates a codec for it.
The argument is the string whose encoding is to be detected. The codec must
be deleted when it's no longer needed. */
193 Codec *detect_codec(const std::string &);
195 } // namespace Codecs