1 #ifndef MSP_STRINGCODEC_CODEC_H_
2 #define MSP_STRINGCODEC_CODEC_H_
5 #include <msp/core/mspcore_api.h>
10 namespace StringCodec {
22 Base class for string codecs. Use one of the derived classes or the function
23 create_codec to create a specific codec.
25 Unicode strings are represented as ustrings. An std::string is considered to
26 be an encoded sequence of bytes. A codec is able to determine if an encoded
27 string could be decoded with it.
29 class MSPCORE_API Codec
33 Base class for string encoders.
35 Each codec class should contain an Encoder class derived from this. The
36 encode_char and transliterate functions must be overridden. Some encoders
37 may find it useful or necessary to implement some other functions too
38 (particularly sync and reset for stateful codecs).
// Abstract interface for encoders. encode_char() and transliterate() are pure
// virtual; encode(), sync() and reset() are virtual with base versions.
// NOTE(review): braces, access specifiers and template headers of this class
// are not visible in this listing.
40 class MSPCORE_API Encoder
// Active error handling policy; THROW_ON_ERROR unless the constructor is
// given another mode.
43 ErrorMode err_mode = THROW_ON_ERROR;
// DEFAULT is resolved to THROW_ON_ERROR, so this constructor never stores
// DEFAULT in err_mode.
45 Encoder(ErrorMode em): err_mode(em==DEFAULT ? THROW_ON_ERROR : em) { }
// Virtual so that encoders can be destroyed through an Encoder pointer.
47 virtual ~Encoder() = default;
49 /** Encodes a single unicode character. If the character can't be
50 represented in this encoding, error() should be called. */
51 virtual void encode_char(unichar ch, std::string &buf) = 0;
53 /** Encodes a unicode string. This is equivalent to calling encode_char
54 for each character in the string with the same buffer. */
55 virtual void encode(const ustring &str, std::string &buf);
// Non-virtual overload that returns the encoded bytes as a new string instead
// of writing to a caller-supplied buffer. Only declared here; presumably
// implemented with the two-argument encode -- confirm in the source file.
57 std::string encode(const ustring &);
// NOTE(review): the comment below is cut off in this listing; the end of its
// sentence and its closing delimiter are missing and should be restored.
59 /** Produces a sequence of bytes that will bring the encoder back to the
61 virtual void sync(std::string &buf) { (void)buf; }
63 /** Resets the encoder to the initial state without producing output. */
64 virtual void reset() { }
67 /** Handles an error depending on the error mode.
69 THROW_ON_ERROR: throws err
70 IGNORE_ERRORS: does nothing
71 TRANSLITERATE: calls transliterate(ch, buf) */
// E is the type of the error object to throw; presumably a template parameter
// whose declaration is not visible in this listing -- confirm.
73 void error(unichar ch, std::string &buf, const E &err)
75 if(err_mode==TRANSLITERATE)
76 transliterate(ch, buf);
77 else if(err_mode!=IGNORE_ERRORS)
// NOTE(review): the statement governed by the else-if above is not visible
// here; according to the comment on this function it throws err.
81 /** Attempts to produce an alternative encoding for a unicode character.
82 Typically this includes dropping accent marks or romanizing letters. */
83 virtual void transliterate(unichar ch, std::string &buf) = 0;
87 Base class for string decoders.
89 Each codec class should contain a Decoder class derived from this. The decode_char function must be overridden.
// Abstract interface for decoders. decode_char() is pure virtual; decode()
// and reset() are virtual with base versions.
// NOTE(review): braces, access specifiers and template headers of this class
// are not visible in this listing.
91 class MSPCORE_API Decoder
// Active error handling policy; THROW_ON_ERROR unless the constructor is
// given another mode.
94 ErrorMode err_mode = THROW_ON_ERROR;
// DEFAULT is resolved to THROW_ON_ERROR, so this constructor never stores
// DEFAULT in err_mode.
96 Decoder(ErrorMode em): err_mode(em==DEFAULT ? THROW_ON_ERROR : em) { }
// Virtual so that decoders can be destroyed through a Decoder pointer.
98 virtual ~Decoder() = default;
100 /** Decodes a single character from a string. The iterator is advanced
101 to the next character. For stateful codecs, -1 may be returned if a
102 state change sequence was decoded but no character followed it. If
103 invalid input is encountered, the error() function should be called and
104 the iterator advanced only if it doesn't throw. */
105 virtual unichar decode_char(const std::string &str, std::string::const_iterator &i) = 0;
// NOTE(review): whether buf is cleared first or appended to is not visible in
// this header -- confirm in the implementation.
107 /** Decodes the encoded bytes in str into the unicode string buf. */
108 virtual void decode(const std::string &str, ustring &buf);
// Non-virtual overload that returns the decoded string instead of writing to
// a caller-supplied buffer. Only declared here.
110 ustring decode(const std::string &);
112 /** Resets the decoder to the initial state. */
113 virtual void reset() { }
116 /** Handles an error depending on the error mode. The return value is
117 suitable for returning from decode_char.
119 THROW_ON_ERROR: throws err
120 IGNORE_ERRORS: returns -1
121 TRANSLITERATE: returns 0xFFFD (U+FFFD, the Unicode replacement character) */
// E is the type of the error object to throw; presumably a template parameter
// whose declaration is not visible in this listing -- confirm.
123 unichar error(const E &err)
125 if(err_mode==TRANSLITERATE)
127 else if(err_mode==IGNORE_ERRORS)
// NOTE(review): the return and throw statements of the branches above are not
// visible in this listing; the comment on this function gives the intended
// result of each mode.
// Virtual, defaulted destructor so that a codec can be destroyed through a
// Codec pointer, which is what callers of create_codec and detect_codec are
// told to do.
137 virtual ~Codec() = default;
139 /** Returns the name of the encoding handled by this codec. Every concrete codec must implement this. */
140 virtual const char *get_name() const = 0;
// StandardCodec's override allocates the encoder with new and treats DEFAULT
// as "use the codec's own error mode"; the caller presumably owns the returned
// object -- confirm against callers.
142 /** Creates an encoder for this codec, using the given error mode. */
143 virtual Encoder *create_encoder(ErrorMode err_mode = DEFAULT) const = 0;
// StandardCodec's override allocates the decoder with new and treats DEFAULT
// as "use the codec's own error mode"; the caller presumably owns the returned
// object -- confirm against callers.
145 /** Creates a decoder for this codec, using the given error mode. */
146 virtual Decoder *create_decoder(ErrorMode err_mode = DEFAULT) const = 0;
// Virtual but not pure: a base implementation is declared here and defined
// outside this header, so concrete codecs may override it but need not.
148 /** Determines whether the given string can be successfully decoded with
149 this codec. Note that this function returning true does not guarantee that
150 the string was actually encoded with this codec. In particular, many 8-bit
151 encodings are indistinguishable. */
152 virtual bool detect(const std::string &) const;
// Aliases declared after the Codec class so that its nested Encoder and
// Decoder base classes can be named without the Codec:: qualifier.
155 typedef Codec::Encoder Encoder;
156 typedef Codec::Decoder Decoder;
160 A helper class that provides common functionality for codecs: it stores a default error mode and implements create_encoder and create_decoder.
// Codec base that remembers a default error mode and builds encoders and
// decoders from the nested Encoder and Decoder types of C.
// NOTE(review): C is presumably a template parameter naming the concrete
// codec class; its declaration is not visible in this listing -- confirm.
163 class StandardCodec: public Codec
// Error mode used when a caller does not request a specific one.
166 ErrorMode err_mode = THROW_ON_ERROR;
// DEFAULT is resolved to THROW_ON_ERROR before being stored.
169 StandardCodec(ErrorMode em): err_mode(em==DEFAULT ? THROW_ON_ERROR : em) { }
// Returns em, or this codec's stored err_mode when em is DEFAULT.
171 ErrorMode get_error_mode(ErrorMode em = DEFAULT) const
172 { return (em==DEFAULT ? err_mode : em); }
// Allocates a C::Encoder with new, passing the resolved error mode. A raw
// pointer is returned, so the caller presumably has to delete it -- confirm.
175 Encoder *create_encoder(ErrorMode em = DEFAULT) const override
176 { return new typename C::Encoder(get_error_mode(em)); }
// Allocates a C::Decoder with new, passing the resolved error mode. A raw
// pointer is returned, so the caller presumably has to delete it -- confirm.
178 Decoder *create_decoder(ErrorMode em = DEFAULT) const override
179 { return new typename C::Decoder(get_error_mode(em)); }
183 /** Convenience function that decodes a string with the decoder of codec C and returns the result. */
// A C::Decoder is constructed without arguments for each call, so the codec's
// decoder must be default-constructible.
// NOTE(review): C is presumably a template parameter of this function; its
// declaration is not visible in this listing -- confirm.
185 ustring decode(const std::string &s)
187 typename C::Decoder dec;
188 return dec.decode(s);
191 /** Convenience function that encodes a string with the encoder of codec C and returns the result. */
// A C::Encoder is constructed without arguments for each call, so the codec's
// encoder must be default-constructible.
// NOTE(review): C is presumably a template parameter of this function; its
// declaration is not visible in this listing -- confirm.
193 std::string encode(const ustring &s)
195 typename C::Encoder enc;
196 return enc.encode(s);
199 /** Convenience function that transcodes a string from one codec to another. */
// F is the codec the input is decoded with and T the codec the result is
// encoded with: s is first decoded by decode<F>, then re-encoded by encode<T>.
200 template<typename F, typename T>
201 std::string transcode(const std::string &s)
203 return encode<T>(decode<F>(s));
// NOTE(review): what happens for an unrecognised encoding name (exception or
// null result) is not visible in this header -- confirm in the implementation.
206 /** Creates a codec for an encoding by name. The caller is responsible for
207 deleting the codec when it's no longer needed. */
208 MSPCORE_API Codec *create_codec(const std::string &);
// NOTE(review): the result when no encoding can be detected is not documented
// in this header -- confirm in the implementation.
210 /** Automatically detects the encoding of a string and creates a codec for it.
211 The codec must be deleted when it's no longer needed. */
212 MSPCORE_API Codec *detect_codec(const std::string &);
214 } // namespace StringCodec