X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fcodec.h;h=bb35b0b43528ca4eba14dc66682f43ae26bca651;hp=1f012710f636f0acc208aff0bc87ab3edad8ab5e;hb=b254c706617223da1dd1b9543a74715e42a8a5b0;hpb=9da6abdcabec59f4845da256a8ad75a810ed1589

diff --git a/source/codec.h b/source/codec.h
index 1f01271..bb35b0b 100644
--- a/source/codec.h
+++ b/source/codec.h
@@ -9,9 +9,21 @@ Distributed under the LGPL
 #define MSP_STRINGS_CODEC_H_
 
 #include <string>
-#include <msp/error.h>
+#include <msp/core/except.h>
 
 namespace Msp {
+namespace Codecs {
+
+typedef int UnicodeChar;
+
+typedef std::basic_string<UnicodeChar> ustring;
+
+enum ErrorMode
+{
+	THROW_ON_ERROR,
+	IGNORE_ERRORS,
+	TRANSLITERATE
+};
 
 /**
 An exception thrown for all kinds of problems encountered while encoding or
@@ -27,149 +39,199 @@ public:
 Base class for string codecs.  Use one of the derived classes or the function
 create_codec to create a specific codec.
 
-For the purposes of this class, an std::wstring is considered to contain
-Unicode characters and an std::string is considered to be an encoded sequence
-of bytes.  A codec is able to determine if an encoded string could be decoded
-with it.
+Unicode strings are represented as ustrings.  An std::string is considered to
+be an encoded sequence of bytes.  A codec is able to determine if an encoded
+string could be decoded with it.
 */
-class StringCodec
+class Codec
 {
 public:
-	enum ErrorMode
-	{
-		THROW_ON_ERROR,
-		IGNORE_ERRORS,
-		REPLACE_ERRORS
-	};
-
 	/**
-	Base class for string encoder.  Each codec class should contain an Encoder
-	class derived from this.
+	Base class for string encoder.
+
+	Each codec class should contain an Encoder class derived from this.  The
+	encode_char and transliterate functions must be overridden.  Some encoders
+	may find it useful or necessary to implement some other functions too
+	(particularly sync and reset for stateful codecs).
 	*/
 	class Encoder
 	{
 	public:
+		virtual ~Encoder() { }
+
 		/**
-		Encodes a single character.  Derived classes should use the append
-		function to put the result into the internal buffer.
+		Encodes a single unicode character.  If the character can't be
+		represented in this encoding, behavior depends on the error mode
+		specified for the encoder:
+
+		For THROW_ON_ERROR, an exception is thrown.
+
+		For IGNORE_ERRORS, nothing is done.
+
+		For TRANSLITERATE, the encoder attempts to select a character or a string
+		of characters that closely approximates the non-representable character.
 		*/
-		virtual void encode_char(wchar_t) =0;
+		virtual void encode_char(UnicodeChar ch, std::string &buf) =0;
 
 		/**
-		Encodes a string.
+		Encodes a unicode string.  This is equivalent to calling encode_char for
+		each character in the string with the same buffer.
 		*/
-		virtual void encode(const std::wstring &s)
-		{ for(std::wstring::const_iterator i=s.begin(); i!=s.end(); ++i) encode_char(*i); }
+		virtual void encode(const ustring &str, std::string &buf);
 
 		/**
-		Brings the encoder back to its initial state.  This allows the encoded
-		sequence to be extracted or flushed without loss of integrity.
+		Produces a sequence of bytes that will bring the encoder back to the
+		initial state.
 		*/
-		virtual void sync() { }
+		virtual void sync(std::string &buf) { (void)buf; }
 
 		/**
-		Returns a reference to the encoded sequence.  Call sync() first to make
-		sure it's a valid encoded string by itself.
+		Resets the encoder to the initial state without producing output.
 		*/
-		const std::string &get() const { return buffer_; }
+		virtual void reset() { }
+	protected:
+		ErrorMode err_mode;
+
+		Encoder(ErrorMode em): err_mode(em) { }
 
 		/**
-		Returns the number of bytes in the output buffer.
+		Handles an error depending on the error mode.
+
+		For THROW_ON_ERROR, throws CodecError(msg).
+
+		For IGNORE_ERRORS, does nothing.
+
+		For TRANSLITERATE, calls transliterate(ch, buf).
 		*/
-		unsigned size() const { return buffer_.size(); }
+		void error(UnicodeChar ch, std::string &buf, const std::string &msg);
 
 		/**
-		Clears the encoded sequence.  Encoder state is left intact.
+		Attempts to produce an alternative encoding for a unicode character.
+		Typically this includes dropping accent marks or romanizing letters.
 		*/
-		void flush() { buffer_.clear(); }
-
-		virtual ~Encoder() { }
-	protected:
-		Encoder(ErrorMode em=THROW_ON_ERROR): err_mode_(em) { }
-		void append(char c) { buffer_+=c; }
-		void append(const char *s, unsigned l) { buffer_.append(s, l); }
-		void append(const std::string &s) { buffer_+=s; }
-		void error(const std::string &);
-		virtual void append_replacement() { }
-	private:
-		ErrorMode err_mode_;
-		std::string buffer_;
+		virtual void transliterate(UnicodeChar ch, std::string &buf) =0;
 	};
 
 	/**
-	Base class for string decoder.  Each codec class should contain an Decoder
-	class derived from this.
+	Base class for string decoder.
+
+	Each codec class should contain a Decoder class derived from this.
 	*/
 	class Decoder
 	{
 	public:
-		virtual void decode_char(const std::string &, std::string::const_iterator &) =0;
-		virtual void decode(const std::string &s)
-		{ for(std::string::const_iterator i=s.begin(); i!=s.end(); ) decode_char(s, i); }
+		virtual ~Decoder() { }
 
 		/**
-		Ensures that all input has been processed.  If this is not the case any
-		buffers are cleared and an error is triggered.
+		Decodes a single character from a string.  The iterator is advanced to
+		the next character.  For stateful codecs, -1 may be returned if a state
+		change sequence was decoded but no character followed it.  In case a
+		decoding error occurs, behavior depends on the error mode specified for
+		the decoder:
+
+		For THROW_ON_ERROR, an exception is thrown and the iterator is left at
+		the erroneous character.
+
+		For IGNORE_ERRORS, -1 is returned and the iterator is advanced.
+
+		For TRANSLITERATE, 0xFFFE is returned and the iterator is advanced.
 		*/
-		virtual void sync() { }
+		virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0;
 
-		const std::wstring &get() const { return buffer_; }
-		unsigned size() const { return buffer_.size(); }
-		void flush() { buffer_.clear(); }
-		virtual ~Decoder() { }
+		/**
+		Decodes a string.
+		*/
+		virtual void decode(const std::string &str, ustring &buf);
+
+		/**
+		Resets the decoder to the initial state.
+		*/
+		virtual void reset() { }
 	protected:
-		Decoder(ErrorMode em): err_mode_(em) { }
-		void append(wchar_t c) { buffer_+=c; }
-		void append(const std::wstring &s) { buffer_+=s; }
-		void error(const std::string &);
-	private:
-		ErrorMode err_mode_;
-		std::wstring buffer_;
+		ErrorMode err_mode;
+
+		Decoder(ErrorMode em): err_mode(em) { }
+
+		/**
+		Handles an error depending on the error mode.
+		*/
+		UnicodeChar error(const std::string &);
 	};
 
-	virtual Encoder *create_encoder(ErrorMode =THROW_ON_ERROR) const =0;
-	virtual Decoder *create_decoder(ErrorMode =THROW_ON_ERROR) const =0;
-	virtual bool    detect(const std::string &) const;
-	virtual ~StringCodec() { }
+	virtual ~Codec() { }
+
+	/**
+	Creates an encoder for this codec.
+	*/
+	virtual Encoder *create_encoder(ErrorMode err_mode=THROW_ON_ERROR) const =0;
+
+	/**
+	Creates a decoder for this codec.
+	*/
+	virtual Decoder *create_decoder(ErrorMode err_mode=THROW_ON_ERROR) const =0;
+
+	/**
+	Determines whether the given string can be successfully decoded with this
+	codec.  Note that this function returning true does not guarantee that the
+	string was actually encoded with this codec.  In particular, many 8-bit
+	encodings are indistinguishable.
+	*/
+	virtual bool detect(const std::string &) const;
 protected:
-	StringCodec() { }
+	Codec() { }
 };
 
+typedef Codec::Encoder Encoder;
+typedef Codec::Decoder Decoder;
+
 /**
-Convenience function that decodes a string using the given codec.
+Convenience function that decodes a string.
 */
 template<class C>
-std::wstring decode(const std::string &s)
+ustring decode(const std::string &s)
 {
 	typename C::Decoder dec;
-	dec.decode(s);
-	dec.sync();
-	return dec.get();
+	ustring result;
+	dec.decode(s, result);
+	return result;
 }
 
+/**
+Convenience function that encodes a string.
+*/
 template<class C>
-std::string encode(const std::wstring &s)
+std::string encode(const ustring &s)
 {
 	typename C::Encoder enc;
-	enc.encode(s);
-	enc.sync();
-	return enc.get();
+	std::string result;
+	enc.encode(s, result);
+	enc.sync(result);
+	return result;
 }
 
+/**
+Convenience function that transcodes a string from one codec to another.
+*/
 template<class F, class T>
 std::string transcode(const std::string &s)
 {
 	typename F::Decoder from;
 	typename T::Encoder to;
-	from.decode(s);
-	from.sync();
-	to.encode(from.get());
-	to.sync();
-	return to.get();
+	ustring temp;
+	from.decode(s, temp);
+	std::string result;
+	to.encode(temp, result);
+	to.sync(result);
+	return result;
 }
 
-StringCodec *create_codec(const std::string &);
+/**
+Creates a codec for an encoding by name.  The caller is responsible for
+deleting the codec when it's no longer needed.
+*/
+Codec *create_codec(const std::string &);
 
+} // namespace Codecs
 } // namespace Msp
 
 #endif