#include "iso2022jp.h"
#include "iso646fi.h"
#include "iso88591.h"
+#include "iso885915.h"
#include "jisx0201.h"
#include "jisx0208.h"
#include "utf8.h"
if(name=="iso2022jp") return new Iso2022Jp;
if(name=="iso646fi") return new Iso646Fi;
if(name=="iso88591" || name=="latin1") return new Iso88591;
+ if(name=="iso885915" || name=="latin9") return new Iso885915;
if(name=="jisx0201") return new JisX0201;
if(name=="jisx0208") return new JisX0208;
if(name=="utf8") return new Utf8;
Base class for string codecs. Use one of the derived classes or the function
create_codec to create a specific codec.
-For the purposes of this class, an ustring is considered to contain
-Unicode characters and an std::string is considered to be an encoded sequence
-of bytes. A codec is able to determine if an encoded string could be decoded
-with it.
+Unicode strings are represented as ustrings. An std::string is considered to
+be an encoded sequence of bytes. A codec is able to determine if an encoded
+string could be decoded with it.
*/
class Codec
{
public:
/**
- Base class for string encoder. Each codec class should contain an Encoder
- class derived from this.
+ Base class for string encoder.
+
+ Each codec class should contain an Encoder class derived from this. The
+ encode_char and transliterate functions must be overridden. Some encoders
+ may find it useful or necessary to override some other functions too
+ (particularly sync and reset for stateful codecs).
*/
class Encoder
{
public:
virtual ~Encoder() { }
+ /**
+ Encodes a single unicode character. If the character can't be
+ represented in this encoding, behavior depends on the error mode
+ specified for the encoder:
+
+ For THROW_ON_ERROR, an exception is thrown.
+
+ For IGNORE_ERRORS, nothing is done.
+
+ For TRANSLITERATE, the encoder attempts to select a character or a string
+ of characters that closely approximates the non-representable character.
+ */
virtual void encode_char(UnicodeChar ch, std::string &buf) =0;
+
+ /**
+ Encodes a unicode string. This is equivalent to calling encode_char for
+ each character in the string with the same buffer.
+ */
virtual void encode(const ustring &str, std::string &buf);
+
+ /**
+ Produces a sequence of bytes that will bring the encoder back to the
+ initial state.
+ */
virtual void sync(std::string &buf) { (void)buf; }
+
+ /**
+ Resets the encoder to the initial state without producing output.
+ */
virtual void reset() { }
protected:
ErrorMode err_mode;
Encoder(ErrorMode em): err_mode(em) { }
- void error(UnicodeChar, std::string &, const std::string &);
- virtual void transliterate(UnicodeChar, std::string &) { }
+
+ /**
+ Handles an error depending on the error mode.
+
+ For THROW_ON_ERROR, throws CodecError(msg).
+
+ For IGNORE_ERRORS, does nothing.
+
+ For TRANSLITERATE, calls transliterate(ch, buf).
+ */
+ void error(UnicodeChar ch, std::string &buf, const std::string &msg);
+
+ /**
+ Attempts to produce an alternative encoding for a unicode character.
+ Typically this includes dropping accent marks or romanizing letters.
+ */
+ virtual void transliterate(UnicodeChar ch, std::string &buf) =0;
};
/**
- Base class for string decoder. Each codec class should contain an Decoder
- class derived from this.
+ Base class for string decoder.
+
+ Each codec class should contain a Decoder class derived from this.
*/
class Decoder
{
*/
virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0;
+ /**
+ Decodes a string.
+ */
virtual void decode(const std::string &str, ustring &buf);
+
+ /**
+ Resets the decoder to the initial state.
+ */
virtual void reset() { }
protected:
ErrorMode err_mode;
Decoder(ErrorMode em): err_mode(em) { }
+
+ /**
+ Handles an error depending on the error mode.
+ */
UnicodeChar error(const std::string &);
};
typedef Codec::Decoder Decoder;
/**
-Convenience function that decodes a string using the given codec.
+Convenience function that decodes a string.
*/
template<class C>
ustring decode(const std::string &s)
return result;
}
+/**
+Convenience function that encodes a string.
+*/
template<class C>
std::string encode(const ustring &s)
{
return result;
}
+/**
+Convenience function that transcodes a string from one codec to another.
+*/
template<class F, class T>
std::string transcode(const std::string &s)
{
--- /dev/null
+/* $Id$
+
+This file is part of libmspstrings
+Copyright © 2006-2007 Mikko Rasa
+Distributed under the LGPL
+*/
+
+#include "codecutils.h"
+
+namespace Msp {
+namespace Codecs {
+
+/**
+Translates a character through a table of value pairs, passing characters that
+do not appear in the table through unchanged.
+
+The table holds map_size pairs laid out flat as { a0, b0, a1, b1, ... }.  With
+reverse==false a character equal to some a is translated to the paired b; with
+reverse==true the direction is flipped.  A character that has no translation
+of its own but equals the output side of some pair is rejected with -1, since
+that value is already claimed by the paired character.
+
+An explicit translation always takes priority over a rejection, so the result
+does not depend on the order in which the pairs are listed.  If several pairs
+translate the same character, the first one wins.
+
+Returns the translated character, -1 if the character is rejected, or ch
+itself if it does not appear in the table at all.
+*/
+int transform_mapping_or_direct(const int *mapping, unsigned map_size, int ch, bool reverse)
+{
+	// Offsets inside a pair: which element is matched and which is produced.
+	const unsigned from=(reverse ? 1 : 0);
+	const unsigned to=1-from;
+
+	// Set when ch collides with the output side of a pair.  The decision is
+	// deferred until the whole table has been scanned, because a later pair
+	// may still translate ch explicitly.
+	bool shadowed=false;
+
+	for(unsigned i=0; i<map_size*2; i+=2)
+	{
+		if(mapping[i+from]==ch)
+			return mapping[i+to];
+		if(mapping[i+to]==ch)
+			shadowed=true;
+	}
+
+	return (shadowed ? -1 : ch);
+}
+
+} // namespace Codecs
+} // namespace Msp
--- /dev/null
+/* $Id$
+
+This file is part of libmspstrings
+Copyright © 2006-2007 Mikko Rasa
+Distributed under the LGPL
+*/
+
+#ifndef MSP_STRINGS_CODECUTILS_H_
+#define MSP_STRINGS_CODECUTILS_H_
+
+namespace Msp {
+namespace Codecs {
+
+int transform_mapping_or_direct(const int *mapping, unsigned map_size, int ch, bool reverse); // mapping holds map_size value pairs stored flat; returns the paired value when ch is found on the lookup side (first element of a pair, or second if reverse), -1 when it is found on the opposite side, and ch itself when it is in neither
+
+} // namespace Codecs
+} // namespace Msp
+
+#endif
Distributed under the LGPL
*/
+#include "codecutils.h"
#include "iso646fi.h"
using namespace std;
+namespace {
+
+const unsigned map_size=9; // number of pairs in mapping
+
+const int mapping[map_size*2]= // flat pair table: Unicode code point, then the ISO-646-FI byte that carries it
+{
+ 0xC4, 0x5B, // U+00C4 capital A with diaeresis
+ 0xC5, 0x5D, // U+00C5 capital A with ring above
+ 0xD6, 0x5C, // U+00D6 capital O with diaeresis
+ 0xDC, 0x5E, // U+00DC capital U with diaeresis
+ 0xE4, 0x7B, // U+00E4 small a with diaeresis
+ 0xE5, 0x7D, // U+00E5 small a with ring above
+ 0xE9, 0x60, // U+00E9 small e with acute
+ 0xF6, 0x7C, // U+00F6 small o with diaeresis
+ 0xFC, 0x7E // U+00FC small u with diaeresis
+};
+
+} // namespace
+
namespace Msp {
namespace Codecs {
void Iso646Fi::Encoder::encode_char(UnicodeChar ch, string &buf)
{
- if((ch>=0 && ch<=0x5A) || ch==0x5F || (ch>=0x61 && ch<=0x7A))
- buf+=ch;
- else if(ch==0xC4)
- buf+=0x5B;
- else if(ch==0xC5)
- buf+=0x5D;
- else if(ch==0xD6)
- buf+=0x5C;
- else if(ch==0xDC)
- buf+=0x5E;
- else if(ch==0xE4)
- buf+=0x7B;
- else if(ch==0xE5)
- buf+=0x7D;
- else if(ch==0xE9)
- buf+=0x60;
- else if(ch==0xF6)
- buf+=0x7C;
- else if(ch==0xFC)
- buf+=0x7E;
- else
+ int tch=transform_mapping_or_direct(mapping, map_size, ch, false); // table lookup in the Unicode -> ISO-646-FI direction; characters not in the table come back unchanged
+ if(tch<0 || tch>0x7F) // a negative result or a value that does not fit in 7 bits cannot be encoded
error(ch, buf, "Can't express character in ISO-646-FI");
+ else
+ buf+=tch; // append the encoded byte
}
void Iso646Fi::Encoder::transliterate(UnicodeChar, string &buf)
return error("No input");
unsigned char ch=*i;
- UnicodeChar result=-1;
- if(ch==0x5B)
- result=0xC4;
- else if(ch==0x5C)
- result=0xD6;
- else if(ch==0x5D)
- result=0xC5;
- else if(ch==0x5E)
- result=0xDC;
- else if(ch==0x60)
- result=0xE9;
- else if(ch==0x7B)
- result=0xE4;
- else if(ch==0x7C)
- result=0xF6;
- else if(ch==0x7D)
- result=0xE5;
- else if(ch==0x7E)
- result=0xFC;
- else if(ch<=0x7F)
- result=ch;
- else
+ int tch=(ch<=0x7F ? transform_mapping_or_direct(mapping, map_size, ch, true) : -1);
+
+ UnicodeChar result;
+ if(tch==-1)
result=error("Undefined ISO-646-FI character");
+ else
+ result=tch;
++i;
return result;
--- /dev/null
+/* $Id$
+
+This file is part of libmspstrings
+Copyright © 2006-2007 Mikko Rasa
+Distributed under the LGPL
+*/
+
+#include "codecutils.h"
+#include "iso885915.h"
+
+using namespace std;
+
+namespace {
+
+const unsigned map_size=8; // number of pairs in mapping
+
+const int mapping[map_size*2]= // flat pair table: Unicode code point, then the ISO-8859-15 byte that carries it
+{
+ 0x0152, 0xBC, // U+0152 capital ligature OE
+ 0x0153, 0xBD, // U+0153 small ligature oe
+ 0x0160, 0xA6, // U+0160 capital S with caron
+ 0x0161, 0xA8, // U+0161 small s with caron
+ 0x0178, 0xBE, // U+0178 capital Y with diaeresis
+ 0x017D, 0xB4, // U+017D capital Z with caron
+ 0x017E, 0xB8, // U+017E small z with caron
+ 0x20AC, 0xA4 // U+20AC euro sign
+};
+
+} // namespace
+
+namespace Msp {
+namespace Codecs {
+
+/**
+Appends the ISO-8859-15 byte for a unicode character to buf.  The character is
+first run through the translation table; if the outcome is negative or does
+not fit in a single byte, the problem is handed to error() along with the
+character and the buffer.
+*/
+void Iso885915::Encoder::encode_char(UnicodeChar ch, string &buf)
+{
+	const int encoded=transform_mapping_or_direct(mapping, map_size, ch, false);
+	const bool representable=(encoded>=0 && encoded<=0xFF);
+
+	if(representable)
+		buf+=encoded;
+	else
+		error(ch, buf, "Can't express character in ISO-8859-15");
+}
+
+/**
+Fallback for characters that can't be encoded: regardless of which character
+was given, a single question mark is appended to buf.
+*/
+void Iso885915::Encoder::transliterate(UnicodeChar, string &buf)
+{
+	buf.push_back('?');
+}
+
+
+/**
+Decodes the byte at i into a unicode character.
+
+If i is already at the end of str, the result of error("No input") is returned
+and i is left where it was.  Otherwise the byte is run through the translation
+table in the reverse direction; a result of -1 is reported through error().
+The iterator is advanced past the byte only after the lookup and any error
+reporting have completed.
+*/
+UnicodeChar Iso885915::Decoder::decode_char(const string &str, string::const_iterator &i)
+{
+	if(i==str.end())
+		return error("No input");
+
+	const unsigned char byte=*i;
+	const int decoded=transform_mapping_or_direct(mapping, map_size, byte, true);
+
+	// error() must run before the iterator moves, as in the lookup order above.
+	const UnicodeChar result=(decoded==-1 ? error("Undefined ISO-8859-15 character") : static_cast<UnicodeChar>(decoded));
+
+	++i;
+	return result;
+}
+
+} // namespace Codecs
+} // namespace Msp
--- /dev/null
+/* $Id$
+
+This file is part of libmspstrings
+Copyright © 2007 Mikko Rasa
+Distributed under the LGPL
+*/
+
+#ifndef MSP_STRINGS_ISO885915_H_
+#define MSP_STRINGS_ISO885915_H_
+
+#include "codec.h"
+
+namespace Msp {
+namespace Codecs {
+
+class Iso885915: public Codec // codec for the ISO-8859-15 character set
+{
+public:
+ class Encoder: public Codec::Encoder // turns unicode characters into ISO-8859-15 bytes
+ {
+ public:
+ Encoder(ErrorMode em=THROW_ON_ERROR): Codec::Encoder(em) { } // em is passed on to the base encoder; THROW_ON_ERROR unless specified
+ virtual void encode_char(UnicodeChar, std::string &); // appends the encoding of one character to the string
+ private:
+ virtual void transliterate(UnicodeChar, std::string &); // substitute output for characters that can't be encoded
+ };
+
+ class Decoder: public Codec::Decoder // turns ISO-8859-15 bytes into unicode characters
+ {
+ public:
+ Decoder(ErrorMode em=THROW_ON_ERROR): Codec::Decoder(em) { } // em is passed on to the base decoder; THROW_ON_ERROR unless specified
+ virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &); // decodes the character at the iterator position
+ };
+
+ Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } // allocates a new Encoder with new; presumably the caller is responsible for deleting it - TODO confirm
+ Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } // allocates a new Decoder with new; presumably the caller is responsible for deleting it - TODO confirm
+};
+
+} // namespace Codecs
+} // namespace Msp
+
+#endif