From: Mikko Rasa Date: Fri, 10 Aug 2007 18:58:07 +0000 (+0000) Subject: Add a function to perform simple character mapping X-Git-Tag: strings-1.0~10 X-Git-Url: http://git.tdb.fi/?a=commitdiff_plain;h=c88a239ff3b218ad0226e8c53356350ccc1f7014;p=libs%2Fcore.git Add a function to perform simple character mapping Add ISO-8859-15 encoding More documentation in codec.h (nearly complete) --- diff --git a/source/codec.cpp b/source/codec.cpp index c38d828..865176e 100644 --- a/source/codec.cpp +++ b/source/codec.cpp @@ -10,6 +10,7 @@ Distributed under the LGPL #include "iso2022jp.h" #include "iso646fi.h" #include "iso88591.h" +#include "iso885915.h" #include "jisx0201.h" #include "jisx0208.h" #include "utf8.h" @@ -97,6 +98,7 @@ Codec *create_codec(const string &n) if(name=="iso2022jp") return new Iso2022Jp; if(name=="iso646fi") return new Iso646Fi; if(name=="iso88591" || name=="latin1") return new Iso88591; + if(name=="iso885915" || name=="latin9") return new Iso885915; if(name=="jisx0201") return new JisX0201; if(name=="jisx0208") return new JisX0208; if(name=="utf8") return new Utf8; diff --git a/source/codec.h b/source/codec.h index bed3d93..3470ccc 100644 --- a/source/codec.h +++ b/source/codec.h @@ -39,38 +39,83 @@ public: Base class for string codecs. Use one of the derived classes or the function create_codec to create a specific codec. -For the purposes of this class, an ustring is considered to contain -Unicode characters and an std::string is considered to be an encoded sequence -of bytes. A codec is able to determine if an encoded string could be decoded -with it. +Unicode strings are represented as ustrings. An std::string is considered to +be an encoded sequence of bytes. A codec is able to determine if an encoded +string could be decoded with it. */ class Codec { public: /** - Base class for string encoder. Each codec class should contain an Encoder - class derived from this. + Base class for string encoder. 
+ + Each codec class should contain an Encoder class derived from this. The + encode_char and transliterate functions must be overridden. Some encoders + may find it useful or necessary to implement some other functions too + (particularly sync and reset for stateful codecs). */ class Encoder { public: virtual ~Encoder() { } + /** + Encodes a single unicode character. If the character can't be + represented in this encoding, behavior depends on the error mode + specified for the encoder: + + For THROW_ON_ERROR, an exception is thrown. + + For IGNORE_ERRORS, nothing is done. + + For TRANSLITERATE, the encoder attempts to select a character or a string + of characters that closely approximates the non-representable character. + */ virtual void encode_char(UnicodeChar ch, std::string &buf) =0; + + /** + Encodes a unicode string. This is equivalent to calling encode_char for + each character in the string with the same buffer. + */ virtual void encode(const ustring &str, std::string &buf); + + /** + Produces a sequence of bytes that will bring the encoder back to the + initial state. + */ virtual void sync(std::string &buf) { (void)buf; } + + /** + Resets the encoder to the initial state without producing output. + */ virtual void reset() { } protected: ErrorMode err_mode; Encoder(ErrorMode em): err_mode(em) { } - void error(UnicodeChar, std::string &, const std::string &); - virtual void transliterate(UnicodeChar, std::string &) { } + + /** + Handles an error depending on the error mode. + + For THROW_ON_ERROR, throws CodecError(msg). + + For IGNORE_ERRORS, does nothing. + + For TRANSLITERATE, calls transliterate(ch, buf). + */ + void error(UnicodeChar ch, std::string &buf, const std::string &msg); + + /** + Attempts to produce an alternative encoding for a unicode character. + Typically this includes dropping accent marks or romanizing letters. + */ + virtual void transliterate(UnicodeChar ch, std::string &buf) =0; }; /** - Base class for string decoder. 
Each codec class should contain an Decoder - class derived from this. + Base class for string decoder. + + Each codec class should contain a Decoder class derived from this. */ class Decoder { @@ -93,12 +138,23 @@ public: */ virtual UnicodeChar decode_char(const std::string &str, std::string::const_iterator &i) =0; + /** + Decodes a string. + */ virtual void decode(const std::string &str, ustring &buf); + + /** + Resets the decoder to the initial state. + */ virtual void reset() { } protected: ErrorMode err_mode; Decoder(ErrorMode em): err_mode(em) { } + + /** + Handles an error depending on the error mode. + */ UnicodeChar error(const std::string &); }; @@ -129,7 +185,7 @@ typedef Codec::Encoder Encoder; typedef Codec::Decoder Decoder; /** -Convenience function that decodes a string using the given codec. +Convenience function that decodes a string. */ template ustring decode(const std::string &s) @@ -140,6 +196,9 @@ ustring decode(const std::string &s) return result; } +/** +Convenience function that encodes a string. +*/ template std::string encode(const ustring &s) { @@ -150,6 +209,9 @@ std::string encode(const ustring &s) return result; } +/** +Convenience function that transcodes a string from one codec to another. 
+*/ template std::string transcode(const std::string &s) { diff --git a/source/codecutils.cpp b/source/codecutils.cpp new file mode 100644 index 0000000..6afbf95 --- /dev/null +++ b/source/codecutils.cpp @@ -0,0 +1,27 @@ +/* $Id$ + +This file is part of libmspstrings +Copyright © 2006-2007 Mikko Rasa +Distributed under the LGPL +*/ + +#include "codecutils.h" + +namespace Msp { +namespace Codecs { + +int transform_mapping_or_direct(const int *mapping, unsigned map_size, int ch, bool reverse) +{ + for(unsigned i=0; i=0 && ch<=0x5A) || ch==0x5F || (ch>=0x61 && ch<=0x7A)) - buf+=ch; - else if(ch==0xC4) - buf+=0x5B; - else if(ch==0xC5) - buf+=0x5D; - else if(ch==0xD6) - buf+=0x5C; - else if(ch==0xDC) - buf+=0x5E; - else if(ch==0xE4) - buf+=0x7B; - else if(ch==0xE5) - buf+=0x7D; - else if(ch==0xE9) - buf+=0x60; - else if(ch==0xF6) - buf+=0x7C; - else if(ch==0xFC) - buf+=0x7E; - else + int tch=transform_mapping_or_direct(mapping, map_size, ch, false); + if(tch<0 || tch>0x7F) error(ch, buf, "Can't express character in ISO-646-FI"); + else + buf+=tch; } void Iso646Fi::Encoder::transliterate(UnicodeChar, string &buf) @@ -50,29 +53,13 @@ UnicodeChar Iso646Fi::Decoder::decode_char(const string &str, string::const_iter return error("No input"); unsigned char ch=*i; - UnicodeChar result=-1; - if(ch==0x5B) - result=0xC4; - else if(ch==0x5C) - result=0xD6; - else if(ch==0x5D) - result=0xC5; - else if(ch==0x5E) - result=0xDC; - else if(ch==0x60) - result=0xE9; - else if(ch==0x7B) - result=0xE4; - else if(ch==0x7C) - result=0xF6; - else if(ch==0x7D) - result=0xE5; - else if(ch==0x7E) - result=0xFC; - else if(ch<=0x7F) - result=ch; - else + int tch=(ch<=0x7F ? 
transform_mapping_or_direct(mapping, map_size, ch, true) : -1); + + UnicodeChar result; + if(tch==-1) result=error("Undefined ISO-646-FI character"); + else + result=tch; ++i; return result; diff --git a/source/iso885915.cpp b/source/iso885915.cpp new file mode 100644 index 0000000..b0fc2ef --- /dev/null +++ b/source/iso885915.cpp @@ -0,0 +1,69 @@ +/* $Id$ + +This file is part of libmspstrings +Copyright © 2006-2007 Mikko Rasa +Distributed under the LGPL +*/ + +#include "codecutils.h" +#include "iso885915.h" + +using namespace std; + +namespace { + +const unsigned map_size=8; + +const int mapping[map_size*2]= +{ + 0x0152, 0xBC, + 0x0153, 0xBD, + 0x0160, 0xA6, + 0x0161, 0xA8, + 0x0178, 0xBE, + 0x017D, 0xB4, + 0x017E, 0xB8, + 0x20AC, 0xA4 +}; + +} + +namespace Msp { +namespace Codecs { + +void Iso885915::Encoder::encode_char(UnicodeChar ch, string &buf) +{ + int tch=transform_mapping_or_direct(mapping, map_size, ch, false); + if(tch<0 || tch>0xFF) + error(ch, buf, "Can't express character in ISO-8859-15"); + else + buf+=tch; + +} + +void Iso885915::Encoder::transliterate(UnicodeChar, string &buf) +{ + buf+='?'; +} + + +UnicodeChar Iso885915::Decoder::decode_char(const string &str, string::const_iterator &i) +{ + if(i==str.end()) + return error("No input"); + + unsigned char ch=*i; + int tch=transform_mapping_or_direct(mapping, map_size, ch, true); + + UnicodeChar result; + if(tch==-1) + result=error("Undefined ISO-8859-15 character"); + else + result=tch; + + ++i; + return result; +} + +} // namespace Codecs +} // namespace Msp diff --git a/source/iso885915.h b/source/iso885915.h new file mode 100644 index 0000000..da26922 --- /dev/null +++ b/source/iso885915.h @@ -0,0 +1,42 @@ +/* $Id$ + +This file is part of libmspstrings +Copyright © 2007 Mikko Rasa +Distributed under the LGPL +*/ + +#ifndef MSP_STRINGS_ISO885915_H_ +#define MSP_STRINGS_ISO885915_H_ + +#include "codec.h" + +namespace Msp { +namespace Codecs { + +class Iso885915: public Codec +{ +public: + class 
Encoder: public Codec::Encoder + { + public: + Encoder(ErrorMode em=THROW_ON_ERROR): Codec::Encoder(em) { } + virtual void encode_char(UnicodeChar, std::string &); + private: + virtual void transliterate(UnicodeChar, std::string &); + }; + + class Decoder: public Codec::Decoder + { + public: + Decoder(ErrorMode em=THROW_ON_ERROR): Codec::Decoder(em) { } + virtual UnicodeChar decode_char(const std::string &, std::string::const_iterator &); + }; + + Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } + Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } +}; + +} // namespace Codecs +} // namespace Msp + +#endif