From fdb0d473cc3e10dc40b8dcc98ebfff1acb91951d Mon Sep 17 00:00:00 2001
From: Mikko Rasa
Date: Tue, 4 Sep 2012 13:15:46 +0300
Subject: [PATCH] Add a UTF-16 codec

---
Review notes: restored the template arguments missing from utf16.h, made
Utf16::get_name() report "UTF-16" for the AUTO byte order, and added
comments.  Blob ids on the index lines are carried over unchanged.

 source/stringcodec/codec.cpp |   4 +
 source/stringcodec/codec.h   |   8 +-
 source/stringcodec/utf16.cpp | 152 +++++++++++++++++++++++++++++++++++
 source/stringcodec/utf16.h   | 101 +++++++++++++++++++++++
 4 files changed, 263 insertions(+), 2 deletions(-)
 create mode 100644 source/stringcodec/utf16.cpp
 create mode 100644 source/stringcodec/utf16.h

diff --git a/source/stringcodec/codec.cpp b/source/stringcodec/codec.cpp
index 865d9c7..21ddf8a 100644
--- a/source/stringcodec/codec.cpp
+++ b/source/stringcodec/codec.cpp
@@ -6,6 +6,7 @@
 #include "iso885915.h"
 #include "jisx0201.h"
 #include "jisx0208.h"
+#include "utf16.h"
 #include "utf8.h"
 #include "windows1252.h"
 
@@ -96,6 +97,9 @@ Codec *create_codec(const string &n)
 	if(name=="jisx0201") return new JisX0201(em);
 	if(name=="jisx0208") return new JisX0208(em);
 	if(name=="utf8") return new Utf8(em);
+	if(name=="utf16") return new Utf16(em, Utf16::AUTO);
+	if(name=="utf16be") return new Utf16(em, Utf16::BIG);
+	if(name=="utf16le") return new Utf16(em, Utf16::LITTLE);
 	if(name=="windows1252" || name=="cp1252") return new Windows1252(em);
 	throw invalid_argument("unknown string codec");
 }
diff --git a/source/stringcodec/codec.h b/source/stringcodec/codec.h
index d0871ff..1b89347 100644
--- a/source/stringcodec/codec.h
+++ b/source/stringcodec/codec.h
@@ -167,12 +167,16 @@ private:
 protected:
 	StandardCodec(ErrorMode em): err_mode(em==DEFAULT ? THROW_ON_ERROR : em) { }
 
+	/** Resolves DEFAULT to the error mode this codec was created with. */
+	ErrorMode get_error_mode(ErrorMode em = DEFAULT) const
+	{ return (em==DEFAULT ? err_mode : em); }
+
 public:
 	virtual Encoder *create_encoder(ErrorMode em = DEFAULT) const
-	{ return new typename C::Encoder(em==DEFAULT ? err_mode : em); }
+	{ return new typename C::Encoder(get_error_mode(em)); }
 
 	virtual Decoder *create_decoder(ErrorMode em = DEFAULT) const
-	{ return new typename C::Decoder(em==DEFAULT ? err_mode : em); }
+	{ return new typename C::Decoder(get_error_mode(em)); }
 };
 
 
diff --git a/source/stringcodec/utf16.cpp b/source/stringcodec/utf16.cpp
new file mode 100644
index 0000000..fed994e
--- /dev/null
+++ b/source/stringcodec/utf16.cpp
@@ -0,0 +1,152 @@
+#include "utf16.h"
+
+using namespace std;
+
+namespace Msp {
+namespace StringCodec {
+
+/* AUTO is resolved to big endian, since output needs a definite byte order.
+NOTE(review): a BOM is written for every endianness, but the decoder discards
+one only in AUTO mode - confirm that explicit BE/LE output should carry it. */
+Utf16::Encoder::Encoder(ErrorMode em, Endian en):
+	Codec::Encoder(em),
+	endian(en==AUTO ? BIG : en),
+	emit_bom(true)
+{ }
+
+/* Appends the UTF-16 form of ch to buf. A byte order mark is written ahead of
+the first character that passes validation. */
+void Utf16::Encoder::encode_char(unichar ch, string &buf)
+{
+	if(!is_valid_unichar(ch))
+		return error(ch, buf, invalid_character(ch, "UTF-16"));
+
+	if(emit_bom)
+	{
+		if(endian==LITTLE)
+			buf.append("\xFF\xFE");
+		else
+			buf.append("\xFE\xFF");
+		emit_bom = false;
+	}
+
+	// Index of the high byte within a code unit: 1 for little endian, 0 for big
+	bool e = (endian==LITTLE);
+	if(ch<0x10000)
+	{
+		char utf[2];
+		utf[e] = ch>>8;
+		utf[1-e] = ch;
+		buf.append(utf, 2);
+	}
+	else
+	{
+		// Split the offset from 0x10000 into two 10-bit halves, one per surrogate
+		char utf[4];
+		ch -= 0x10000;
+		unichar sur = 0xD800+((ch>>10)&0x3FF);
+		utf[e] = sur>>8;
+		utf[1-e] = sur;
+		sur = 0xDC00+(ch&0x3FF);
+		utf[2+e] = sur>>8;
+		utf[3-e] = sur;
+		buf.append(utf, 4);
+	}
+}
+
+/* Appends U+FFFD in the configured byte order. */
+void Utf16::Encoder::transliterate(unichar, std::string &buf)
+{
+	if(endian==LITTLE)
+		buf.append("\xFD\xFF", 2);
+	else
+		buf.append("\xFF\xFD", 2);
+}
+
+
+Utf16::Decoder::Decoder(ErrorMode em, Endian en):
+	Codec::Decoder(em),
+	endian(en)
+{ }
+
+/* Decodes one character starting at i and leaves i after the bytes consumed.
+Returns -1 if i is at the end of str or if no character was produced. */
+unichar Utf16::Decoder::decode_char(const string &str, string::const_iterator &i)
+{
+	if(i==str.end())
+		return -1;
+
+	string::const_iterator j = i;
+
+	unichar unit = decode_unit(str, i, j);
+	if(unit!=-1)
+	{
+		if(endian==AUTO)
+		{
+			/* Set endian based on the first decoded unit. If the unit was a BOM,
+			discard it. */
+			if(unit==0xFFFE)
+			{
+				endian = LITTLE;
+				unit = -1;
+			}
+			else
+			{
+				endian = BIG;
+				if(unit==0xFEFF)
+					unit = -1;
+			}
+		}
+
+		if(unit==-1 && j!=str.end())
+			unit = decode_unit(str, i, j);
+	}
+
+	unichar result = -1;
+	if(unit!=-1)
+	{
+		if(unit>=0xD800 && unit<=0xDBFF)
+		{
+			string::const_iterator k = j;
+
+			// -2 marks a lead surrogate with nothing after it; this is an error too
+			unichar unit2 = -2;
+			if(k!=str.end())
+				unit2 = decode_unit(str, i, k);
+
+			if(unit2>=0xDC00 && unit2<=0xDFFF)
+			{
+				j = k;
+				result = 0x10000 + ((unit&0x3FF)<<10) + (unit2&0x3FF);
+			}
+			else if(unit2!=-1)
+				result = error(invalid_sequence(i, j, "incomplete UTF-16 surrogate pair"));
+		}
+		else if(unit>=0xDC00 && unit<=0xDFFF)
+			result = error(invalid_sequence(i, j, "stray UTF-16 trail surrogate"));
+		else
+			result = unit;
+	}
+
+	i = j;
+	return result;
+}
+
+/* Reads two bytes at j as one code unit and advances j past them. AUTO is read
+as big endian. The caller must ensure j is not at the end of str; i is only
+used for error reporting. */
+unichar Utf16::Decoder::decode_unit(const string &str, const string::const_iterator &i, string::const_iterator &j)
+{
+	unsigned char b1 = *j++;
+	if(j==str.end())
+		return error(invalid_sequence(i, j, "incomplete UTF-16 character"));
+	unsigned char b2 = *j++;
+
+	if(endian==LITTLE)
+		return (b2<<8) | b1;
+	else
+		return (b1<<8) | b2;
+}
+
+} // namespace StringCodec
+} // namespace Msp
diff --git a/source/stringcodec/utf16.h b/source/stringcodec/utf16.h
new file mode 100644
index 0000000..729ab2e
--- /dev/null
+++ b/source/stringcodec/utf16.h
@@ -0,0 +1,101 @@
+#ifndef MSP_STRINGCODEC_UTF16_H_
+#define MSP_STRINGCODEC_UTF16_H_
+
+#include "codec.h"
+
+namespace Msp {
+namespace StringCodec {
+
+/**
+The UTF-16 codec, as specified in the Unicode standard. Both little and big
+endian are supported, as well as autodetection with the BOM. In the absence
+of a BOM, big endian is assumed.
+*/
+class Utf16: public StandardCodec<Utf16>
+{
+public:
+	enum Endian
+	{
+		AUTO,  // detected from the first unit when decoding; big endian when encoding
+		BIG,
+		LITTLE
+	};
+
+	class Encoder: public Codec::Encoder
+	{
+	private:
+		Endian endian;
+		bool emit_bom;
+
+	public:
+		Encoder(ErrorMode em = DEFAULT, Endian en = BIG);
+
+		virtual void encode_char(unichar, std::string &);
+	private:
+		virtual void transliterate(unichar, std::string &);
+	};
+
+	class Decoder: public Codec::Decoder
+	{
+	private:
+		Endian endian;
+
+	public:
+		Decoder(ErrorMode em = DEFAULT, Endian en = AUTO);
+
+		virtual unichar decode_char(const std::string &, std::string::const_iterator &);
+	private:
+		unichar decode_unit(const std::string &, const std::string::const_iterator &, std::string::const_iterator &);
+	};
+
+private:
+	Endian endian;
+
+public:
+	Utf16(ErrorMode em = DEFAULT, Endian en = AUTO):
+		StandardCodec<Utf16>(em),
+		endian(en)
+	{ }
+
+	/** Returns the name matching the configured byte order; plain "UTF-16" is
+	used when the byte order is autodetected. */
+	virtual const char *get_name() const
+	{ return endian==BIG ? "UTF-16-BE" : endian==LITTLE ? "UTF-16-LE" : "UTF-16"; }
+
+	virtual Encoder *create_encoder(ErrorMode em = DEFAULT) const
+	{ return new Encoder(get_error_mode(em), endian); }
+
+	virtual Decoder *create_decoder(ErrorMode em = DEFAULT) const
+	{ return new Decoder(get_error_mode(em), endian); }
+};
+
+
+/**
+A helper template to define the Utf16Be and Utf16Le types.
+*/
+template<Utf16::Endian en>
+class Utf16E: public Utf16
+{
+public:
+	class Encoder: public Utf16::Encoder
+	{
+	public:
+		Encoder(ErrorMode em = DEFAULT): Utf16::Encoder(em, en) { }
+	};
+
+	class Decoder: public Utf16::Decoder
+	{
+	public:
+		Decoder(ErrorMode em = DEFAULT): Utf16::Decoder(em, en) { }
+	};
+
+	Utf16E(ErrorMode em = DEFAULT): Utf16(em, en) { }
+};
+
+typedef Utf16E<Utf16::BIG> Utf16Be;
+typedef Utf16E<Utf16::LITTLE> Utf16Le;
+
+} // namespace StringCodec
+} // namespace Msp
+
+#endif
-- 
2.45.2