Add a UTF-16 codec

author Mikko Rasa <tdb@tdb.fi>

Tue, 4 Sep 2012 10:15:46 +0000 (13:15 +0300)

committer Mikko Rasa <tdb@tdb.fi>

Tue, 4 Sep 2012 10:15:46 +0000 (13:15 +0300)
author Mikko Rasa <tdb@tdb.fi>
Tue, 4 Sep 2012 10:15:46 +0000 (13:15 +0300)
committer Mikko Rasa <tdb@tdb.fi>
Tue, 4 Sep 2012 10:15:46 +0000 (13:15 +0300)
diff --git a/source/stringcodec/codec.cpp b/source/stringcodec/codec.cpp

index 865d9c7de1cca44e64e64c0dccab8ad49c5a8c0b..21ddf8a7307ff8071551f4eb0473dfff48f05898 100644 (file)
--- a/source/stringcodec/codec.cpp
+++ b/source/stringcodec/codec.cpp
@@ -6,6 +6,7 @@
  #include "iso885915.h"
  #include "jisx0201.h"
  #include "jisx0208.h"
+#include "utf16.h"
  #include "utf8.h"
  #include "windows1252.h"
  
@@ -96,6 +97,9 @@ Codec *create_codec(const string &n)
         if(name=="jisx0201") return new JisX0201(em);
         if(name=="jisx0208") return new JisX0208(em);
         if(name=="utf8") return new Utf8(em);
+       if(name=="utf16") return new Utf16(em, Utf16::AUTO);
+       if(name=="utf16be") return new Utf16(em, Utf16::BIG);
+       if(name=="utf16le") return new Utf16(em, Utf16::LITTLE);
         if(name=="windows1252" || name=="cp1252") return new Windows1252(em);
         throw invalid_argument("unknown string codec");
  }
diff --git a/source/stringcodec/codec.h b/source/stringcodec/codec.h

index d0871ff119ab563ea2ff000ece006c55577ddf48..1b893470c6e9462993d10953e36952b4fc08b6d1 100644 (file)
--- a/source/stringcodec/codec.h
+++ b/source/stringcodec/codec.h
@@ -167,12 +167,15 @@ private:
  protected:
         StandardCodec(ErrorMode em): err_mode(em==DEFAULT ? THROW_ON_ERROR : em) { }
  
+       ErrorMode get_error_mode(ErrorMode em = DEFAULT) const
+       { return (em==DEFAULT ? err_mode : em); }
+
  public:
         virtual Encoder *create_encoder(ErrorMode em = DEFAULT) const
-       { return new typename C::Encoder(em==DEFAULT ? err_mode : em); }
+       { return new typename C::Encoder(get_error_mode(em)); }
  
         virtual Decoder *create_decoder(ErrorMode em = DEFAULT) const
-       { return new typename C::Decoder(em==DEFAULT ? err_mode : em); }
+       { return new typename C::Decoder(get_error_mode(em)); }
  };
  
  
diff --git a/source/stringcodec/utf16.cpp b/source/stringcodec/utf16.cpp

new file mode 100644 (file)

index 0000000..fed994e
--- /dev/null
+++ b/source/stringcodec/utf16.cpp
@@ -0,0 +1,138 @@
+#include "utf16.h"
+
+using namespace std;
+
+namespace Msp {
+namespace StringCodec {
+
+Utf16::Encoder::Encoder(ErrorMode em, Endian en):
+       Codec::Encoder(em),
+       endian(en!=AUTO ? en : BIG),  // an encoder has to commit to a byte order; AUTO becomes big endian
+       emit_bom(true)  // encode_char writes the BOM once, then clears this flag
+{ }
+
+void Utf16::Encoder::encode_char(unichar ch, string &buf)
+{
+       /* Write the BOM before anything else, so that output produced by error()
+       (presumably via transliterate() - confirm) cannot end up ahead of it. */
+       if(emit_bom)
+       {
+               buf.append(endian==LITTLE ? "\xFF\xFE" : "\xFE\xFF", 2);
+               emit_bom = false;
+       }
+
+       if(!is_valid_unichar(ch))
+               return error(ch, buf, invalid_character(ch, "UTF-16"));
+
+       // e is the index of the high byte within a unit: 0 for big endian, 1 for little.
+       bool e = (endian==LITTLE);
+       if(ch<0x10000)
+       {
+               char utf[2];  // a single 16-bit unit
+               utf[e] = ch>>8;
+               utf[1-e] = ch;
+               buf.append(utf, 2);
+       }
+       else
+       {
+               char utf[4];  // a surrogate pair: lead unit followed by trail unit
+               ch -= 0x10000;
+               unichar sur = 0xD800+((ch>>10)&0x3FF);  // lead surrogate carries the upper 10 bits
+               utf[e] = sur>>8;
+               utf[1-e] = sur;
+               sur = 0xDC00+(ch&0x3FF);  // trail surrogate carries the lower 10 bits
+               utf[2+e] = sur>>8;
+               utf[3-e] = sur;
+               buf.append(utf, 4);
+       }
+}
+
+void Utf16::Encoder::transliterate(unichar, std::string &buf)
+{
+       /* Substitute U+FFFD for the character that could not be encoded, written
+       in the encoder's byte order. */
+       const char *replacement = (endian==LITTLE ? "\xFD\xFF" : "\xFF\xFD");
+       buf.append(replacement, 2);
+}
+
+
+Utf16::Decoder::Decoder(ErrorMode em, Endian en):
+       Codec::Decoder(em),
+       endian(en)  // AUTO is stored as-is; decode_char resolves it from the first decoded unit
+{ }
+
+unichar Utf16::Decoder::decode_char(const string &str, string::const_iterator &i)
+{  // Decodes the character starting at i and moves i past the bytes that were consumed.
+       if(i==str.end())
+               return -1;  // nothing left to decode
+
+       string::const_iterator j = i;  // working cursor; i itself is only updated at the very end
+
+       unichar unit = decode_unit(str, i, j);
+       if(unit!=-1)
+       {
+               if(endian==AUTO)
+               {
+                       /* Set endian based on the first decoded unit.  If the unit was a BOM,
+                       discard it.  (Units are read big endian while endian is still AUTO.) */
+                       if(unit==0xFFFE)  // byte-swapped BOM: the data is little endian
+                       {
+                               endian = LITTLE;
+                               unit = -1;
+                       }
+                       else
+                       {
+                               endian = BIG;  // either a BOM in big endian order or no BOM at all
+                               if(unit==0xFEFF)
+                                       unit = -1;
+                       }
+               }
+
+               if(unit==-1 && j!=str.end())  // a BOM was dropped above; fetch the unit after it
+                       unit = decode_unit(str, i, j);
+       }
+
+       unichar result = -1;
+       if(unit!=-1)
+       {
+               if(unit>=0xD800 && unit<=0xDBFF)  // lead surrogate: a trail surrogate has to follow
+               {
+                       string::const_iterator k = j;  // look-ahead cursor, copied to j only on success
+
+                       unichar unit2 = -2;  // -2 marks "input ended after the lead", distinct from -1
+                       if(k!=str.end())
+                               unit2 = decode_unit(str, i, k);
+
+                       if(unit2>=0xDC00 && unit2<=0xDFFF)
+                       {
+                               j = k;
+                               result = 0x10000 + ((unit&0x3FF)<<10) + (unit2&0x3FF);  // 10 bits from each unit
+                       }
+                       else if(unit2!=-1)  // -1 can only come from error() in decode_unit; result stays -1
+                               result = error(invalid_sequence(i, j, "incomplete UTF-16 surrogate pair"));
+               }
+               else if(unit>=0xDC00 && unit<=0xDFFF)
+                       result = error(invalid_sequence(i, j, "stray UTF-16 trail surrogate"));
+               else
+                       result = unit;
+       }
+
+       i = j;  // on a failed pair j still points after the lead unit, so the next unit is re-read
+       return result;
+}
+
+unichar Utf16::Decoder::decode_unit(const string &str, const string::const_iterator &i, string::const_iterator &j)
+{
+       // decode_char checks j!=str.end() before every call; a lone trailing byte is an error.
+       const unsigned char first = *j++;
+       if(j==str.end())
+               return error(invalid_sequence(i, j, "incomplete UTF-16 character"));
+       const unsigned char second = *j++;
+
+       // Anything other than LITTLE (that is, BIG or a still unresolved AUTO) reads big endian.
+       const bool little = (endian==LITTLE);
+       return ((little ? second : first)<<8) | (little ? first : second);
+}
+
+} // namespace StringCodec
+} // namespace Msp
diff --git a/source/stringcodec/utf16.h b/source/stringcodec/utf16.h

new file mode 100644 (file)

index 0000000..729ab2e
--- /dev/null
+++ b/source/stringcodec/utf16.h
@@ -0,0 +1,99 @@
+#ifndef MSP_STRINGCODEC_UTF16_H_
+#define MSP_STRINGCODEC_UTF16_H_
+
+#include "codec.h"
+
+namespace Msp {
+namespace StringCodec {
+
+/**
+The UTF-16 codec, as specified in the Unicode standard.  Both little and big
+endian are supported, as well as autodetection with the BOM.  In the absence
+of a BOM, big endian is assumed; encoders created in AUTO mode write big endian.
+*/
+class Utf16: public StandardCodec<Utf16>
+{
+public:
+       enum Endian
+       {
+               AUTO,  // decoder: detect from the BOM, else big endian; encoder: big endian
+               BIG,
+               LITTLE
+       };
+
+       class Encoder: public Codec::Encoder
+       {
+       private:
+               Endian endian;  // never AUTO; the constructor maps AUTO to BIG
+               bool emit_bom;  // true until the BOM has been written
+
+       public:
+               Encoder(ErrorMode em = DEFAULT, Endian en = BIG);
+
+               virtual void encode_char(unichar, std::string &);
+       private:
+               virtual void transliterate(unichar, std::string &);
+       };
+
+       class Decoder: public Codec::Decoder
+       {
+       private:
+               Endian endian;  // may start as AUTO; decode_char resolves it on the first unit
+
+       public:
+               Decoder(ErrorMode em = DEFAULT, Endian en = AUTO);
+
+               virtual unichar decode_char(const std::string &, std::string::const_iterator &);
+       private:
+               unichar decode_unit(const std::string &, const std::string::const_iterator &, std::string::const_iterator &);
+       };
+
+private:
+       Endian endian;  // byte order handed to every encoder and decoder this codec creates
+
+public:
+       Utf16(ErrorMode em = DEFAULT, Endian en = AUTO):
+               StandardCodec<Utf16>(em),
+               endian(en)
+       { }
+
+       virtual const char *get_name() const  // AUTO reports plain "UTF-16" instead of being labelled little endian
+       { return endian==BIG ? "UTF-16-BE" : (endian==LITTLE ? "UTF-16-LE" : "UTF-16"); }
+
+       virtual Encoder *create_encoder(ErrorMode em = DEFAULT) const  // caller owns the returned object
+       { return new Encoder(get_error_mode(em), endian); }
+
+       virtual Decoder *create_decoder(ErrorMode em = DEFAULT) const  // caller owns the returned object
+       { return new Decoder(get_error_mode(em), endian); }
+};
+
+
+/**
+A helper template to define the Utf16Be and Utf16Le types.  The template argument en is the byte order passed to the codec and to its nested Encoder and Decoder.
+*/
+template<Utf16::Endian en>
+class Utf16E: public Utf16
+{
+public:
+       class Encoder: public Utf16::Encoder
+       {
+       public:
+               Encoder(ErrorMode em = DEFAULT): Utf16::Encoder(em, en) { }  // byte order comes from the template argument
+       };
+
+       class Decoder: public Utf16::Decoder
+       {
+       public:
+               Decoder(ErrorMode em = DEFAULT): Utf16::Decoder(em, en) { }  // byte order comes from the template argument
+       };
+
+       Utf16E(ErrorMode em = DEFAULT): Utf16(em, en) { }  // create_encoder/create_decoder are inherited from Utf16
+};
+
+typedef Utf16E<Utf16::BIG> Utf16Be;
+typedef Utf16E<Utf16::LITTLE> Utf16Le;
+
+} // namespace StringCodec
+} // namespace Msp
+
+#endif
author	Mikko Rasa <tdb@tdb.fi>
	Tue, 4 Sep 2012 10:15:46 +0000 (13:15 +0300)
committer	Mikko Rasa <tdb@tdb.fi>
	Tue, 4 Sep 2012 10:15:46 +0000 (13:15 +0300)
source/stringcodec/codec.cpp		patch \| blob \| history
source/stringcodec/codec.h		patch \| blob \| history
source/stringcodec/utf16.cpp	[new file with mode: 0644]	patch \| blob
source/stringcodec/utf16.h	[new file with mode: 0644]	patch \| blob