source/stringcodec/utf16.cpp

   1 #include "utf16.h"
   2
   3 using namespace std;
   4
   5 namespace Msp {
   6 namespace StringCodec {
   7
   8 Utf16::Encoder::Encoder(ErrorMode em, Endian en):
   9         Codec::Encoder(em),
  10         endian(en==AUTO ? BIG : en),
  11         emit_bom(true)
  12 { }
  13
  14 void Utf16::Encoder::encode_char(unichar ch, string &buf)
  15 {
  16         if(!is_valid_unichar(ch))
  17                 return error(ch, buf, invalid_character(ch, "UTF-16"));
  18
  19         if(emit_bom)
  20         {
  21                 if(endian==LITTLE)
  22                         buf.append("\xFF\xFE");
  23                 else
  24                         buf.append("\xFE\xFF");
  25                 emit_bom = false;
  26         }
  27
  28         bool e = (endian==LITTLE);
  29         if(ch<0x10000)
  30         {
  31                 char utf[2];
  32                 utf[e] = ch>>8;
  33                 utf[1-e] = ch;
  34                 buf.append(utf, 2);
  35         }
  36         else
  37         {
  38                 char utf[4];
  39                 ch -= 0x10000;
  40                 unichar sur = 0xD800+((ch>>10)&0x3FF);
  41                 utf[e] = sur>>8;
  42                 utf[1-e] = sur;
  43                 sur = 0xDC00+(ch&0x3FF);
  44                 utf[2+e] = sur>>8;
  45                 utf[3-e] = sur;
  46                 buf.append(utf, 4);
  47         }
  48 }
  49
  50 void Utf16::Encoder::transliterate(unichar, std::string &buf)
  51 {
  52         if(endian==LITTLE)
  53                 buf.append("\xFD\xFF", 2);
  54         else
  55                 buf.append("\xFF\xFD", 2);
  56 }
  57
  58
  59 Utf16::Decoder::Decoder(ErrorMode em, Endian en):
  60         Codec::Decoder(em),
  61         endian(en)
  62 { }
  63
  64 unichar Utf16::Decoder::decode_char(const string &str, string::const_iterator &i)
  65 {
  66         if(i==str.end())
  67                 return -1;
  68
  69         string::const_iterator j = i;
  70
  71         unichar unit = decode_unit(str, i, j);
  72         if(unit!=-1)
  73         {
  74                 if(endian==AUTO)
  75                 {
  76                         /* Set endian based on the first decoded unit.  If the unit was a BOM,
  77                         discard it. */
  78                         if(unit==0xFFFE)
  79                         {
  80                                 endian = LITTLE;
  81                                 unit = -1;
  82                         }
  83                         else
  84                         {
  85                                 endian = BIG;
  86                                 if(unit==0xFEFF)
  87                                         unit = -1;
  88                         }
  89                 }
  90
  91                 if(unit==-1 && j!=str.end())
  92                         unit = decode_unit(str, i, j);
  93         }
  94
  95         unichar result = -1;
  96         if(unit!=-1)
  97         {
  98                 if(unit>=0xD800 && unit<=0xDBFF)
  99                 {
 100                         string::const_iterator k = j;
 101
 102                         unichar unit2 = -2;
 103                         if(k!=str.end())
 104                                 unit2 = decode_unit(str, i, k);
 105
 106                         if(unit2>=0xDC00 && unit2<=0xDFFF)
 107                         {
 108                                 j = k;
 109                                 result = 0x10000 + ((unit&0x3FF)<<10) + (unit2&0x3FF);
 110                         }
 111                         else if(unit2!=-1)
 112                                 result = error(invalid_sequence(i, j, "incomplete UTF-16 surrogate pair"));
 113                 }
 114                 else if(unit>=0xDC00 && unit<=0xDFFF)
 115                         result = error(invalid_sequence(i, j, "stray UTF-16 trail surrogate"));
 116                 else
 117                         result = unit;
 118         }
 119
 120         i = j;
 121         return result;
 122 }
 123
 124 unichar Utf16::Decoder::decode_unit(const string &str, const string::const_iterator &i, string::const_iterator &j)
 125 {
 126         unsigned char b1 = *j++;
 127         if(j==str.end())
 128                 return error(invalid_sequence(i, j, "incomplete UTF-16 character"));
 129         unsigned char b2 = *j++;
 130
 131         if(endian==LITTLE)
 132                 return (b2<<8) | b1;
 133         else
 134                 return (b1<<8) | b2;
 135 }
 136
 137 } // namespace StringCodec
 138 } // namespace Msp