git.tdb.fi Git - libs/core.git/blob - source/stringcodec/utf16.cpp
Add move semantics to Variant
[libs/core.git] / source / stringcodec / utf16.cpp
1 #include "utf16.h"
2
3 using namespace std;
4
5 namespace Msp {
6 namespace StringCodec {
7
8 Utf16::Encoder::Encoder(ErrorMode em, Endian en):
9         Codec::Encoder(em),
10         endian(en==AUTO ? BIG : en)
11 { }
12
13 void Utf16::Encoder::encode_char(unichar ch, string &buf)
14 {
15         if(!is_valid_unichar(ch))
16                 return error(ch, buf, invalid_character(ch, "UTF-16"));
17
18         if(emit_bom)
19         {
20                 if(endian==LITTLE)
21                         buf.append("\xFF\xFE");
22                 else
23                         buf.append("\xFE\xFF");
24                 emit_bom = false;
25         }
26
27         bool e = (endian==LITTLE);
28         if(ch<0x10000)
29         {
30                 char utf[2];
31                 utf[e] = ch>>8;
32                 utf[1-e] = ch;
33                 buf.append(utf, 2);
34         }
35         else
36         {
37                 char utf[4];
38                 ch -= 0x10000;
39                 unichar sur = 0xD800+((ch>>10)&0x3FF);
40                 utf[e] = sur>>8;
41                 utf[1-e] = sur;
42                 sur = 0xDC00+(ch&0x3FF);
43                 utf[2+e] = sur>>8;
44                 utf[3-e] = sur;
45                 buf.append(utf, 4);
46         }
47 }
48
49 void Utf16::Encoder::transliterate(unichar, string &buf)
50 {
51         if(endian==LITTLE)
52                 buf.append("\xFD\xFF", 2);
53         else
54                 buf.append("\xFF\xFD", 2);
55 }
56
57
58 Utf16::Decoder::Decoder(ErrorMode em, Endian en):
59         Codec::Decoder(em),
60         endian(en)
61 { }
62
63 unichar Utf16::Decoder::decode_char(const string &str, string::const_iterator &i)
64 {
65         if(i==str.end())
66                 return -1;
67
68         auto j = i;
69
70         unichar unit = decode_unit(str, i, j);
71         if(unit!=-1)
72         {
73                 if(endian==AUTO)
74                 {
75                         /* Set endian based on the first decoded unit.  If the unit was a BOM,
76                         discard it. */
77                         if(unit==0xFFFE)
78                         {
79                                 endian = LITTLE;
80                                 unit = -1;
81                         }
82                         else
83                         {
84                                 endian = BIG;
85                                 if(unit==0xFEFF)
86                                         unit = -1;
87                         }
88                 }
89
90                 if(unit==-1 && j!=str.end())
91                         unit = decode_unit(str, i, j);
92         }
93
94         unichar result = -1;
95         if(unit!=-1)
96         {
97                 if(unit>=0xD800 && unit<=0xDBFF)
98                 {
99                         auto k = j;
100
101                         unichar unit2 = -2;
102                         if(k!=str.end())
103                                 unit2 = decode_unit(str, i, k);
104
105                         if(unit2>=0xDC00 && unit2<=0xDFFF)
106                         {
107                                 j = k;
108                                 result = 0x10000 + ((unit&0x3FF)<<10) + (unit2&0x3FF);
109                         }
110                         else if(unit2!=-1)
111                                 result = error(invalid_sequence(i, j, "incomplete UTF-16 surrogate pair"));
112                 }
113                 else if(unit>=0xDC00 && unit<=0xDFFF)
114                         result = error(invalid_sequence(i, j, "stray UTF-16 trail surrogate"));
115                 else
116                         result = unit;
117         }
118
119         i = j;
120         return result;
121 }
122
123 unichar Utf16::Decoder::decode_unit(const string &str, const string::const_iterator &i, string::const_iterator &j)
124 {
125         unsigned char b1 = *j++;
126         if(j==str.end())
127                 return error(invalid_sequence(i, j, "incomplete UTF-16 character"));
128         unsigned char b2 = *j++;
129
130         if(endian==LITTLE)
131                 return (b2<<8) | b1;
132         else
133                 return (b1<<8) | b2;
134 }
135
136 } // namespace StringCodec
137 } // namespace Msp