]> git.tdb.fi Git - libs/core.git/blob - source/stringcodec/utf16.cpp
Add a UTF-16 codec
[libs/core.git] / source / stringcodec / utf16.cpp
1 #include "utf16.h"
2
3 using namespace std;
4
5 namespace Msp {
6 namespace StringCodec {
7
8 Utf16::Encoder::Encoder(ErrorMode em, Endian en):
9         Codec::Encoder(em),
10         endian(en==AUTO ? BIG : en),
11         emit_bom(true)
12 { }
13
14 void Utf16::Encoder::encode_char(unichar ch, string &buf)
15 {
16         if(!is_valid_unichar(ch))
17                 return error(ch, buf, invalid_character(ch, "UTF-16"));
18
19         if(emit_bom)
20         {
21                 if(endian==LITTLE)
22                         buf.append("\xFF\xFE");
23                 else
24                         buf.append("\xFE\xFF");
25                 emit_bom = false;
26         }
27
28         bool e = (endian==LITTLE);
29         if(ch<0x10000)
30         {
31                 char utf[2];
32                 utf[e] = ch>>8;
33                 utf[1-e] = ch;
34                 buf.append(utf, 2);
35         }
36         else
37         {
38                 char utf[4];
39                 ch -= 0x10000;
40                 unichar sur = 0xD800+((ch>>10)&0x3FF);
41                 utf[e] = sur>>8;
42                 utf[1-e] = sur;
43                 sur = 0xDC00+(ch&0x3FF);
44                 utf[2+e] = sur>>8;
45                 utf[3-e] = sur;
46                 buf.append(utf, 4);
47         }
48 }
49
50 void Utf16::Encoder::transliterate(unichar, std::string &buf)
51 {
52         if(endian==LITTLE)
53                 buf.append("\xFD\xFF", 2);
54         else
55                 buf.append("\xFF\xFD", 2);
56 }
57
58
59 Utf16::Decoder::Decoder(ErrorMode em, Endian en):
60         Codec::Decoder(em),
61         endian(en)
62 { }
63
64 unichar Utf16::Decoder::decode_char(const string &str, string::const_iterator &i)
65 {
66         if(i==str.end())
67                 return -1;
68
69         string::const_iterator j = i;
70
71         unichar unit = decode_unit(str, i, j);
72         if(unit!=-1)
73         {
74                 if(endian==AUTO)
75                 {
76                         /* Set endian based on the first decoded unit.  If the unit was a BOM,
77                         discard it. */
78                         if(unit==0xFFFE)
79                         {
80                                 endian = LITTLE;
81                                 unit = -1;
82                         }
83                         else
84                         {
85                                 endian = BIG;
86                                 if(unit==0xFEFF)
87                                         unit = -1;
88                         }
89                 }
90
91                 if(unit==-1 && j!=str.end())
92                         unit = decode_unit(str, i, j);
93         }
94
95         unichar result = -1;
96         if(unit!=-1)
97         {
98                 if(unit>=0xD800 && unit<=0xDBFF)
99                 {
100                         string::const_iterator k = j;
101
102                         unichar unit2 = -2;
103                         if(k!=str.end())
104                                 unit2 = decode_unit(str, i, k);
105
106                         if(unit2>=0xDC00 && unit2<=0xDFFF)
107                         {
108                                 j = k;
109                                 result = 0x10000 + ((unit&0x3FF)<<10) + (unit2&0x3FF);
110                         }
111                         else if(unit2!=-1)
112                                 result = error(invalid_sequence(i, j, "incomplete UTF-16 surrogate pair"));
113                 }
114                 else if(unit>=0xDC00 && unit<=0xDFFF)
115                         result = error(invalid_sequence(i, j, "stray UTF-16 trail surrogate"));
116                 else
117                         result = unit;
118         }
119
120         i = j;
121         return result;
122 }
123
124 unichar Utf16::Decoder::decode_unit(const string &str, const string::const_iterator &i, string::const_iterator &j)
125 {
126         unsigned char b1 = *j++;
127         if(j==str.end())
128                 return error(invalid_sequence(i, j, "incomplete UTF-16 character"));
129         unsigned char b2 = *j++;
130
131         if(endian==LITTLE)
132                 return (b2<<8) | b1;
133         else
134                 return (b1<<8) | b2;
135 }
136
137 } // namespace StringCodec
138 } // namespace Msp