Add a UTF-16 codec
[libs/core.git] / source / stringcodec / codec.cpp
1 #include "ascii.h"
2 #include "codec.h"
3 #include "iso2022jp.h"
4 #include "iso646fi.h"
5 #include "iso88591.h"
6 #include "iso885915.h"
7 #include "jisx0201.h"
8 #include "jisx0208.h"
9 #include "utf16.h"
10 #include "utf8.h"
11 #include "windows1252.h"
12
13 using namespace std;
14
15 namespace Msp {
16 namespace StringCodec {
17
18 bool Codec::detect(const string &str) const
19 {
20         Decoder *dec = create_decoder(IGNORE_ERRORS);
21
22         bool result = true;
23         for(string::const_iterator i=str.begin(); (result && i!=str.end()); )
24                 result = (dec->decode_char(str, i)!=-1);
25
26         delete dec;
27
28         return result;
29 }
30
31 void Codec::Encoder::encode(const ustring &str, string &buf)
32 {
33         for(ustring::const_iterator i=str.begin(); i!=str.end(); ++i)
34                 encode_char(*i, buf);
35 }
36
37 string Codec::Encoder::encode(const ustring &str)
38 {
39         string buf;
40         encode(str, buf);
41         sync(buf);
42         return buf;
43 }
44
45
46
47 void Codec::Decoder::decode(const string &str, ustring &buf)
48 {
49         for(string::const_iterator i=str.begin(); i!=str.end();)
50         {
51                 unichar c = decode_char(str, i);
52                 if(c!=-1)
53                         buf += c;
54         }
55 }
56
57 ustring Codec::Decoder::decode(const string &str)
58 {
59         ustring buf;
60         decode(str, buf);
61         return buf;
62 }
63
64 Codec *create_codec(const string &n)
65 {
66         string name;
67         string::const_iterator i;
68         for(i=n.begin(); i!=n.end(); ++i)
69         {
70                 if(*i==':')
71                         break;
72                 else if(isupper(*i))
73                         name += tolower(*i);
74                 else if(islower(*i) || isdigit(*i))
75                         name += *i;
76         }
77
78         ErrorMode em = DEFAULT;
79         if(i!=n.end() && *i==':')
80         {
81                 string em_str(i+1, n.end());
82                 if(em_str=="throw")
83                         em = THROW_ON_ERROR;
84                 else if(em_str=="ignore")
85                         em = IGNORE_ERRORS;
86                 else if(em_str=="trans" || em_str=="transliterate")
87                         em = TRANSLITERATE;
88                 else
89                         throw invalid_argument("invalid error mode");
90         }
91
92         if(name=="ascii") return new Ascii(em);
93         if(name=="iso2022jp") return new Iso2022Jp(em);
94         if(name=="iso646fi") return new Iso646Fi(em);
95         if(name=="iso88591" || name=="latin1") return new Iso88591(em);
96         if(name=="iso885915" || name=="latin9") return new Iso885915(em);
97         if(name=="jisx0201") return new JisX0201(em);
98         if(name=="jisx0208") return new JisX0208(em);
99         if(name=="utf8") return new Utf8(em);
100         if(name=="utf16") return new Utf16(em, Utf16::AUTO);
101         if(name=="utf16be") return new Utf16(em, Utf16::BIG);
102         if(name=="utf16le") return new Utf16(em, Utf16::LITTLE);
103         if(name=="windows1252" || name=="cp1252") return new Windows1252(em);
104         throw invalid_argument("unknown string codec");
105 }
106
107 Codec *detect_codec(const string &str)
108 {
109         bool is_utf8 = true;
110         bool is_ascii = true;
111         bool is_latin1 = true;
112         unsigned utf8_mb = 0;
113
114         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
115         {
116                 unsigned char c = *i;
117                 if(c&0x80)
118                 {
119                         is_ascii = false;
120                         if((c&0xC0)==0x80)
121                         {
122                                 if((c&0xE0)==0x80)
123                                         is_latin1 = false;
124                                 if(utf8_mb)
125                                         --utf8_mb;
126                                 else
127                                         is_utf8 = false;
128                         }
129                         else if((c&0xC0)==0xC0)
130                         {
131                                 if(utf8_mb)
132                                 {
133                                         is_utf8 = false;
134                                         utf8_mb = 0;
135                                 }
136                                 else
137                                 {
138                                         for(utf8_mb=1; (c>>(6-utf8_mb))&1; ++utf8_mb) ;
139                                 }
140                         }
141                 }
142                 else if(utf8_mb)
143                 {
144                         is_utf8 = false;
145                         utf8_mb = 0;
146                 }
147         }
148
149         if(is_ascii)
150                 return new Ascii;
151         else if(is_utf8)
152                 return new Utf8;
153         else if(is_latin1)
154                 return new Iso88591;
155         else
156                 return new Windows1252;
157 }
158
159 } // namespace StringCodec
160 } // namespace Msp