]> git.tdb.fi Git - libs/core.git/blob - source/stringcodec/codec.cpp
Use format instead of stringstream in Regex
[libs/core.git] / source / stringcodec / codec.cpp
1 #include "ascii.h"
2 #include "codec.h"
3 #include "iso2022jp.h"
4 #include "iso646fi.h"
5 #include "iso88591.h"
6 #include "iso885915.h"
7 #include "jisx0201.h"
8 #include "jisx0208.h"
9 #include "utf8.h"
10 #include "windows1252.h"
11
12 using namespace std;
13
14 namespace Msp {
15 namespace StringCodec {
16
17 bool Codec::detect(const string &str) const
18 {
19         Decoder *dec = create_decoder(IGNORE_ERRORS);
20
21         bool result = true;
22         for(string::const_iterator i=str.begin(); (result && i!=str.end()); )
23                 result = (dec->decode_char(str, i)!=-1);
24
25         delete dec;
26
27         return result;
28 }
29
30 void Codec::Encoder::encode(const ustring &str, string &buf)
31 {
32         for(ustring::const_iterator i=str.begin(); i!=str.end(); ++i)
33                 encode_char(*i, buf);
34 }
35
36 string Codec::Encoder::encode(const ustring &str)
37 {
38         string buf;
39         encode(str, buf);
40         sync(buf);
41         return buf;
42 }
43
44
45
46 void Codec::Decoder::decode(const string &str, ustring &buf)
47 {
48         for(string::const_iterator i=str.begin(); i!=str.end();)
49         {
50                 unichar c = decode_char(str, i);
51                 if(c!=-1)
52                         buf += c;
53         }
54 }
55
56 ustring Codec::Decoder::decode(const string &str)
57 {
58         ustring buf;
59         decode(str, buf);
60         return buf;
61 }
62
63 Codec *create_codec(const string &n)
64 {
65         string name;
66         string::const_iterator i;
67         for(i=n.begin(); i!=n.end(); ++i)
68         {
69                 if(*i==':')
70                         break;
71                 else if(isupper(*i))
72                         name += tolower(*i);
73                 else if(islower(*i) || isdigit(*i))
74                         name += *i;
75         }
76
77         ErrorMode em = THROW_ON_ERROR;
78         if(i!=n.end() && *i==':')
79         {
80                 string em_str(i+1, n.end());
81                 if(em_str=="throw")
82                         em = THROW_ON_ERROR;
83                 else if(em_str=="ignore")
84                         em = IGNORE_ERRORS;
85                 else if(em_str=="trans" || em_str=="transliterate")
86                         em = TRANSLITERATE;
87                 else
88                         throw invalid_argument("invalid error mode");
89         }
90
91         if(name=="ascii") return new Ascii(em);
92         if(name=="iso2022jp") return new Iso2022Jp(em);
93         if(name=="iso646fi") return new Iso646Fi(em);
94         if(name=="iso88591" || name=="latin1") return new Iso88591(em);
95         if(name=="iso885915" || name=="latin9") return new Iso885915(em);
96         if(name=="jisx0201") return new JisX0201(em);
97         if(name=="jisx0208") return new JisX0208(em);
98         if(name=="utf8") return new Utf8(em);
99         if(name=="windows1252" || name=="cp1252") return new Windows1252(em);
100         throw invalid_argument("unknown string codec");
101 }
102
103 Codec *detect_codec(const string &str)
104 {
105         bool is_utf8 = true;
106         bool is_ascii = true;
107         bool is_latin1 = true;
108         unsigned utf8_mb = 0;
109
110         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
111         {
112                 unsigned char c = *i;
113                 if(c&0x80)
114                 {
115                         is_ascii = false;
116                         if((c&0xC0)==0x80)
117                         {
118                                 if((c&0xE0)==0x80)
119                                         is_latin1 = false;
120                                 if(utf8_mb)
121                                         --utf8_mb;
122                                 else
123                                         is_utf8 = false;
124                         }
125                         else if((c&0xC0)==0xC0)
126                         {
127                                 if(utf8_mb)
128                                 {
129                                         is_utf8 = false;
130                                         utf8_mb = 0;
131                                 }
132                                 else
133                                 {
134                                         for(utf8_mb=1; (c>>(6-utf8_mb))&1; ++utf8_mb) ;
135                                 }
136                         }
137                 }
138                 else if(utf8_mb)
139                 {
140                         is_utf8 = false;
141                         utf8_mb = 0;
142                 }
143         }
144
145         if(is_ascii)
146                 return new Ascii;
147         else if(is_utf8)
148                 return new Utf8;
149         else if(is_latin1)
150                 return new Iso88591;
151         else
152                 return new Windows1252;
153 }
154
155 } // namespace StringCodec
156 } // namespace Msp