]> git.tdb.fi Git - libs/core.git/blob - source/codec.cpp
Make codecs able to tell their name
[libs/core.git] / source / codec.cpp
1 /* $Id$
2
3 This file is part of libmspstrings
4 Copyright © 2006-2007 Mikko Rasa
5 Distributed under the LGPL
6 */
7
8 #include "ascii.h"
9 #include "codec.h"
10 #include "iso2022jp.h"
11 #include "iso646fi.h"
12 #include "iso88591.h"
13 #include "iso885915.h"
14 #include "jisx0201.h"
15 #include "jisx0208.h"
16 #include "utf8.h"
17 #include "windows1252.h"
18
19 using namespace std;
20
21 namespace Msp {
22 namespace Codecs {
23
24 bool Codec::detect(const string &str) const
25 {
26         Decoder *dec=create_decoder();
27         bool result=true;
28         try
29         {
30                 for(string::const_iterator i=str.begin(); i!=str.end(); )
31                         dec->decode_char(str, i);
32         }
33         catch(const CodecError &)
34         {
35                 result=false;
36         }
37
38         delete dec;
39
40         return result;
41 }
42
43 void Codec::Encoder::encode(const ustring &str, string &buf)
44 {
45         for(ustring::const_iterator i=str.begin(); i!=str.end(); ++i)
46                 encode_char(*i, buf);
47 }
48
49 void Codec::Encoder::error(UnicodeChar ch, string &buf, const string &msg)
50 {
51         switch(err_mode)
52         {
53         case TRANSLITERATE:
54                 transliterate(ch, buf);
55         case IGNORE_ERRORS:
56                 break;
57         default:
58                 throw CodecError(msg);
59         }
60 }
61
62
63 void Codec::Decoder::decode(const string &str, ustring &buf)
64 {
65         for(string::const_iterator i=str.begin(); i!=str.end();)
66         {
67                 UnicodeChar c=decode_char(str, i);
68                 if(c!=-1)
69                         buf+=c;
70         }
71 }
72
73 UnicodeChar Codec::Decoder::error(const string &msg)
74 {
75         switch(err_mode)
76         {
77         case TRANSLITERATE:
78                 return 0xFFFE;
79         case IGNORE_ERRORS:
80                 return -1;
81         default:
82                 throw CodecError(msg);
83         }
84 }
85
86 Codec *create_codec(const string &n)
87 {
88         string name;
89         for(string::const_iterator i=n.begin(); i!=n.end(); ++i)
90         {
91                 if(isupper(*i))
92                         name+=tolower(*i);
93                 else if(islower(*i) || isdigit(*i))
94                         name+=*i;
95         }
96
97         if(name=="ascii") return new Ascii;
98         if(name=="iso2022jp") return new Iso2022Jp;
99         if(name=="iso646fi") return new Iso646Fi;
100         if(name=="iso88591" || name=="latin1") return new Iso88591;
101         if(name=="iso885915" || name=="latin9") return new Iso885915;
102         if(name=="jisx0201") return new JisX0201;
103         if(name=="jisx0208") return new JisX0208;
104         if(name=="utf8") return new Utf8;
105         if(name=="windows1252" || name=="cp1252") return new Windows1252;
106         throw InvalidParameterValue("Unknown string codec");
107 }
108
109 Codec *detect_codec(const string &str)
110 {
111         bool is_utf8=true;
112         bool is_ascii=true;
113         bool is_latin1=true;
114         unsigned utf8_mb=0;
115
116         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
117         {
118                 unsigned char c=*i;
119                 if(c&0x80)
120                 {
121                         is_ascii=false;
122                         if((c&0xC0)==0x80)
123                         {
124                                 if((c&0xE0)==0x80)
125                                         is_latin1=false;
126                                 if(utf8_mb)
127                                         --utf8_mb;
128                                 else
129                                         is_utf8=false;
130                         }
131                         else if((c&0xC0)==0xC0)
132                         {
133                                 if(utf8_mb)
134                                         is_utf8=false;
135                                 else
136                                 {
137                                         for(utf8_mb=1; (c>>(6-utf8_mb))&1; ++utf8_mb) ;
138                                 }
139                         }
140                 }
141         }
142
143         if(is_ascii)
144                 return new Ascii;
145         else if(is_utf8)
146                 return new Utf8;
147         else if(is_latin1)
148                 return new Iso88591;
149         else
150                 return new Windows1252;
151 }
152
153 } // namespace Codecs
154 } // namespace Msp