]> git.tdb.fi Git - libs/core.git/blob - source/utf8.cpp
More sophisticated error handling
[libs/core.git] / source / utf8.cpp
1 #include "utf8.h"
2
3 using namespace std;
4
5 namespace Msp {
6
7 void Utf8::Encoder::encode_char(wchar_t c)
8 {
9         unsigned code=c;
10         if(code>0x10FFFF)
11         {
12                 error("Can't express character in UTF-8");
13                 return;
14         }
15
16         unsigned bytes=1;
17         if(code>0xFFFF)
18                 bytes=4;
19         else if(code>0x7FF)
20                 bytes=3;
21         else if(code>0x7F)
22                 bytes=2;
23
24         if(bytes==1)
25                 append(code);
26         else
27         {
28                 char buf[4];
29                 
30                 buf[0]=0xFF<<(8-bytes) | code>>(bytes*6-6);
31                 for(unsigned j=bytes-1; j>0; --j)
32                 {
33                         buf[j]=0x80 | code&0x3F;
34                         code>>=6;
35                 }
36
37                 append(buf, bytes);
38         }
39 }
40
41
42 void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
43 {
44         while(i!=str.end())
45         {
46                 if(bytes==0)
47                 {
48                         if((*i&0xC0)==0x80)
49                         {
50                                 error("Invalid UTF-8 string (tail byte when expecting head)");
51                                 ++i;
52                                 break;
53                         }
54                         else if(*i&0x80)
55                         {
56                                 unsigned mask=0x40;
57                                 for(; *i&mask; mask>>=1)
58                                         ++bytes;
59
60                                 if(bytes>3)
61                                 {
62                                         error("Invalid UTF-8 string (overlong multibyte sequence)");
63                                         ++i;
64                                         break;
65                                 }
66                                 else
67                                 {
68                                         code=(*i++)&(mask-1);
69                                         if(!code)
70                                         {
71                                                 error("Invalid UTF-8 string (denormalized multibyte sequence)");
72                                                 break;
73                                         }
74                                 }
75                         }
76                         else
77                         {
78                                 append(*i++);
79                                 break;
80                         }
81                 }
82                 else
83                 {
84                         if((*i&0xC0)!=0x80)
85                         {
86                                 error("Invalid UTF-8 string (head byte when expecting tail)");
87                                 ++i;
88                                 break;
89                         }
90
91                         code=code<<6 | (*i++)&0x3F;
92                         --bytes;
93
94                         if(!bytes)
95                         {
96                                 if(code>0x10FFFF)
97                                         error("Invalid UTF-8 string (character out of range)");
98                                 else
99                                         append(code);
100                                 break;
101                         }
102                 }
103         }
104 }
105
106 void Utf8::Decoder::sync()
107 {
108         if(bytes)
109         {
110                 error("Sync in the middle of multibyte UTF-8 sequence");
111                 bytes=0;
112         }
113 }
114
115 void Utf8::Decoder::reset()
116 {
117         bytes=0;
118 }
119
120 } // namespace Msp