]> git.tdb.fi Git - libs/core.git/blob - source/utf8.cpp
f95ad9c27a9ea1b76b86d273432f3c74b21b7fea
[libs/core.git] / source / utf8.cpp
1 #include "utf8.h"
2
3 using namespace std;
4
5 namespace Msp {
6
7 void Utf8::Encoder::encode_char(wchar_t c)
8 {
9         unsigned code=c;
10         if(code>0x10FFFF)
11                 throw CodecError("Can't express character in UTF-8");
12
13         unsigned bytes=1;
14         if(code>0xFFFF)
15                 bytes=4;
16         else if(code>0x7FF)
17                 bytes=3;
18         else if(code>0x7F)
19                 bytes=2;
20
21         if(bytes==1)
22                 append(code);
23         else
24         {
25                 char buf[4];
26                 
27                 buf[0]=0xFF<<(8-bytes) | code>>(bytes*6-6);
28                 for(unsigned j=bytes-1; j>0; --j)
29                 {
30                         buf[j]=0x80 | code&0x3F;
31                         code>>=6;
32                 }
33
34                 append(buf, bytes);
35         }
36 }
37
38
39 void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
40 {
41         while(i!=str.end())
42         {
43                 if(bytes==0)
44                 {
45                         if((*i&0xC0)==0x80)
46                                 throw CodecError("Invalid UTF-8 string (tail byte when expecting head)");
47
48                         else if(*i&0x80)
49                         {
50                                 unsigned mask=0x40;
51                                 for(; *i&mask; mask>>=1)
52                                         ++bytes;
53
54                                 if(bytes>3)
55                                         throw CodecError("Invalid UTF-8 string (overlong multibyte sequence)");
56
57                                 code=(*i++)&(mask-1);
58                                 if(!code)
59                                         throw CodecError("Invalid UTF-8 string (denormalized multibyte sequence)");
60                         }
61                         else
62                         {
63                                 append(*i++);
64                                 break;
65                         }
66                 }
67                 else
68                 {
69                         if((*i&0xC0)!=0x80)
70                                 throw CodecError("Invalid UTF-8 string (head byte when expecting tail)");
71
72                         code=code<<6 | (*i++)&0x3F;
73                         --bytes;
74
75                         if(!bytes)
76                         {
77                                 if(code>0x10FFFF)
78                                         throw CodecError("Invalid UTF-8 string (character out of range)");
79                                 append(code);
80                                 break;
81                         }
82                 }
83         }
84 }
85
86 void Utf8::Decoder::sync()
87 {
88         if(bytes)
89                 throw CodecError("Sync in the middle of multibyte UTF-8 sequence");
90 }
91
92 void Utf8::Decoder::reset()
93 {
94         bytes=0;
95 }
96
97 } // namespace Msp