git.tdb.fi Git - libs/core.git/blob - source/utf8.cpp
Add copyright notices and Id tags
[libs/core.git] / source / utf8.cpp
1 /* $Id$
2
3 This file is part of libmspstrings
4 Copyright © 2006-2007 Mikko Rasa
5 Distributed under the LGPL
6 */
7
8 #include "utf8.h"
9
10 using namespace std;
11
12 namespace Msp {
13
14 void Utf8::Encoder::encode_char(wchar_t c)
15 {
16         unsigned code=c;
17         if(code>0x10FFFF)
18         {
19                 error("Can't express character in UTF-8");
20                 return;
21         }
22
23         unsigned bytes=1;
24         if(code>0xFFFF)
25                 bytes=4;
26         else if(code>0x7FF)
27                 bytes=3;
28         else if(code>0x7F)
29                 bytes=2;
30
31         if(bytes==1)
32                 append(code);
33         else
34         {
35                 char buf[4];
36                 
37                 buf[0]=0xFF<<(8-bytes) | code>>(bytes*6-6);
38                 for(unsigned j=bytes-1; j>0; --j)
39                 {
40                         buf[j]=0x80 | code&0x3F;
41                         code>>=6;
42                 }
43
44                 append(buf, bytes);
45         }
46 }
47
48
49 void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
50 {
51         while(i!=str.end())
52         {
53                 if(bytes==0)
54                 {
55                         if((*i&0xC0)==0x80)
56                         {
57                                 error("Invalid UTF-8 string (tail byte when expecting head)");
58                                 ++i;
59                                 break;
60                         }
61                         else if(*i&0x80)
62                         {
63                                 unsigned mask=0x40;
64                                 for(; *i&mask; mask>>=1)
65                                         ++bytes;
66
67                                 if(bytes>3)
68                                 {
69                                         error("Invalid UTF-8 string (overlong multibyte sequence)");
70                                         ++i;
71                                         break;
72                                 }
73                                 else
74                                 {
75                                         code=(*i++)&(mask-1);
76                                         if(!code)
77                                         {
78                                                 error("Invalid UTF-8 string (denormalized multibyte sequence)");
79                                                 break;
80                                         }
81                                 }
82                         }
83                         else
84                         {
85                                 append(*i++);
86                                 break;
87                         }
88                 }
89                 else
90                 {
91                         if((*i&0xC0)!=0x80)
92                         {
93                                 error("Invalid UTF-8 string (head byte when expecting tail)");
94                                 ++i;
95                                 break;
96                         }
97
98                         code=code<<6 | (*i++)&0x3F;
99                         --bytes;
100
101                         if(!bytes)
102                         {
103                                 if(code>0x10FFFF)
104                                         error("Invalid UTF-8 string (character out of range)");
105                                 else
106                                         append(code);
107                                 break;
108                         }
109                 }
110         }
111 }
112
113 void Utf8::Decoder::sync()
114 {
115         if(bytes)
116         {
117                 error("Sync in the middle of multibyte UTF-8 sequence");
118                 bytes=0;
119         }
120 }
121
122 void Utf8::Decoder::reset()
123 {
124         bytes=0;
125 }
126
127 } // namespace Msp