]> git.tdb.fi Git - libs/core.git/blob - source/utf8.cpp
Add a function to perform simple character mapping
[libs/core.git] / source / utf8.cpp
1 /* $Id$
2
3 This file is part of libmspstrings
4 Copyright © 2006-2007 Mikko Rasa
5 Distributed under the LGPL
6 */
7
8 #include "utf8.h"
9
10 using namespace std;
11
12 namespace Msp {
13 namespace Codecs {
14
15 void Utf8::Encoder::encode_char(UnicodeChar ch, string &buf)
16 {
17         if(ch<0 || ch>0x10FFFF)
18                 return error(ch, buf, "Can't express character in UTF-8");
19
20         unsigned bytes=1;
21         if(ch>0xFFFF)
22                 bytes=4;
23         else if(ch>0x7FF)
24                 bytes=3;
25         else if(ch>0x7F)
26                 bytes=2;
27
28         if(bytes==1)
29                 buf+=ch;
30         else
31         {
32                 char utf[4];
33
34                 utf[0]=0xFF<<(8-bytes) | ch>>(bytes*6-6);
35                 for(unsigned j=bytes-1; j>0; --j)
36                 {
37                         utf[j]=0x80 | ch&0x3F;
38                         ch>>=6;
39                 }
40
41                 buf.append(utf, bytes);
42         }
43 }
44
45 void Utf8::Encoder::transliterate(UnicodeChar, string &buf)
46 {
47         buf.append("\357\277\275", 3);  // � U+FFFE Replacement Character
48 }
49
50
51 UnicodeChar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
52 {
53         if(i==str.end())
54                 return error("No input");
55
56         if((*i&0xC0)==0x80)
57         {
58                 UnicodeChar result=error("UTF-8 tail byte found when expecting head");
59                 ++i;
60                 return result;
61         }
62         else if(*i&0x80)
63         {
64                 unsigned bytes=2;
65                 unsigned mask=0x20;
66                 for(; *i&mask; mask>>=1)
67                         ++bytes;
68
69                 string::const_iterator j=i;
70
71                 UnicodeChar result=(*j++)&(mask-1);
72
73                 unsigned k;
74                 for(k=1; (k<bytes && j!=str.end()); ++k)
75                 {
76                         if((*j&0xC0)!=0x80)
77                         {
78                                 result=error("Incomplete UTF-8 character");
79                                 i=j;
80                                 return result;
81                         }
82                         result=result<<6 | (*j++)&0x3F;
83                 }
84
85                 if(k<bytes)
86                         result=error("Incomplete UTF-8 character at end of input");
87                 else if(!result>>(bytes*6-6))
88                         result=error("Denormalized UTF-8 multibyte sequence");
89                 else if(result>0x10FFFF)
90                         result=error("Invalid Unicode code point");
91
92                 i=j;
93                 return result;
94         }
95         else
96                 return *i++;
97 }
98
99 } // namespace Codecs
100 } // namespace Msp