c6d1990d0b00e6ae36870e8b0af7978f09cdb9fa
[libs/core.git] / source / stringcodec / utf8.cpp
1 #include "utf8.h"
2
3 using namespace std;
4
5 namespace Msp {
6 namespace StringCodec {
7
8 void Utf8::Encoder::encode_char(unichar ch, string &buf)
9 {
10         if(ch<0 || ch>0x10FFFF)
11                 return error(ch, buf, invalid_character(ch, "UTF-8"));
12
13         unsigned bytes = 1;
14         if(ch>0xFFFF)
15                 bytes = 4;
16         else if(ch>0x7FF)
17                 bytes = 3;
18         else if(ch>0x7F)
19                 bytes = 2;
20
21         if(bytes==1)
22                 buf += ch;
23         else
24         {
25                 char utf[4];
26
27                 utf[0] = 0xFF<<(8-bytes) | ch>>(bytes*6-6);
28                 for(unsigned j=bytes-1; j>0; --j)
29                 {
30                         utf[j] = 0x80 | (ch&0x3F);
31                         ch >>= 6;
32                 }
33
34                 buf.append(utf, bytes);
35         }
36 }
37
38 void Utf8::Encoder::transliterate(unichar, string &buf)
39 {
40         buf.append("\357\277\275", 3);  // � U+FFFD Replacement Character
41 }
42
43
44 unichar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
45 {
46         if(i==str.end())
47                 return -1;
48
49         if((*i&0xC0)==0x80)
50         {
51                 unichar result = error(invalid_sequence(i, i+1, "stray UTF-8 head byte"));
52                 ++i;
53                 return result;
54         }
55         else if(*i&0x80)
56         {
57                 unsigned bytes = 2;
58                 unsigned mask = 0x20;
59                 for(; *i&mask; mask>>=1)
60                         ++bytes;
61
62                 string::const_iterator j = i;
63
64                 unichar result = (*j++)&(mask-1);
65
66                 unsigned k;
67                 for(k=1; (k<bytes && j!=str.end() && (*j&0xC0)==0x80); ++k)
68                         result = (result<<6) | ((*j++)&0x3F);
69
70                 if(k<bytes)
71                         result = error(invalid_sequence(i, j, "incomplete UTF-8 character"));
72                 else if(!(result>>(bytes*5-4)) || !(result>>7))
73                         result = error(invalid_sequence(i, j, "denormalized UTF-8 sequence"));
74                 else if(!is_valid_unichar(result))
75                         result = error(invalid_sequence(i, j, "undefined UTF-8 character"));
76
77                 i = j;
78                 return result;
79         }
80         else
81                 return *i++;
82 }
83
84 } // namespace StringCodec
85 } // namespace Msp