// Patch hunk for Utf8::Encoder::encode_char.  Only the opening validation
// step is visible here; the remainder of the body lies outside this hunk.
void Utf8::Encoder::encode_char(unichar ch, string &buf)
{
// Removed: explicit range test (negative or above 0x10FFFF) with a fixed
// message string.
- if(ch<0 || ch>0x10FFFF)
- return error(ch, buf, "Can't express character in UTF-8");
// Added: validation is delegated to is_valid_unichar() and the error value
// is built by invalid_character(ch, "UTF-8").
// NOTE(review): is_valid_unichar() is defined elsewhere; confirm exactly
// which code points it rejects.
+ if(!is_valid_unichar(ch))
+ return error(ch, buf, invalid_character(ch, "UTF-8"));
// Starts at 1; presumably the encoded length, adjusted by the checks that
// follow (only the first of them is visible in this hunk).
unsigned bytes = 1;
if(ch>0xFFFF)
// Patch hunk for Utf8::Encoder::transliterate: appends a fixed three-byte
// sequence to buf.  The unichar argument is unnamed and unused.
// The octal escapes \357\277\275 are the bytes EF BF BD, which is the UTF-8
// encoding of U+FFFD.  The removed and added lines append identical bytes;
// this hunk only corrects the code point named in the trailing comment.
void Utf8::Encoder::transliterate(unichar, string &buf)
{
- buf.append("\357\277\275", 3); // � U+FFFE Replacement Character
+ buf.append("\357\277\275", 3); // � U+FFFD Replacement Character
}
// Patch hunk for Utf8::Decoder::decode_char: the two guard checks performed
// before a multi-byte sequence is decoded.  The function continues beyond
// this hunk.
//   str - string being decoded
//   i   - read position inside str, taken by reference and advanced here
unichar Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
{
// End of input.  Removed: went through error("No input").  Added: returns
// the sentinel -1 directly.
if(i==str.end())
- return error("No input");
+ return -1;
// (*i&0xC0)==0x80 matches the bit pattern 10xxxxxx, i.e. a UTF-8
// continuation (tail) byte.  Finding one here means a tail byte appears
// where a head byte was expected, so the message names a stray tail byte.
if((*i&0xC0)==0x80)
{
- unichar result = error("UTF-8 tail byte found when expecting head");
+ unichar result = error(invalid_sequence(i, i+1, "stray UTF-8 tail byte"));
// Step past the offending byte before handing back the error result.
++i;
return result;
}
// Patch hunk from the middle of a decoder body (apparently the tail of
// Utf8::Decoder::decode_char).  The declarations of result, j, k and bytes
// are outside this view.
// Shift the accumulated value left by six bits and merge in the low six bits
// of the byte at j, then advance j.
result = (result<<6) | ((*j++)&0x3F);
// NOTE(review): presumably k counts the bytes actually consumed and bytes is
// the length announced by the head byte -- confirm against the full function.
if(k<bytes)
- result = error("Incomplete UTF-8 character");
+ result = error(invalid_sequence(i, j, "incomplete UTF-8 character"));
// Rejects a value that has no bits set at or above position bytes*5-4, or
// that fits in seven bits; presumably such a value could have been written
// as a shorter sequence.
else if(!(result>>(bytes*5-4)) || !(result>>7))
- result = error("Denormalized UTF-8 multibyte sequence");
// Removed: explicit test for values above 0x10FFFF or inside 0xD800..0xDFFF.
- else if(result>0x10FFFF || (result>=0xD800 && result<=0xDFFF))
- result = error("Invalid Unicode code point");
+ result = error(invalid_sequence(i, j, "denormalized UTF-8 sequence"));
// Added: code point validity is delegated to is_valid_unichar().
+ else if(!is_valid_unichar(result))
+ result = error(invalid_sequence(i, j, "undefined UTF-8 character"));
// Move the caller's iterator to j, then return the decoded value or the
// error result.
i = j;
return result;