void Ascii::Encoder::encode_char(unichar ch, string &buf)
{
if(ch<0 || ch>0x7F)
- return error(ch, buf, "Can't express character in ASCII");
+ return error(ch, buf, invalid_character(ch, "ASCII")); // ch is outside 0..0x7F: let the error-mode handler decide instead of appending it
buf += ch;
}
return -1;
else if(*i&0x80)
{
- unichar result = error("Undefined ASCII character");
+ unichar result = error(invalid_sequence(i, i+1, "undefined ASCII character"));
++i;
return result;
}
return buf;
}
-void Codec::Encoder::error(unichar ch, string &buf, const string &msg)
-{
- switch(err_mode)
- {
- case TRANSLITERATE:
- transliterate(ch, buf);
- case IGNORE_ERRORS:
- break;
- default:
- throw CodecError(msg);
- }
-}
void Codec::Decoder::decode(const string &str, ustring &buf)
return buf;
}
-unichar Codec::Decoder::error(const string &msg)
-{
- switch(err_mode)
- {
- case TRANSLITERATE:
- return 0xFFFD;
- case IGNORE_ERRORS:
- return -1;
- default:
- throw CodecError(msg);
- }
-}
-
Codec *create_codec(const string &n)
{
string name;
if(name=="jisx0208") return new JisX0208;
if(name=="utf8") return new Utf8;
if(name=="windows1252" || name=="cp1252") return new Windows1252;
- throw InvalidParameterValue("Unknown string codec");
+ throw invalid_argument("unknown string codec");
}
Codec *detect_codec(const string &str)
#define MSP_STRINGCODEC_CODEC_H_
#include <string>
-#include <msp/core/except.h>
+#include "except.h"
#include "ustring.h"
namespace Msp {
TRANSLITERATE
};
-/**
-An exception thrown for all kinds of problems encountered while encoding or
-decoding strings.
-*/
-class CodecError: public Exception
-{
-public:
- CodecError(const std::string &w_): Exception(w_) { }
-};
/**
Base class for string codecs. Use one of the derived classes or the function
protected:
/** Handles an error depending on the error mode.
+ ch is the character that could not be encoded, buf is the output string
+ being built and err is the exception object to throw; its type E is
+ deduced from the argument.
- THROW_ON_ERROR: throws CodecError(msg)
+ THROW_ON_ERROR: throws err
IGNORE_ERRORS: does nothing
TRANSLITERATE: calls transliterate(ch, buf) */
- void error(unichar ch, std::string &buf, const std::string &msg);
+ template<typename E>
+ void error(unichar ch, std::string &buf, const E &err)
+ {
+ if(err_mode==TRANSLITERATE)
+ transliterate(ch, buf);
+ else if(err_mode!=IGNORE_ERRORS) // every mode other than TRANSLITERATE and IGNORE_ERRORS ends up here
+ throw err; // throws a copy of err with static type E
+ }
/** Attempts to produce an alternative encoding for a unicode character.
Typically this includes dropping accent marks or romanizing letters. */
/** Handles an error depending on the error mode. The return value is
suitable for returning from decode_char.
+ err is the exception object to throw; its type E is deduced from the
+ argument.
- THROW_ON_ERROR: throws CodecError(msg)
+ THROW_ON_ERROR: throws err
IGNORE_ERRORS: returns -1
TRANSLITERATE: return 0xFFFD */
- unichar error(const std::string &msg);
+ template<typename E>
+ unichar error(const E &err)
+ {
+ if(err_mode==TRANSLITERATE)
+ return 0xFFFD; // U+FFFD is the Unicode replacement character
+ else if(err_mode==IGNORE_ERRORS)
+ return -1;
+ else // every remaining mode throws
+ throw err; // throws a copy of err with static type E
+ }
};
protected:
--- /dev/null
+#include <msp/strings/format.h>
+#include "except.h"
+
+using namespace std;
+
+namespace Msp {
+namespace StringCodec {
+/* Builds the exception message from the format string
+"invalid character: U+%04X (%s)" with ch and detail, and passes it to
+codec_error. */
+invalid_character::invalid_character(unichar ch, const string &detail):
+ codec_error(format("invalid character: U+%04X (%s)", ch, detail))
+{ }
+
+/* Builds the exception message from the format string
+"invalid sequence: %s (%s)", using format_sequence(begin, end) for the first
+placeholder and detail for the second, and passes it to codec_error. */
+invalid_sequence::invalid_sequence(const string::const_iterator &begin, const string::const_iterator &end, const string &detail):
+ codec_error(format("invalid sequence: %s (%s)", format_sequence(begin, end), detail))
+{ }
+/* Returns a textual dump of the bytes in [begin, end): each byte is converted
+with lexical_cast using a Fmt that requests fill '0', width 2, hex and
+uppercase, and consecutive bytes are separated by a single space. An empty
+range yields an empty string. */
+string invalid_sequence::format_sequence(const string::const_iterator &begin, const string::const_iterator &end)
+{
+ string result;
+ for(string::const_iterator i=begin; i!=end; ++i)
+ {
+ if(!result.empty()) // separator goes between bytes, never before the first one
+ result += ' ';
+ result += lexical_cast(static_cast<unsigned char>(*i), Fmt().fill('0').width(2).hex().uppercase()); // cast to unsigned char first, presumably so bytes >= 0x80 are not formatted as negative values
+ }
+ return result;
+}
+
+} // namespace StringCodec
+} // namespace Msp
--- /dev/null
+#ifndef MSP_STRINGCODEC_EXCEPT_H_
+#define MSP_STRINGCODEC_EXCEPT_H_
+
+#include <stdexcept>
+#include "ustring.h"
+
+namespace Msp {
+namespace StringCodec {
+
+/**
+Base class for codec errors.
+Derives from std::runtime_error; the constructor forwards the message w to it
+unchanged.
+*/
+class codec_error: public std::runtime_error
+{
+public:
+ codec_error(const std::string &w): std::runtime_error(w) { }
+ virtual ~codec_error() throw() { }
+};
+
+
+/**
+Thrown when a codec can't encode the requested character.
+The constructor takes the offending character and a detail string; the
+encoders changed alongside this class pass the name of the target encoding
+(e.g. "ASCII" or "UTF-8") as the detail.
+*/
+class invalid_character: public codec_error
+{
+public:
+ invalid_character(unichar, const std::string &);
+ virtual ~invalid_character() throw() { }
+};
+
+
+/**
+Thrown when a codec encounters a byte sequence it can't decode.
+The constructor takes the iterator range [begin, end) covering the offending
+bytes and a detail string describing what is wrong with them.
+*/
+class invalid_sequence: public codec_error
+{
+public:
+ invalid_sequence(const std::string::const_iterator &, const std::string::const_iterator &, const std::string &);
+ virtual ~invalid_sequence() throw() { }
+
+private:
+ /** Formats the bytes in the given iterator range for the exception message.
+ Static because the constructor calls it while building the argument for the
+ codec_error base; calling a non-static member function before the base
+ class has been initialized is undefined behavior. */
+ static std::string format_sequence(const std::string::const_iterator &, const std::string::const_iterator &);
+};
+
+} // namespace StringCodec
+} // namespace Msp
+
+#endif
{
Kuten jis = ucs_to_jisx0208(ch);
if(!jis)
- return error(ch, buf, "Can't express character in ISO-2022-JP");
+ return error(ch, buf, invalid_character(ch, "ISO-2022-JP"));
if(mode!=JISX0208)
switch_mode(JISX0208, buf);
- char jbuf[2] = {jis.ku+0x20, jis.ten+0x20};
+ char jbuf[2] = { jis.ku+0x20, jis.ten+0x20 };
buf.append(jbuf, 2);
}
}
case ASCII: buf.append("\033(B", 3); break;
case JISX0201: buf.append("\033(J", 3); break;
case JISX0208: buf.append("\033$B", 3); break;
- default: throw CodecError("WTF? Invalid mode in Iso2022Jp::Encoder::switch_mode");
+ default: throw invalid_argument("Iso2022Jp::Encoder::switch_mode");
}
}
else if(dec)
return dec->decode_char(str, i);
else
- throw CodecError("WTF? No sub-decoder for Iso2022Jp::Decoder");
+ throw logic_error("no sub-decoder");
if(result>=0)
return result;
{
int tch = transform_mapping_or_direct(mapping, map_size, ch, false);
if(tch<0 || tch>0x7F)
- error(ch, buf, "Can't express character in ISO-646-FI");
+ error(ch, buf, invalid_character(ch, "ISO-646-FI"));
else
buf += tch;
}
unsigned char ch = *i;
unichar result;
if(ch>=0x80)
- result = error("Undefined ISO-646-FI character");
+ result = error(invalid_sequence(i, i+1, "undefined ISO-646-FI character"));
else
result = transform_mapping_or_direct(mapping, map_size, ch, true);
void Iso88591::Encoder::encode_char(unichar ch, string &buf)
{
if(ch<0 || ch>0xFF)
- return error(ch, buf, "Can't express character in ISO-8859-1");
+ return error(ch, buf, invalid_character(ch, "ISO-8859-1")); // ch is outside 0..0xFF: let the error-mode handler decide instead of appending it
buf += ch;
}
{
int tch = transform_mapping_or_direct(mapping, map_size, ch, false);
if(tch<0 || tch>0xFF)
- error(ch, buf, "Can't express character in ISO-8859-15");
+ error(ch, buf, invalid_character(ch, "ISO-8859-15"));
else
buf += tch;
else if(ch>=0xFF61 && ch<=0xFF9F)
buf += ch-0xFEC0;
else
- error(ch, buf, "Can't express character in JIS X 0201");
+ error(ch, buf, invalid_character(ch, "JIS X 0201"));
}
void JisX0201::Encoder::transliterate(unichar, string &buf)
else if(ch>=0xA1 && ch<=0xDF)
result = ch+0xFEC0;
else
- result = error("Undefined JIS X 0201 character");
+ result = error(invalid_sequence(i, i+1, "undefined JIS X 0201 character"));
++i;
return result;
buf.append(jbuf, 2);
}
else
- error(ucs, buf, "Can't express character in JIS X 0208");
+ error(ucs, buf, invalid_character(ucs, "JIS X 0208"));
}
void JisX0208::Encoder::transliterate(unichar, string &buf)
unichar result;
if(j==str.end())
- result = error("Incomplete JIS X 0208 character");
+ result = error(invalid_sequence(i, j, "incomplete JIS X 0208 character"));
else
{
jis.ten = *j++-0x20;
result = jisx0208_to_ucs(jis);
+ if(result==-1)
+ result = error(invalid_sequence(i, j, "invalid JIS X 0208 ku-ten"));
if(result==0)
- result = error("Undefined JIS X 0208 character");
+ result = error(invalid_sequence(i, j, "undefined JIS X 0208 character"));
}
i = j;
unichar jisx0208_to_ucs(Kuten jis)
{
if(jis.ku==0 || jis.ku>0x5E || jis.ten==0 || jis.ten>0x5E)
- return 0;
+ return -1; // ku or ten outside 1..0x5E: -1 lets callers tell an invalid ku-ten apart from a table entry of 0
return jisx0208_to_ucs_table[jis.ku*94 + jis.ten - 95];
}
void Utf8::Encoder::encode_char(unichar ch, string &buf)
{
if(ch<0 || ch>0x10FFFF)
- return error(ch, buf, "Can't express character in UTF-8");
+ return error(ch, buf, invalid_character(ch, "UTF-8"));
unsigned bytes = 1;
if(ch>0xFFFF)
if((*i&0xC0)==0x80)
{
- unichar result = error("UTF-8 tail byte found when expecting head");
+ unichar result = error(invalid_sequence(i, i+1, "stray UTF-8 tail byte")); // 10xxxxxx is a continuation (tail) byte found where a head byte was expected
++i;
return result;
}
result = (result<<6) | ((*j++)&0x3F);
if(k<bytes)
- result = error("Incomplete UTF-8 character");
+ result = error(invalid_sequence(i, j, "incomplete UTF-8 character"));
else if(!(result>>(bytes*5-4)) || !(result>>7))
- result = error("Denormalized UTF-8 multibyte sequence");
+ result = error(invalid_sequence(i, j, "denormalized UTF-8 sequence"));
else if(!is_valid_unichar(result))
- result = error("Invalid Unicode code point");
+ result = error(invalid_sequence(i, j, "undefined UTF-8 character"));
i = j;
return result;
return;
}
- error(ch, buf, "Can't express character in Windows-1252");
+ error(ch, buf, invalid_character(ch, "Windows-1252"));
}
}
{
result = table[ch-0x80];
if(result==0)
- result = error("Undefined Windows-1252 character");
+ result = error(invalid_sequence(i, i+1, "undefined Windows-1252 character"));
}
else
result = ch;