namespace Msp {
-void Ascii::Encoder::encode_char(wchar_t c)
+/**
+Encodes one character as a single ASCII byte. A character outside the range
+0..0x7F is reported through error() and is not appended.
+*/
+void Ascii::Encoder::encode_char(wchar_t c_)
{
+ // Win32 has typedef unsigned short wchar_t; copy into an int so the
+ // range test below is done on a signed value.
+ int c=c_;
if(c<0 || c>0x7F)
- throw CodecError("Can't express character in ASCII");
- append(c);
+ error("Can't express character in ASCII");
+ else
+ append(c);
}
{
if(i==str.end())
return;
- if(*i&0x80)
- throw CodecError("Invalid ASCII string (undefined character)");
- append(*i++);
+ else if(*i&0x80)
+ {
+ error("Invalid ASCII string (undefined character)");
+ ++i;
+ }
+ else
+ append(*i++);
}
} // namespace Msp
class Encoder: public StringCodec::Encoder
{
public:
+ Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
void encode_char(wchar_t);
+ private:
+ void append_replacement() { append(032); }
};
class Decoder: public StringCodec::Decoder
{
public:
+ Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { }
void decode_char(const std::string &, std::string::const_iterator &);
};
- Encoder *create_encoder() const { return new Encoder; }
- Decoder *create_decoder() const { return new Decoder; }
+ Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+ Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
};
} // namespace Msp
{
for(string::const_iterator i=str.begin(); i!=str.end(); )
dec->decode_char(str, i);
+ dec->sync();
}
catch(const CodecError &)
{
return result;
}
+/**
+Handles an encoding error according to the encoder's error mode: does nothing
+for IGNORE_ERRORS, calls append_replacement() for REPLACE_ERRORS and throws a
+CodecError carrying msg for every other mode.
+*/
+void StringCodec::Encoder::error(const string &msg)
+{
+ if(err_mode_==IGNORE_ERRORS)
+ return;
+
+ if(err_mode_==REPLACE_ERRORS)
+ {
+ append_replacement();
+ return;
+ }
+
+ throw CodecError(msg);
+}
+
+/**
+Handles a decoding error according to the decoder's error mode: does nothing
+for IGNORE_ERRORS, appends U+FFFD for REPLACE_ERRORS and throws a CodecError
+carrying msg for every other mode.
+*/
+void StringCodec::Decoder::error(const string &msg)
+{
+ if(err_mode_==IGNORE_ERRORS)
+ return;
+
+ if(err_mode_==REPLACE_ERRORS)
+ {
+ append(0xFFFD);
+ return;
+ }
+
+ throw CodecError(msg);
+}
+
/**
Creates a codec for the given encoding. The caller is responsible for deleting
the codec when it's no longer needed.
class StringCodec
{
public:
+ /**
+ Selects how an encoder or decoder reacts when error() is called for input
+ it cannot convert.
+ */
+ enum ErrorMode
+ {
+ THROW_ON_ERROR=0, // error() throws a CodecError
+ IGNORE_ERRORS=1, // error() does nothing
+ REPLACE_ERRORS=2 // error() appends a replacement character
+ };
+
/**
Base class for string encoder. Each codec class should contain an Encoder
class derived from this.
virtual ~Encoder() { }
protected:
- Encoder() { }
+ Encoder(ErrorMode em=THROW_ON_ERROR): err_mode_(em) { }
void append(char c) { buffer_+=c; }
void append(const char *s, unsigned l) { buffer_.append(s, l); }
void append(const std::string &s) { buffer_+=s; }
+ void error(const std::string &);
+ virtual void append_replacement() { }
private:
+ ErrorMode err_mode_;
std::string buffer_;
};
/**
Ensures that all input has been processed. If this is not the case any
- buffers are cleared and an exception is thrown,
+ buffers are cleared and an error is triggered.
*/
virtual void sync() { }
- /**
- Resets the decoder, clearing a possibly erroneus state. Does not flush
- the internal buffer.
- */
- virtual void reset() { }
-
const std::wstring &get() const { return buffer_; }
unsigned size() const { return buffer_.size(); }
void flush() { buffer_.clear(); }
virtual ~Decoder() { }
protected:
- Decoder() { }
+ // The default argument keeps Decoder() usable by existing subclasses and
+ // mirrors the Encoder constructor.
+ Decoder(ErrorMode em=THROW_ON_ERROR): err_mode_(em) { }
void append(wchar_t c) { buffer_+=c; }
void append(const std::wstring &s) { buffer_+=s; }
+ void error(const std::string &);
private:
+ ErrorMode err_mode_;
std::wstring buffer_;
};
- virtual Encoder *create_encoder() const =0;
- virtual Decoder *create_decoder() const =0;
+ virtual Encoder *create_encoder(ErrorMode =THROW_ON_ERROR) const =0;
+ virtual Decoder *create_decoder(ErrorMode =THROW_ON_ERROR) const =0;
virtual bool detect(const std::string &) const;
virtual ~StringCodec() { }
protected:
namespace Msp {
-void Iso2022Jp::Encoder::encode_char(wchar_t c)
+void Iso2022Jp::Encoder::encode_char(wchar_t c_)
{
+ // Win32 has typedef unsigned short wchar_t
+ int c=c_;
+
if(c>=0 && c<=0x7F && c!=0x5C && c!=0x7E)
{
if(mode!=ASCII && mode!=JISX0201)
{
unsigned short jis=ucs_to_jisx0208(c);
if(!jis)
- throw CodecError("Can't express character in ISO-2022-JP");
- if(mode!=JISX0208)
- switch_mode(JISX0208);
+ error("Can't express character in ISO-2022-JP");
+ else
+ {
+ if(mode!=JISX0208)
+ switch_mode(JISX0208);
- char buf[2]={jis>>8, jis};
- append(buf, 2);
+ char buf[2]={jis>>8, jis};
+ append(buf, 2);
+ }
}
}
}
}
-Iso2022Jp::Decoder::Decoder():
+/**
+Appends the replacement byte (octal 032) in ASCII mode, switching to ASCII
+first if another mode is currently active.
+*/
+void Iso2022Jp::Encoder::append_replacement()
+{
+ const bool in_ascii=(mode==ASCII);
+ if(!in_ascii)
+ switch_mode(ASCII);
+
+ append(032);
+}
+
+Iso2022Jp::Decoder::Decoder(ErrorMode em):
+ StringCodec::Decoder(em),
mode(ASCII),
dec(new Ascii::Decoder),
escape(0)
case 0x1B284A: switch_mode(JISX0201); break; // ESC ( J
case 0x1B2440: // ESC $ @
case 0x1B2442: switch_mode(JISX0208); break; // ESC $ B
- default: throw CodecError("Invalid ISO-2022-JP escape sequence");
+ default: error("Invalid ISO-2022-JP escape sequence");
}
escape=0;
}
+/**
+Finishes a decoding run. A pending escape sequence or a mode other than ASCII
+is reported through error() and the corresponding state is reset (escape
+cleared, mode switched back to ASCII). When the decoder is already in ASCII
+mode, the sub-decoder's output is moved into this decoder's buffer.
+*/
void Iso2022Jp::Decoder::sync()
{
if(escape)
- throw CodecError("Sync in middle of ISO-2022-JP escape sequence");
+ {
+ error("Sync in middle of ISO-2022-JP escape sequence");
+ // Only reached when error() did not throw; drop the partial escape.
+ escape=0;
+ }
+
if(mode!=ASCII)
- throw CodecError("Sync while not in ASCII mode");
- append(dec->get());
- dec->flush();
-}
-
-void Iso2022Jp::Decoder::reset()
-{
- switch_mode(ASCII);
- escape=0;
+ {
+ error("Sync while not in ASCII mode");
+ // Only reached when error() did not throw; return to the initial mode.
+ switch_mode(ASCII);
+ }
+ else
+ {
+ // Collect what the sub-decoder produced, then empty its buffer.
+ append(dec->get());
+ dec->flush();
+ }
}
void Iso2022Jp::Decoder::switch_mode(Mode m)
class Encoder: public StringCodec::Encoder
{
public:
- Encoder(): mode(ASCII) { }
+ Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em), mode(ASCII) { }
void encode_char(wchar_t);
void sync();
private:
Mode mode;
void switch_mode(Mode);
+ void append_replacement();
};
class Decoder: public StringCodec::Decoder
{
public:
- Decoder();
+ Decoder(ErrorMode =THROW_ON_ERROR);
void decode_char(const std::string &, std::string::const_iterator &);
void sync();
- void reset();
private:
Mode mode;
StringCodec::Decoder *dec;
void switch_mode(Mode);
};
- Encoder *create_encoder() const { return new Encoder; }
- Decoder *create_decoder() const { return new Decoder; }
+ Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+ Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
};
} // namespace Msp
namespace Msp {
-void Iso646Fi::Encoder::encode_char(wchar_t c)
+void Iso646Fi::Encoder::encode_char(wchar_t c_)
{
+ // Win32 has typedef unsigned short wchar_t
+ int c=c_;
+
if((c>=0 && c<=0x5A) || c==0x5F || (c>=0x61 && c<=0x7A))
append(c);
else if(c==0xC4)
else if(c==0xFC)
append(0x7E);
else
- throw CodecError("Can't express character in ISO-646-FI");
+ error("Can't express character in ISO-646-FI");
}
void Iso646Fi::Decoder::decode_char(const string &str, string::const_iterator &i)
else if(c<=0x7F)
append(c);
else
- throw CodecError("Invalid ISO-646-FI string (undefined character)");
+ error("Invalid ISO-646-FI string (undefined character)");
}
}
class Encoder: public StringCodec::Encoder
{
public:
+ Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
void encode_char(wchar_t);
+ private:
+ void append_replacement() { append(032); }
};
class Decoder: public StringCodec::Decoder
{
public:
+ Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { }
void decode_char(const std::string &, std::string::const_iterator &);
};
- Encoder *create_encoder() const { return new Encoder; }
- Decoder *create_decoder() const { return new Decoder; }
+ Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+ Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
};
} // namespace Msp
namespace Msp {
-void JisX0201::Encoder::encode_char(wchar_t c)
+void JisX0201::Encoder::encode_char(wchar_t c_)
{
+ // Win32 has typedef unsigned short wchar_t
+ int c=c_;
if(c>=0 && c<=0x7F && c!=0x5C && c!=0x7E)
append(c);
else if(c==0xA5)
else if(c>=0xFF61 && c<=0xFF9F)
append(c-0xFEC0);
else
- throw CodecError("Can't express character in JIS X 0201");
+ error("Can't express character in JIS X 0201");
}
void JisX0201::Decoder::decode_char(const string &str, string::const_iterator &i)
else if(c>=0xA1 && c<=0xDF)
append(c+0xFEC0);
else
- throw CodecError("Invalid JIS X 0201 string (undefined character)");
+ error("Invalid JIS X 0201 string (undefined character)");
}
} // namespace Msp
class Encoder: public StringCodec::Encoder
{
public:
+ Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
void encode_char(wchar_t);
+ private:
+ void append_replacement() { append(032); }
};
class Decoder: public StringCodec::Decoder
{
public:
+ Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { }
void decode_char(const std::string &, std::string::const_iterator &);
};
- Encoder *create_encoder() const { return new Encoder; }
- Decoder *create_decoder() const { return new Decoder; }
+ Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+ Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
};
} // namespace Msp
+/**
+Encodes one character as a two-byte JIS X 0208 code, high byte first. A
+character for which ucs_to_jisx0208() returns 0 is reported through error().
+*/
void JisX0208::Encoder::encode_char(wchar_t ucs)
{
unsigned short jis=ucs_to_jisx0208(ucs);
- if(!jis) throw CodecError("Can't express character in JIS X 0208");
-
- char buf[2]={jis>>8, jis};
- append(buf, 2);
+ if(jis)
+ {
+ // Explicit casts: converting int to char inside braces is a narrowing
+ // conversion, which C++11 and later reject.
+ char buf[2]={static_cast<char>(jis>>8), static_cast<char>(jis)};
+ append(buf, 2);
+ }
+ else
+ error("Can't express character in JIS X 0208");
}
wchar_t ucs=jisx0208_to_ucs(high<<8 | *i++);
high=0;
- if(!ucs)
- throw CodecError("Invalid JIS X 0208 string (undefined character)");
-
- append(ucs);
+ if(ucs)
+ append(ucs);
+ else
+ error("Invalid JIS X 0208 string (undefined character)");
}
+/**
+Finishes a decoding run. If only the first byte of a two-byte character has
+been seen, this is reported through error() and the pending byte is dropped.
+*/
void JisX0208::Decoder::sync()
{
if(high)
- throw CodecError("Sync in middle of JIS X 0208 character");
-}
-
-void JisX0208::Decoder::reset()
-{
- high=0;
+ {
+ error("Sync in middle of JIS X 0208 character");
+ // Only reached when error() did not throw; forget the stored high byte.
+ high=0;
+ }
}
wchar_t jisx0208_to_ucs(unsigned short jis)
class Encoder: public StringCodec::Encoder
{
public:
+ Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
void encode_char(wchar_t);
+ private:
+ void append_replacement() { append("!)"); }
};
class Decoder: public StringCodec::Decoder
{
public:
+ Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em), high(0) { }
void decode_char(const std::string &, std::string::const_iterator &);
void sync();
- void reset();
char high;
};
- Encoder *create_encoder() const { return new Encoder; }
- Decoder *create_decoder() const { return new Decoder; }
+ Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+ Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
};
extern wchar_t jisx0208_to_ucs(unsigned short);
namespace Msp {
-void Latin1::Encoder::encode_char(wchar_t c)
+/**
+Encodes one character as a single Latin-1 byte. A character outside the range
+0..0xFF is reported through error() and is not appended.
+*/
+void Latin1::Encoder::encode_char(wchar_t c_)
{
+ // Win32 has typedef unsigned short wchar_t; copy into an int so the
+ // range test below is done on a signed value.
+ int c=c_;
if(c<0 || c>0xFF)
- throw CodecError("Can't express character in Latin-1");
- append(c);
+ error("Can't express character in Latin-1");
+ else
+ append(c);
}
void Latin1::Decoder::decode_char(const string &str, string::const_iterator &i)
class Encoder: public StringCodec::Encoder
{
public:
+ Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
void encode_char(wchar_t);
+ private:
+ void append_replacement() { append(032); }
};
class Decoder: public StringCodec::Decoder
{
public:
+ Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { }
void decode_char(const std::string &, std::string::const_iterator &);
};
- Encoder *create_encoder() const { return new Encoder; }
- Decoder *create_decoder() const { return new Decoder; }
+ Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+ Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
};
} // namespace Msp
{
unsigned code=c;
if(code>0x10FFFF)
- throw CodecError("Can't express character in UTF-8");
+ {
+ error("Can't express character in UTF-8");
+ return;
+ }
unsigned bytes=1;
if(code>0xFFFF)
if(bytes==0)
{
if((*i&0xC0)==0x80)
- throw CodecError("Invalid UTF-8 string (tail byte when expecting head)");
-
+ {
+ error("Invalid UTF-8 string (tail byte when expecting head)");
+ ++i;
+ break;
+ }
else if(*i&0x80)
{
unsigned mask=0x40;
++bytes;
if(bytes>3)
- throw CodecError("Invalid UTF-8 string (overlong multibyte sequence)");
-
- code=(*i++)&(mask-1);
- if(!code)
- throw CodecError("Invalid UTF-8 string (denormalized multibyte sequence)");
+ {
+ error("Invalid UTF-8 string (overlong multibyte sequence)");
+ // Only reached when error() did not throw. Go back to expecting a
+ // head byte; otherwise the following bytes would be merged with a
+ // stale code value.
+ bytes=0;
+ ++i;
+ break;
+ }
+ else
+ {
+ code=(*i++)&(mask-1);
+ if(!code)
+ {
+ // NOTE(review): bytes stays set here, so the tail bytes of this
+ // sequence are still consumed and decoded - confirm that is intended.
+ error("Invalid UTF-8 string (denormalized multibyte sequence)");
+ break;
+ }
+ }
}
else
{
else
{
if((*i&0xC0)!=0x80)
- throw CodecError("Invalid UTF-8 string (head byte when expecting tail)");
+ {
+ error("Invalid UTF-8 string (head byte when expecting tail)");
+ // Only reached when error() did not throw. Abandon the unfinished
+ // sequence and leave i on the current byte, so that it is decoded as
+ // a head byte on the next pass instead of being lost. Without the
+ // reset every later head byte would be rejected as well.
+ bytes=0;
+ break;
+ }
code=code<<6 | (*i++)&0x3F;
--bytes;
if(!bytes)
{
if(code>0x10FFFF)
- throw CodecError("Invalid UTF-8 string (character out of range)");
- append(code);
+ error("Invalid UTF-8 string (character out of range)");
+ else
+ append(code);
break;
}
}
+/**
+Finishes a decoding run. If tail bytes of a multibyte sequence are still
+expected, this is reported through error() and the unfinished sequence is
+abandoned.
+*/
void Utf8::Decoder::sync()
{
if(bytes)
- throw CodecError("Sync in the middle of multibyte UTF-8 sequence");
+ {
+ error("Sync in the middle of multibyte UTF-8 sequence");
+ // Only reached when error() did not throw; expect a head byte next.
+ bytes=0;
+ }
}
void Utf8::Decoder::reset()
class Encoder: public StringCodec::Encoder
{
public:
+ Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
void encode_char(wchar_t);
+ private:
+ void append_replacement() { append("\357\277\275"); }
};
class Decoder: public StringCodec::Decoder
{
public:
- Decoder(): bytes(0), code(0) { }
+ Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em), bytes(0), code(0) { }
void decode_char(const std::string &, std::string::const_iterator &);
void sync();
void reset();
unsigned code;
};
- Encoder *create_encoder() const { return new Encoder; }
- Decoder *create_decoder() const { return new Decoder; }
+ Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+ Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
};
} // namespace Msp