From d2118ac101602cfe2d62fb7deb6ef3fcb0fe137b Mon Sep 17 00:00:00 2001 From: Mikko Rasa Date: Thu, 19 Apr 2007 08:55:12 +0000 Subject: [PATCH] More sophisticated error handling A couple of win32 fixes --- source/ascii.cpp | 19 +++++++++++----- source/ascii.h | 8 +++++-- source/codec.cpp | 21 +++++++++++++++++ source/codec.h | 28 ++++++++++++++--------- source/iso2022jp.cpp | 54 +++++++++++++++++++++++++++++--------------- source/iso2022jp.h | 10 ++++---- source/iso646fi.cpp | 9 +++++--- source/iso646fi.h | 8 +++++-- source/jisx0201.cpp | 8 ++++--- source/jisx0201.h | 8 +++++-- source/jisx0208.cpp | 29 ++++++++++++------------ source/jisx0208.h | 8 +++++-- source/latin1.cpp | 9 +++++--- source/latin1.h | 8 +++++-- source/utf8.cpp | 47 ++++++++++++++++++++++++++++---------- source/utf8.h | 9 +++++--- 16 files changed, 195 insertions(+), 88 deletions(-) diff --git a/source/ascii.cpp b/source/ascii.cpp index 10eb794..43f7053 100644 --- a/source/ascii.cpp +++ b/source/ascii.cpp @@ -4,11 +4,14 @@ using namespace std; namespace Msp { -void Ascii::Encoder::encode_char(wchar_t c) +void Ascii::Encoder::encode_char(wchar_t c_) { + // Win32 has typedef unsigned short wchar_t + int c=c_; if(c<0 || c>0x7F) - throw CodecError("Can't express character in ASCII"); - append(c); + error("Can't express character in ASCII"); + else + append(c); } @@ -16,9 +19,13 @@ void Ascii::Decoder::decode_char(const string &str, string::const_iterator &i) { if(i==str.end()) return; - if(*i&0x80) - throw CodecError("Invalid ASCII string (undefined character)"); - append(*i++); + else if(*i&0x80) + { + error("Invalid ASCII string (undefined character)"); + ++i; + } + else + append(*i++); } } // namespace Msp diff --git a/source/ascii.h b/source/ascii.h index 9322a8b..de4013d 100644 --- a/source/ascii.h +++ b/source/ascii.h @@ -11,17 +11,21 @@ public: class Encoder: public StringCodec::Encoder { public: + Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { } void encode_char(wchar_t); + private: + void append_replacement() { append(032); } }; class Decoder: public StringCodec::Decoder { public: + Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { } void decode_char(const std::string &, std::string::const_iterator &); }; - Encoder *create_encoder() const { return new Encoder; } - Decoder *create_decoder() const { return new Decoder; } + Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } + Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } }; } // namespace Msp diff --git a/source/codec.cpp b/source/codec.cpp index efc9814..38ef4ab 100644 --- a/source/codec.cpp +++ b/source/codec.cpp @@ -25,6 +25,7 @@ bool StringCodec::detect(const string &str) const { for(string::const_iterator i=str.begin(); i!=str.end(); ) dec->decode_char(str, i); + dec->sync(); } catch(const CodecError &) { @@ -36,6 +37,26 @@ bool StringCodec::detect(const string &str) const return result; } +void StringCodec::Encoder::error(const string &msg) +{ + switch(err_mode_) + { + case IGNORE_ERRORS: break; + case REPLACE_ERRORS: append_replacement(); break; + default: throw CodecError(msg); + } +} + +void StringCodec::Decoder::error(const string &msg) +{ + switch(err_mode_) + { + case IGNORE_ERRORS: break; + case REPLACE_ERRORS: append(0xFFFD); break; + default: throw CodecError(msg); + } +} + /** Creates a codec for the given encoding. The caller is responsible for deleting the codec when it's no longer needed. diff --git a/source/codec.h b/source/codec.h index 5fb29db..7acd88f 100644 --- a/source/codec.h +++ b/source/codec.h @@ -28,6 +28,13 @@ with it. class StringCodec { public: + enum ErrorMode + { + THROW_ON_ERROR, + IGNORE_ERRORS, + REPLACE_ERRORS + }; + /** Base class for string encoder. Each codec class should contain an Encoder class derived from this. @@ -71,11 +78,14 @@ public: virtual ~Encoder() { } protected: - Encoder() { } + Encoder(ErrorMode em=THROW_ON_ERROR): err_mode_(em) { } void append(char c) { buffer_+=c; } void append(const char *s, unsigned l) { buffer_.append(s, l); } void append(const std::string &s) { buffer_+=s; } + void error(const std::string &); + virtual void append_replacement() { } private: + ErrorMode err_mode_; std::string buffer_; }; @@ -92,30 +102,26 @@ public: /** Ensures that all input has been processed. If this is not the case any - buffers are cleared and an exception is thrown, + buffers are cleared and an error is triggered. */ virtual void sync() { } - /** - Resets the decoder, clearing a possibly erroneus state. Does not flush - the internal buffer. - */ - virtual void reset() { } - const std::wstring &get() const { return buffer_; } unsigned size() const { return buffer_.size(); } void flush() { buffer_.clear(); } virtual ~Decoder() { } protected: - Decoder() { } + Decoder(ErrorMode em): err_mode_(em) { } void append(wchar_t c) { buffer_+=c; } void append(const std::wstring &s) { buffer_+=s; } + void error(const std::string &); private: + ErrorMode err_mode_; std::wstring buffer_; }; - virtual Encoder *create_encoder() const =0; - virtual Decoder *create_decoder() const =0; + virtual Encoder *create_encoder(ErrorMode =THROW_ON_ERROR) const =0; + virtual Decoder *create_decoder(ErrorMode =THROW_ON_ERROR) const =0; virtual bool detect(const std::string &) const; virtual ~StringCodec() { } protected: diff --git a/source/iso2022jp.cpp b/source/iso2022jp.cpp index ab31456..5a51a64 100644 --- a/source/iso2022jp.cpp +++ b/source/iso2022jp.cpp @@ -7,8 +7,11 @@ using namespace std; namespace Msp { -void Iso2022Jp::Encoder::encode_char(wchar_t c) +void Iso2022Jp::Encoder::encode_char(wchar_t c_) { + // Win32 has typedef unsigned short wchar_t + int c=c_; + if(c>=0 && c<=0x7F && c!=0x5C && c!=0x7E) { if(mode!=ASCII && mode!=JISX0201) @@ -34,12 +37,15 @@ void Iso2022Jp::Encoder::encode_char(wchar_t c) { unsigned short jis=ucs_to_jisx0208(c); if(!jis) - throw CodecError("Can't express character in ISO-2022-JP"); - if(mode!=JISX0208) - switch_mode(JISX0208); + error("Can't express character in ISO-2022-JP"); + else + { + if(mode!=JISX0208) + switch_mode(JISX0208); - char buf[2]={jis>>8, jis}; - append(buf, 2); + char buf[2]={jis>>8, jis}; + append(buf, 2); + } } } @@ -60,7 +66,15 @@ void Iso2022Jp::Encoder::switch_mode(Mode m) } } -Iso2022Jp::Decoder::Decoder(): +void Iso2022Jp::Encoder::append_replacement() +{ + if(mode!=ASCII) + switch_mode(ASCII); + append(032); +} + +Iso2022Jp::Decoder::Decoder(ErrorMode em): + StringCodec::Decoder(em), mode(ASCII), dec(new Ascii::Decoder), escape(0) @@ -81,7 +95,7 @@ void Iso2022Jp::Decoder::decode_char(const string &str, string::const_iterator & case 0x1B284A: switch_mode(JISX0201); break; // ESC ( J case 0x1B2440: // ESC $ @ case 0x1B2442: switch_mode(JISX0208); break; // ESC $ B - default: throw CodecError("Invalid ISO-2022-JP escape sequence"); + default: error("Invalid ISO-2022-JP escape sequence"); } escape=0; } @@ -103,17 +117,21 @@ void Iso2022Jp::Decoder::decode_char(const string &str, string::const_iterator & void Iso2022Jp::Decoder::sync() { if(escape) - throw CodecError("Sync in middle of ISO-2022-JP escape sequence"); + { + error("Sync in middle of ISO-2022-JP escape sequence"); + escape=0; + } + if(mode!=ASCII) - throw CodecError("Sync while not in ASCII mode"); - append(dec->get()); - dec->flush(); -} - -void Iso2022Jp::Decoder::reset() -{ - switch_mode(ASCII); - escape=0; + { + error("Sync while not in ASCII mode"); + switch_mode(ASCII); + } + else + { + append(dec->get()); + dec->flush(); + } } void Iso2022Jp::Decoder::switch_mode(Mode m) diff --git a/source/iso2022jp.h b/source/iso2022jp.h index 8ded0d8..82803b6 100644 --- a/source/iso2022jp.h +++ b/source/iso2022jp.h @@ -18,22 +18,22 @@ public: class Encoder: public StringCodec::Encoder { public: - Encoder(): mode(ASCII) { } + Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em), mode(ASCII) { } void encode_char(wchar_t); void sync(); private: Mode mode; void switch_mode(Mode); + void append_replacement(); }; class Decoder: public StringCodec::Decoder { public: - Decoder(); + Decoder(ErrorMode =THROW_ON_ERROR); void decode_char(const std::string &, std::string::const_iterator &); void sync(); - void reset(); private: Mode mode; StringCodec::Decoder *dec; @@ -42,8 +42,8 @@ public: void switch_mode(Mode); }; - Encoder *create_encoder() const { return new Encoder; } - Decoder *create_decoder() const { return new Decoder; } + Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } + Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } }; } // namespace Msp diff --git a/source/iso646fi.cpp b/source/iso646fi.cpp index 45c29ff..171c0a0 100644 --- a/source/iso646fi.cpp +++ b/source/iso646fi.cpp @@ -4,8 +4,11 @@ using namespace std; namespace Msp { -void Iso646Fi::Encoder::encode_char(wchar_t c) +void Iso646Fi::Encoder::encode_char(wchar_t c_) { + // Win32 has typedef unsigned short wchar_t + int c=c_; + if((c>=0 && c<=0x5A) || c==0x5F || (c>=0x61 && c<=0x7A)) append(c); else if(c==0xC4) @@ -27,7 +30,7 @@ void Iso646Fi::Encoder::encode_char(wchar_t c) else if(c==0xFC) append(0x7E); else - throw CodecError("Can't express character in ISO-646-FI"); + error("Can't express character in ISO-646-FI"); } void Iso646Fi::Decoder::decode_char(const string &str, string::const_iterator &i) @@ -57,7 +60,7 @@ void Iso646Fi::Decoder::decode_char(const string &str, string::const_iterator &i else if(c<=0x7F) append(c); else - throw CodecError("Invalid ISO-646-FI string (undefined character)"); + error("Invalid ISO-646-FI string (undefined character)"); } } diff --git a/source/iso646fi.h b/source/iso646fi.h index 885015a..1f0df91 100644 --- a/source/iso646fi.h +++ b/source/iso646fi.h @@ -11,17 +11,21 @@ public: class Encoder: public StringCodec::Encoder { public: + Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { } void encode_char(wchar_t); + private: + void append_replacement() { append(032); } }; class Decoder: public StringCodec::Decoder { public: + Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { } void decode_char(const std::string &, std::string::const_iterator &); }; - Encoder *create_encoder() const { return new Encoder; } - Decoder *create_decoder() const { return new Decoder; } + Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } + Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } }; } // namespace Msp diff --git a/source/jisx0201.cpp b/source/jisx0201.cpp index 0a88047..65abff2 100644 --- a/source/jisx0201.cpp +++ b/source/jisx0201.cpp @@ -4,8 +4,10 @@ using namespace std; namespace Msp { -void JisX0201::Encoder::encode_char(wchar_t c) +void JisX0201::Encoder::encode_char(wchar_t c_) { + // Win32 has typedef unsigned short wchar_t + int c=c_; if(c>=0 && c<=0x7F && c!=0x5C && c!=0x7E) append(c); else if(c==0xA5) @@ -15,7 +17,7 @@ void JisX0201::Encoder::encode_char(wchar_t c) else if(c>=0xFF61 && c<=0xFF9F) append(c-0xFEC0); else - throw CodecError("Can't express character in JIS X 0201"); + error("Can't express character in JIS X 0201"); } void JisX0201::Decoder::decode_char(const string &str, string::const_iterator &i) @@ -33,7 +35,7 @@ void JisX0201::Decoder::decode_char(const string &str, string::const_iterator &i else if(c>=0xA1 && c<=0xDF) append(c+0xFEC0); else - throw CodecError("Invalid JIS X 0201 string (undefined character)"); + error("Invalid JIS X 0201 string (undefined character)"); } } // namespace Msp diff --git a/source/jisx0201.h b/source/jisx0201.h index 976a945..bf513f7 100644 --- a/source/jisx0201.h +++ b/source/jisx0201.h @@ -11,17 +11,21 @@ public: class Encoder: public StringCodec::Encoder { public: + Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { } void encode_char(wchar_t); + private: + void append_replacement() { append(032); } }; class Decoder: public StringCodec::Decoder { public: + Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { } void decode_char(const std::string &, std::string::const_iterator &); }; - Encoder *create_encoder() const { return new Encoder; } - Decoder *create_decoder() const { return new Decoder; } + Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } + Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } }; } // namespace Msp diff --git a/source/jisx0208.cpp b/source/jisx0208.cpp index 4d46f70..e24673b 100644 --- a/source/jisx0208.cpp +++ b/source/jisx0208.cpp @@ -9,10 +9,13 @@ namespace Msp { void JisX0208::Encoder::encode_char(wchar_t ucs) { unsigned short jis=ucs_to_jisx0208(ucs); - if(!jis) throw CodecError("Can't express character in JIS X 0208"); - - char buf[2]={jis>>8, jis}; - append(buf, 2); + if(jis) + { + char buf[2]={jis>>8, jis}; + append(buf, 2); + } + else + error("Can't express character in JIS X 0208"); } @@ -30,21 +33,19 @@ void JisX0208::Decoder::decode_char(const string &str, string::const_iterator &i wchar_t ucs=jisx0208_to_ucs(high<<8 | *i++); high=0; - if(!ucs) - throw CodecError("Invalid JIS X 0208 string (undefined character)"); - - append(ucs); + if(ucs) + append(ucs); + else + error("Invalid JIS X 0208 string (undefined character)"); } void JisX0208::Decoder::sync() { if(high) - throw CodecError("Sync in middle of JIS X 0208 character"); -} - -void JisX0208::Decoder::reset() -{ - high=0; + { + error("Sync in middle of JIS X 0208 character"); + high=0; + } } wchar_t jisx0208_to_ucs(unsigned short jis) diff --git a/source/jisx0208.h b/source/jisx0208.h index 16f8202..12dc2c0 100644 --- a/source/jisx0208.h +++ b/source/jisx0208.h @@ -11,12 +11,16 @@ public: class Encoder: public StringCodec::Encoder { public: + Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { } void encode_char(wchar_t); + private: + void append_replacement() { append("!)"); } }; class Decoder: public StringCodec::Decoder { public: + Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em), high(0) { } void decode_char(const std::string &, std::string::const_iterator &); void sync(); void reset(); @@ -24,8 +28,8 @@ public: char high; }; - Encoder *create_encoder() const { return new Encoder; } - Decoder *create_decoder() const { return new Decoder; } + Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } + Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } }; extern wchar_t jisx0208_to_ucs(unsigned short); diff --git a/source/latin1.cpp b/source/latin1.cpp index fe9e1a4..319429f 100644 --- a/source/latin1.cpp +++ b/source/latin1.cpp @@ -4,11 +4,14 @@ using namespace std; namespace Msp { -void Latin1::Encoder::encode_char(wchar_t c) +void Latin1::Encoder::encode_char(wchar_t c_) { + // Win32 has typedef unsigned short wchar_t + int c=c_; if(c<0 || c>0xFF) - throw CodecError("Can't express character in Latin-1"); - append(c); + error("Can't express character in Latin-1"); + else + append(c); } void Latin1::Decoder::decode_char(const string &str, string::const_iterator &i) diff --git a/source/latin1.h b/source/latin1.h index 20eb426..12d880c 100644 --- a/source/latin1.h +++ b/source/latin1.h @@ -11,17 +11,21 @@ public: class Encoder: public StringCodec::Encoder { public: + Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { } void encode_char(wchar_t); + private: + void append_replacement() { append(032); } }; class Decoder: public StringCodec::Decoder { public: + Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { } void decode_char(const std::string &, std::string::const_iterator &); }; - Encoder *create_encoder() const { return new Encoder; } - Decoder *create_decoder() const { return new Decoder; } + Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } + Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } }; } // namespace Msp diff --git a/source/utf8.cpp b/source/utf8.cpp index f95ad9c..030406d 100644 --- a/source/utf8.cpp +++ b/source/utf8.cpp @@ -8,7 +8,10 @@ void Utf8::Encoder::encode_char(wchar_t c) { unsigned code=c; if(code>0x10FFFF) - throw CodecError("Can't express character in UTF-8"); + { + error("Can't express character in UTF-8"); + return; + } unsigned bytes=1; if(code>0xFFFF) @@ -43,8 +46,11 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) if(bytes==0) { if((*i&0xC0)==0x80) - throw CodecError("Invalid UTF-8 string (tail byte when expecting head)"); - + { + error("Invalid UTF-8 string (tail byte when expecting head)"); + ++i; + break; + } else if(*i&0x80) { unsigned mask=0x40; @@ -52,11 +58,20 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) ++bytes; if(bytes>3) - throw CodecError("Invalid UTF-8 string (overlong multibyte sequence)"); - - code=(*i++)&(mask-1); - if(!code) - throw CodecError("Invalid UTF-8 string (denormalized multibyte sequence)"); + { + error("Invalid UTF-8 string (overlong multibyte sequence)"); + ++i; + break; + } + else + { + code=(*i++)&(mask-1); + if(!code) + { + error("Invalid UTF-8 string (denormalized multibyte sequence)"); + break; + } + } } else { @@ -67,7 +82,11 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) else { if((*i&0xC0)!=0x80) - throw CodecError("Invalid UTF-8 string (head byte when expecting tail)"); + { + error("Invalid UTF-8 string (head byte when expecting tail)"); + ++i; + break; + } code=code<<6 | (*i++)&0x3F; --bytes; @@ -75,8 +94,9 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) if(!bytes) { if(code>0x10FFFF) - throw CodecError("Invalid UTF-8 string (character out of range)"); - append(code); + error("Invalid UTF-8 string (character out of range)"); + else + append(code); break; } } @@ -86,7 +106,10 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i) void Utf8::Decoder::sync() { if(bytes) - throw CodecError("Sync in the middle of multibyte UTF-8 sequence"); + { + error("Sync in the middle of multibyte UTF-8 sequence"); + bytes=0; + } } void Utf8::Decoder::reset() diff --git a/source/utf8.h b/source/utf8.h index 0475523..40b4fb8 100644 --- a/source/utf8.h +++ b/source/utf8.h @@ -11,13 +11,16 @@ public: class Encoder: public StringCodec::Encoder { public: + Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { } void encode_char(wchar_t); + private: + void append_replacement() { append("\357\277\275"); } }; class Decoder: public StringCodec::Decoder { public: - Decoder(): bytes(0), code(0) { } + Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em), bytes(0), code(0) { } void decode_char(const std::string &, std::string::const_iterator &); void sync(); void reset(); @@ -26,8 +29,8 @@ public: unsigned code; }; - Encoder *create_encoder() const { return new Encoder; } - Decoder *create_decoder() const { return new Decoder; } + Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); } + Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); } }; } // namespace Msp -- 2.45.2