More sophisticated error handling
author Mikko Rasa <tdb@tdb.fi>
Thu, 19 Apr 2007 08:55:12 +0000 (08:55 +0000)
committer Mikko Rasa <tdb@tdb.fi>
Thu, 19 Apr 2007 08:55:12 +0000 (08:55 +0000)
A couple of win32 fixes

16 files changed:
source/ascii.cpp
source/ascii.h
source/codec.cpp
source/codec.h
source/iso2022jp.cpp
source/iso2022jp.h
source/iso646fi.cpp
source/iso646fi.h
source/jisx0201.cpp
source/jisx0201.h
source/jisx0208.cpp
source/jisx0208.h
source/latin1.cpp
source/latin1.h
source/utf8.cpp
source/utf8.h

index 10eb7949b713d14e49d621b568cd205760a21aa7..43f7053173a21e1ac5cc3b79bc3cdad3def303de 100644 (file)
@@ -4,11 +4,14 @@ using namespace std;
 
 namespace Msp {
 
-void Ascii::Encoder::encode_char(wchar_t c)
+void Ascii::Encoder::encode_char(wchar_t c_)
 {
+       // Win32 has typedef unsigned short wchar_t
+       int c=c_;
        if(c<0 || c>0x7F)
-               throw CodecError("Can't express character in ASCII");
-       append(c);
+               error("Can't express character in ASCII");
+       else
+               append(c);
 }
 
 
@@ -16,9 +19,13 @@ void Ascii::Decoder::decode_char(const string &str, string::const_iterator &i)
 {
        if(i==str.end())
                return;
-       if(*i&0x80)
-               throw CodecError("Invalid ASCII string (undefined character)");
-       append(*i++);
+       else if(*i&0x80)
+       {
+               error("Invalid ASCII string (undefined character)");
+               ++i;
+       }
+       else
+               append(*i++);
 }
 
 } // namespace Msp
index 9322a8b0db1c4b904f93a19a8037ef773783622d..de4013d9c45b5c04c51aab6bc9a4ce767c718209 100644 (file)
@@ -11,17 +11,21 @@ public:
        class Encoder: public StringCodec::Encoder
        {
        public:
+               Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
                void encode_char(wchar_t);
+       private:
+               void append_replacement() { append(032); }
        };
 
        class Decoder: public StringCodec::Decoder
        {
        public:
+               Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { }
                void decode_char(const std::string &, std::string::const_iterator &);
        };
 
-       Encoder *create_encoder() const { return new Encoder; }
-       Decoder *create_decoder() const { return new Decoder; }
+       Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+       Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Msp
index efc981492f18a08d766eb0c39932548adecfd8c5..38ef4ab932d3a3760038f0c6f0ec4bc3bf1626f8 100644 (file)
@@ -25,6 +25,7 @@ bool StringCodec::detect(const string &str) const
        {
                for(string::const_iterator i=str.begin(); i!=str.end(); )
                        dec->decode_char(str, i);
+               dec->sync();
        }
        catch(const CodecError &)
        {
@@ -36,6 +37,26 @@ bool StringCodec::detect(const string &str) const
        return result;
 }
 
+void StringCodec::Encoder::error(const string &msg)
+{
+       switch(err_mode_)
+       {
+       case IGNORE_ERRORS: break;
+       case REPLACE_ERRORS: append_replacement(); break;
+       default: throw CodecError(msg);
+       }
+}
+
+void StringCodec::Decoder::error(const string &msg)
+{
+       switch(err_mode_)
+       {
+       case IGNORE_ERRORS: break;
+       case REPLACE_ERRORS: append(0xFFFD); break;
+       default: throw CodecError(msg);
+       }
+}
+
 /**
 Creates a codec for the given encoding.  The caller is responsible for deleting
 the codec when it's no longer needed.
index 5fb29db29b305577582e6ed56c58f3bda61d823c..7acd88f89008e6e50b72333f4437172907dd7b55 100644 (file)
@@ -28,6 +28,13 @@ with it.
 class StringCodec
 {
 public:
+       enum ErrorMode
+       {
+               THROW_ON_ERROR,
+               IGNORE_ERRORS,
+               REPLACE_ERRORS
+       };
+
        /**
        Base class for string encoder.  Each codec class should contain an Encoder
        class derived from this.
@@ -71,11 +78,14 @@ public:
 
                virtual ~Encoder() { }
        protected:
-               Encoder() { }
+               Encoder(ErrorMode em=THROW_ON_ERROR): err_mode_(em) { }
                void append(char c) { buffer_+=c; }
                void append(const char *s, unsigned l) { buffer_.append(s, l); }
                void append(const std::string &s) { buffer_+=s; }
+               void error(const std::string &);
+               virtual void append_replacement() { }
        private:
+               ErrorMode err_mode_;
                std::string buffer_;
        };
 
@@ -92,30 +102,26 @@ public:
 
                /**
                Ensures that all input has been processed.  If this is not the case any
-               buffers are cleared and an exception is thrown,
+               buffers are cleared and an error is triggered.
                */
                virtual void sync() { }
 
-               /**
-               Resets the decoder, clearing a possibly erroneus state.  Does not flush
-               the internal buffer.
-               */
-               virtual void reset() { }
-
                const std::wstring &get() const { return buffer_; }
                unsigned size() const { return buffer_.size(); }
                void flush() { buffer_.clear(); }
                virtual ~Decoder() { }
        protected:
-               Decoder() { }
+               Decoder(ErrorMode em): err_mode_(em) { }
                void append(wchar_t c) { buffer_+=c; }
                void append(const std::wstring &s) { buffer_+=s; }
+               void error(const std::string &);
        private:
+               ErrorMode err_mode_;
                std::wstring buffer_;
        };
 
-       virtual Encoder *create_encoder() const =0;
-       virtual Decoder *create_decoder() const =0;
+       virtual Encoder *create_encoder(ErrorMode =THROW_ON_ERROR) const =0;
+       virtual Decoder *create_decoder(ErrorMode =THROW_ON_ERROR) const =0;
        virtual bool    detect(const std::string &) const;
        virtual ~StringCodec() { }
 protected:
index ab31456245b449c861a4c69b98dab62af77d4f40..5a51a648b3e9e5012154aa6195ebda2cb95412b1 100644 (file)
@@ -7,8 +7,11 @@ using namespace std;
 
 namespace Msp {
 
-void Iso2022Jp::Encoder::encode_char(wchar_t c)
+void Iso2022Jp::Encoder::encode_char(wchar_t c_)
 {
+       // Win32 has typedef unsigned short wchar_t
+       int c=c_;
+
        if(c>=0 && c<=0x7F && c!=0x5C && c!=0x7E)
        {
                if(mode!=ASCII && mode!=JISX0201)
@@ -34,12 +37,15 @@ void Iso2022Jp::Encoder::encode_char(wchar_t c)
        {
                unsigned short jis=ucs_to_jisx0208(c);
                if(!jis)
-                       throw CodecError("Can't express character in ISO-2022-JP");
-               if(mode!=JISX0208)
-                       switch_mode(JISX0208);
+                       error("Can't express character in ISO-2022-JP");
+               else
+               {
+                       if(mode!=JISX0208)
+                               switch_mode(JISX0208);
 
-               char buf[2]={jis>>8, jis};
-               append(buf, 2);
+                       char buf[2]={jis>>8, jis};
+                       append(buf, 2);
+               }
        }
 }
 
@@ -60,7 +66,15 @@ void Iso2022Jp::Encoder::switch_mode(Mode m)
        }
 }
 
-Iso2022Jp::Decoder::Decoder():
+void Iso2022Jp::Encoder::append_replacement()
+{
+       if(mode!=ASCII)
+               switch_mode(ASCII);
+       append(032);
+}
+
+Iso2022Jp::Decoder::Decoder(ErrorMode em):
+       StringCodec::Decoder(em),
        mode(ASCII),
        dec(new Ascii::Decoder),
        escape(0)
@@ -81,7 +95,7 @@ void Iso2022Jp::Decoder::decode_char(const string &str, string::const_iterator &
                                case 0x1B284A: switch_mode(JISX0201); break; // ESC ( J
                                case 0x1B2440:                               // ESC $ @
                                case 0x1B2442: switch_mode(JISX0208); break; // ESC $ B
-                               default: throw CodecError("Invalid ISO-2022-JP escape sequence");
+                               default: error("Invalid ISO-2022-JP escape sequence");
                                }
                                escape=0;
                        }
@@ -103,17 +117,21 @@ void Iso2022Jp::Decoder::decode_char(const string &str, string::const_iterator &
 void Iso2022Jp::Decoder::sync()
 {
        if(escape)
-               throw CodecError("Sync in middle of ISO-2022-JP escape sequence");
+       {
+               error("Sync in middle of ISO-2022-JP escape sequence");
+               escape=0;
+       }
+       
        if(mode!=ASCII)
-               throw CodecError("Sync while not in ASCII mode");
-       append(dec->get());
-       dec->flush();
-}
-
-void Iso2022Jp::Decoder::reset()
-{
-       switch_mode(ASCII);
-       escape=0;
+       {
+               error("Sync while not in ASCII mode");
+               switch_mode(ASCII);
+       }
+       else
+       {
+               append(dec->get());
+               dec->flush();
+       }
 }
 
 void Iso2022Jp::Decoder::switch_mode(Mode m)
index 8ded0d845d2c87200577351acb57ded2466c9f2d..82803b67e61446de3950575027d1a8c29232e6d4 100644 (file)
@@ -18,22 +18,22 @@ public:
        class Encoder: public StringCodec::Encoder
        {
        public:
-               Encoder(): mode(ASCII) { }
+               Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em), mode(ASCII) { }
                void encode_char(wchar_t);
                void sync();
        private:
                Mode mode;
 
                void switch_mode(Mode);
+               void append_replacement();
        };
 
        class Decoder: public StringCodec::Decoder
        {
        public:
-               Decoder();
+               Decoder(ErrorMode =THROW_ON_ERROR);
                void decode_char(const std::string &, std::string::const_iterator &);
                void sync();
-               void reset();
        private:
                Mode mode;
                StringCodec::Decoder *dec;
@@ -42,8 +42,8 @@ public:
                void switch_mode(Mode);
        };
 
-       Encoder *create_encoder() const { return new Encoder; }
-       Decoder *create_decoder() const { return new Decoder; }
+       Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+       Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Msp
index 45c29ffb5dab08d1edc80075152af9b12c1b826f..171c0a0a82780c1642d46c90e8d2b7b466bb68c2 100644 (file)
@@ -4,8 +4,11 @@ using namespace std;
 
 namespace Msp {
 
-void Iso646Fi::Encoder::encode_char(wchar_t c)
+void Iso646Fi::Encoder::encode_char(wchar_t c_)
 {
+       // Win32 has typedef unsigned short wchar_t
+       int c=c_;
+
        if((c>=0 && c<=0x5A) || c==0x5F || (c>=0x61 && c<=0x7A))
                append(c);
        else if(c==0xC4)
@@ -27,7 +30,7 @@ void Iso646Fi::Encoder::encode_char(wchar_t c)
        else if(c==0xFC)
                append(0x7E);
        else
-               throw CodecError("Can't express character in ISO-646-FI");
+               error("Can't express character in ISO-646-FI");
 }
 
 void Iso646Fi::Decoder::decode_char(const string &str, string::const_iterator &i)
@@ -57,7 +60,7 @@ void Iso646Fi::Decoder::decode_char(const string &str, string::const_iterator &i
        else if(c<=0x7F)
                append(c);
        else
-               throw CodecError("Invalid ISO-646-FI string (undefined character)");
+               error("Invalid ISO-646-FI string (undefined character)");
 }
 
 }
index 885015a1abd3169cbf28872b986cb7e9406aa0f2..1f0df917af6ea0d8d46478f84744cd29716c3799 100644 (file)
@@ -11,17 +11,21 @@ public:
        class Encoder: public StringCodec::Encoder
        {
        public:
+               Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
                void encode_char(wchar_t);
+       private:
+               void append_replacement() { append(032); }
        };
 
        class Decoder: public StringCodec::Decoder
        {
        public:
+               Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { }
                void decode_char(const std::string &, std::string::const_iterator &);
        };
 
-       Encoder *create_encoder() const { return new Encoder; }
-       Decoder *create_decoder() const { return new Decoder; }
+       Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+       Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Msp
index 0a8804759aaca8d3d95ab674067f01b4fa442520..65abff25e8f35125cc297c12f9a235a7c239d077 100644 (file)
@@ -4,8 +4,10 @@ using namespace std;
 
 namespace Msp {
 
-void JisX0201::Encoder::encode_char(wchar_t c)
+void JisX0201::Encoder::encode_char(wchar_t c_)
 {
+       // Win32 has typedef unsigned short wchar_t
+       int c=c_;
        if(c>=0 && c<=0x7F && c!=0x5C && c!=0x7E)
                append(c);
        else if(c==0xA5)
@@ -15,7 +17,7 @@ void JisX0201::Encoder::encode_char(wchar_t c)
        else if(c>=0xFF61 && c<=0xFF9F)
                append(c-0xFEC0);
        else
-               throw CodecError("Can't express character in JIS X 0201");
+               error("Can't express character in JIS X 0201");
 }
 
 void JisX0201::Decoder::decode_char(const string &str, string::const_iterator &i)
@@ -33,7 +35,7 @@ void JisX0201::Decoder::decode_char(const string &str, string::const_iterator &i
        else if(c>=0xA1 && c<=0xDF)
                append(c+0xFEC0);
        else
-               throw CodecError("Invalid JIS X 0201 string (undefined character)");
+               error("Invalid JIS X 0201 string (undefined character)");
 }
 
 } // namespace Msp
index 976a9456f708a150a2d699b2b3c990fe37da535b..bf513f771c9cae822d6ba915ea8a834b90437bfd 100644 (file)
@@ -11,17 +11,21 @@ public:
        class Encoder: public StringCodec::Encoder
        {
        public:
+               Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
                void encode_char(wchar_t);
+       private:
+               void append_replacement() { append(032); }
        };
 
        class Decoder: public StringCodec::Decoder
        {
        public:
+               Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { }
                void decode_char(const std::string &, std::string::const_iterator &);
        };
 
-       Encoder *create_encoder() const { return new Encoder; }
-       Decoder *create_decoder() const { return new Decoder; }
+       Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+       Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Msp
index 4d46f709f89df5f739aa7c11a6a298505f41ed45..e24673bf4cff30548cad58f4734e1dcbbfe3d574 100644 (file)
@@ -9,10 +9,13 @@ namespace Msp {
 void JisX0208::Encoder::encode_char(wchar_t ucs)
 {
        unsigned short jis=ucs_to_jisx0208(ucs);
-       if(!jis) throw CodecError("Can't express character in JIS X 0208");
-
-       char buf[2]={jis>>8, jis};
-       append(buf, 2);
+       if(jis)
+       {
+               char buf[2]={jis>>8, jis};
+               append(buf, 2);
+       }
+       else
+               error("Can't express character in JIS X 0208");
 }
 
 
@@ -30,21 +33,19 @@ void JisX0208::Decoder::decode_char(const string &str, string::const_iterator &i
        wchar_t ucs=jisx0208_to_ucs(high<<8 | *i++);
        high=0;
 
-       if(!ucs)
-               throw CodecError("Invalid JIS X 0208 string (undefined character)");
-
-       append(ucs);
+       if(ucs)
+               append(ucs);
+       else
+               error("Invalid JIS X 0208 string (undefined character)");
 }
 
 void JisX0208::Decoder::sync()
 {
        if(high)
-               throw CodecError("Sync in middle of JIS X 0208 character");
-}
-
-void JisX0208::Decoder::reset()
-{
-       high=0;
+       {
+               error("Sync in middle of JIS X 0208 character");
+               high=0;
+       }
 }
 
 wchar_t jisx0208_to_ucs(unsigned short jis)
index 16f82027247c4517d32097d74bca300af1d897ca..12dc2c0526a9e94a2e74e1a19edca1980e4445a2 100644 (file)
@@ -11,12 +11,16 @@ public:
        class Encoder: public StringCodec::Encoder
        {
        public:
+               Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
                void encode_char(wchar_t);
+       private:
+               void append_replacement() { append("!)"); }
        };
 
        class Decoder: public StringCodec::Decoder
        {
        public:
+               Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em), high(0) { }
                void decode_char(const std::string &, std::string::const_iterator &);
                void sync();
                void reset();
@@ -24,8 +28,8 @@ public:
                char high;
        };
 
-       Encoder *create_encoder() const { return new Encoder; }
-       Decoder *create_decoder() const { return new Decoder; }
+       Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+       Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 extern wchar_t jisx0208_to_ucs(unsigned short);
index fe9e1a4170e1893d86574d2f916c3b7722e5f369..319429f683103d671e73db0f0f34918c5ef32422 100644 (file)
@@ -4,11 +4,14 @@ using namespace std;
 
 namespace Msp {
 
-void Latin1::Encoder::encode_char(wchar_t c)
+void Latin1::Encoder::encode_char(wchar_t c_)
 {
+       // Win32 has typedef unsigned short wchar_t
+       int c=c_;
        if(c<0 || c>0xFF)
-               throw CodecError("Can't express character in Latin-1");
-       append(c);
+               error("Can't express character in Latin-1");
+       else
+               append(c);
 }
 
 void Latin1::Decoder::decode_char(const string &str, string::const_iterator &i)
index 20eb426d7ef41d3672bf29c130da29a866706b73..12d880cdc0926b6231c21ea1513f875d882fa375 100644 (file)
@@ -11,17 +11,21 @@ public:
        class Encoder: public StringCodec::Encoder
        {
        public:
+               Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
                void encode_char(wchar_t);
+       private:
+               void append_replacement() { append(032); }
        };
 
        class Decoder: public StringCodec::Decoder
        {
        public:
+               Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em) { }
                void decode_char(const std::string &, std::string::const_iterator &);
        };
 
-       Encoder *create_encoder() const { return new Encoder; }
-       Decoder *create_decoder() const { return new Decoder; }
+       Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+       Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Msp
index f95ad9c27a9ea1b76b86d273432f3c74b21b7fea..030406d3abcfc84dc32fd3d73a8812746d36d41d 100644 (file)
@@ -8,7 +8,10 @@ void Utf8::Encoder::encode_char(wchar_t c)
 {
        unsigned code=c;
        if(code>0x10FFFF)
-               throw CodecError("Can't express character in UTF-8");
+       {
+               error("Can't express character in UTF-8");
+               return;
+       }
 
        unsigned bytes=1;
        if(code>0xFFFF)
@@ -43,8 +46,11 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
                if(bytes==0)
                {
                        if((*i&0xC0)==0x80)
-                               throw CodecError("Invalid UTF-8 string (tail byte when expecting head)");
-
+                       {
+                               error("Invalid UTF-8 string (tail byte when expecting head)");
+                               ++i;
+                               break;
+                       }
                        else if(*i&0x80)
                        {
                                unsigned mask=0x40;
@@ -52,11 +58,20 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
                                        ++bytes;
 
                                if(bytes>3)
-                                       throw CodecError("Invalid UTF-8 string (overlong multibyte sequence)");
-
-                               code=(*i++)&(mask-1);
-                               if(!code)
-                                       throw CodecError("Invalid UTF-8 string (denormalized multibyte sequence)");
+                               {
+                                       error("Invalid UTF-8 string (overlong multibyte sequence)");
+                                       ++i;
+                                       break;
+                               }
+                               else
+                               {
+                                       code=(*i++)&(mask-1);
+                                       if(!code)
+                                       {
+                                               error("Invalid UTF-8 string (denormalized multibyte sequence)");
+                                               break;
+                                       }
+                               }
                        }
                        else
                        {
@@ -67,7 +82,11 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
                else
                {
                        if((*i&0xC0)!=0x80)
-                               throw CodecError("Invalid UTF-8 string (head byte when expecting tail)");
+                       {
+                               error("Invalid UTF-8 string (head byte when expecting tail)");
+                               ++i;
+                               break;
+                       }
 
                        code=code<<6 | (*i++)&0x3F;
                        --bytes;
@@ -75,8 +94,9 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
                        if(!bytes)
                        {
                                if(code>0x10FFFF)
-                                       throw CodecError("Invalid UTF-8 string (character out of range)");
-                               append(code);
+                                       error("Invalid UTF-8 string (character out of range)");
+                               else
+                                       append(code);
                                break;
                        }
                }
@@ -86,7 +106,10 @@ void Utf8::Decoder::decode_char(const string &str, string::const_iterator &i)
 void Utf8::Decoder::sync()
 {
        if(bytes)
-               throw CodecError("Sync in the middle of multibyte UTF-8 sequence");
+       {
+               error("Sync in the middle of multibyte UTF-8 sequence");
+               bytes=0;
+       }
 }
 
 void Utf8::Decoder::reset()
index 0475523cf52578216e9a63c4a4942ad1c18936e9..40b4fb88ea41eb54cd7ca12ac6c32171027b5a79 100644 (file)
@@ -11,13 +11,16 @@ public:
        class Encoder: public StringCodec::Encoder
        {
        public:
+               Encoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Encoder(em) { }
                void encode_char(wchar_t);
+       private:
+               void append_replacement() { append("\357\277\275"); }
        };
 
        class Decoder: public StringCodec::Decoder
        {
        public:
-               Decoder(): bytes(0), code(0) { }
+               Decoder(ErrorMode em=THROW_ON_ERROR): StringCodec::Decoder(em), bytes(0), code(0) { }
                void     decode_char(const std::string &, std::string::const_iterator &);
                void     sync();
                void     reset();
@@ -26,8 +29,8 @@ public:
                unsigned code;
        };
 
-       Encoder *create_encoder() const { return new Encoder; }
-       Decoder *create_decoder() const { return new Decoder; }
+       Encoder *create_encoder(ErrorMode em=THROW_ON_ERROR) const { return new Encoder(em); }
+       Decoder *create_decoder(ErrorMode em=THROW_ON_ERROR) const { return new Decoder(em); }
 };
 
 } // namespace Msp