]> git.tdb.fi Git - libs/core.git/blobdiff - source/strings/regex.cpp
Style fix: add whitespace
[libs/core.git] / source / strings / regex.cpp
index f45c820c80ddadf87b6e621645f02cd965ac6cdc..67157c9b185e434adf995ec75eeb3f122869ea9a 100644 (file)
@@ -1,14 +1,7 @@
-/* $Id$
-
-This file is part of libmspstrings
-Copyright © 2007 Mikko Rasa
-Distributed under the LGPL
-*/
-
-#include <stack>
 #include <limits>
-#include <msp/core/except.h>
-#include "formatter.h"
+#include <list>
+#include <stack>
+#include "format.h"
 #include "regex.h"
 
 using namespace std;
@@ -20,16 +13,16 @@ template<typename T>
 void write_int(T n, Msp::Regex::Code &code)
 {
        for(unsigned i=0; i<sizeof(T); ++i)
-               code += (n>>i*8)&0xFF;
+               code += (n>>(i*8))&0xFF;
 }
 
-/** Reads an integer from a Regex code stream, in little-endian order. */
+/** Reads an integer from a Regex code string, in little-endian order. */
 template<typename T>
 T read_int(Msp::Regex::Code::const_iterator &c)
 {
        T result = 0;
        for(unsigned i=0; i<sizeof(T); ++i)
-               result += static_cast<unsigned char>(*c++)<<i*8;
+               result += (*c++)<<(i*8);
        return result;
 }
 
@@ -38,6 +31,28 @@ T read_int(Msp::Regex::Code::const_iterator &c)
 
 namespace Msp {
 
+bad_regex::bad_regex(const string &w, const string &e, const string::const_iterator &i):
+       logic_error(w+"\n"+make_where(e, i))
+{ }
+
+string bad_regex::make_where(const string &e, const string::const_iterator &i)
+{
+       string result;
+       string::size_type offset = i-e.begin();
+       if(offset>40)
+       {
+               result = e.substr(offset-40, 60);
+               offset = 40;
+       }
+       else
+               result = e.substr(0, 60);
+       result += '\n';
+       result.append(offset, ' ');
+       result += '^';
+       return result;
+}
+
+
 Regex::Regex(const string &expr)
 {
        n_groups = 0;
@@ -49,7 +64,7 @@ Regex::Regex(const string &expr)
 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
 {
        bool has_branches = false;
-       unsigned level = 0;
+       stack<string::const_iterator> parens;
        bool escape = false;
        unsigned bracket = 0;
        string::const_iterator end;
@@ -69,19 +84,19 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns
                else if(*end=='\\')
                        escape = true;
                else if(*end=='(')
-                       ++level;
+                       parens.push(end);
                else if(*end==')')
                {
-                       if(level==0)
+                       if(parens.empty())
                        {
                                if(group==0)
-                                       throw InvalidParameterValue("Unexpected )");
+                                       throw bad_regex("unmatched ')'", expr, end);
                                else
                                        break;
                        }
-                       --level;
+                       parens.pop();
                }
-               else if(*end=='|' && level==0)
+               else if(*end=='|' && parens.empty())
                {
                        if(branch)
                                break;
@@ -92,8 +107,8 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns
                        bracket = 1;
        }
 
-       if(level>0)
-               throw InvalidParameterValue("Unmatched (");
+       if(!parens.empty())
+               throw bad_regex("unmatched '('", expr, parens.top());
 
        Code result;
 
@@ -114,7 +129,7 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns
 
                        Count repeat_min = 1;
                        Count repeat_max = 1;
-                       parse_repeat(i, repeat_min, repeat_max);
+                       parse_repeat(expr, i, repeat_min, repeat_max);
 
                        for(unsigned j=0; j<repeat_min; ++j)
                                result += atom;
@@ -196,14 +211,14 @@ Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, uns
        if(*i=='\\')
        {
                if(++i==expr.end())
-                       throw InvalidParameterValue("Stray backslash");
+                       throw bad_regex("stray backslash", expr, i-1);
                flag = true;
        }
 
        if(!flag)
        {
                if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
-                       throw InvalidParameterValue("Invalid atom");
+                       throw bad_regex("invalid atom", expr, i);
                else if(*i=='[')
                        return parse_brackets(expr, i);
                else if(*i=='.')
@@ -232,7 +247,7 @@ Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, uns
        return result;
 }
 
-bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
+bool Regex::parse_repeat(const string &expr, string::const_iterator &i, Count &rmin, Count &rmax)
 {
        if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
                return false;
@@ -243,6 +258,8 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
                rmin = 0;
        if(*i=='{')
        {
+               string::const_iterator begin = i;
+
                rmin = 0;
                for(++i; isdigit(*i); ++i)
                        rmin = rmin*10+(*i-'0');
@@ -256,7 +273,7 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
                                for(; isdigit(*i); ++i)
                                        rmax = rmax*10+(*i-'0');
                                if(rmax<rmin)
-                                       throw InvalidParameterValue("Invalid bound");
+                                       throw bad_regex("invalid bound", expr, begin);
                        }
                        else
                                rmax = numeric_limits<Count>::max();
@@ -264,7 +281,7 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
                else
                        rmax = rmin;
                if(*i!='}')
-                       throw InvalidParameterValue("Invalid bound");
+                       throw bad_regex("invalid bound", expr, begin);
        }
 
        ++i;
@@ -274,6 +291,7 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
 
 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
 {
+       string::const_iterator begin = iter;
        Code result;
 
        ++iter;
@@ -287,12 +305,12 @@ Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &ite
        string::const_iterator end = iter;
        for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
        if(end==str.end())
-               throw InvalidParameterValue("Unmatched '['");
+               throw bad_regex("unmatched '['", str, begin);
 
        unsigned char mask[32] = {0};
        unsigned type = 0;
        bool range = false;
-       unsigned char first=0, last = 0;
+       unsigned char first = 0, last = 0;
        for(string::const_iterator i=iter; i!=end; ++i)
        {
                unsigned char c = *i;
@@ -335,7 +353,7 @@ Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &ite
        else
        {
                result += MATCH_MASK;
-               result.append(reinterpret_cast<char *>(mask), 32);
+               result.append(mask, 32);
        }
 
        iter = end;
@@ -367,7 +385,7 @@ bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch
        {
                int c;
                if(i!=str.end())
-                       c = static_cast<unsigned char>(*i);
+                       c = *i;
                else
                        c = -1;
 
@@ -458,7 +476,7 @@ bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch
                                                input_consumed = true;
                                        }
                                        else
-                                               throw Exception("Invalid instruction");
+                                               throw logic_error("invalid instruction in regex bytecode");
 
                                        if(match_result==negate_match)
                                                terminate = true;
@@ -504,7 +522,7 @@ bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2)
 
 string Regex::disassemble() const
 {
-       ostringstream ss;
+       string result;
 
        for(Code::const_iterator i=code.begin(); i!=code.end();)
        {
@@ -513,77 +531,70 @@ string Regex::disassemble() const
                string decompiled = disassemble_instruction(i);
                string bytes;
                for(; j!=i; ++j)
-                       bytes += format(" %02X", static_cast<int>(*j)&0xFF);
-               ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
+                       bytes += format(" %02X", *j);
+               result += format("%3d:%-9s  ", offset, bytes);
                if(bytes.size()>9)
-                       ss<<"\n"<<Fmt("%15s");
-               ss<<"  "<<decompiled<<'\n';
+                       result += "\n               ";
+               result += decompiled;
+               result += '\n';
        }
 
-       return ss.str();
+       return result;
 }
 
 string Regex::disassemble_instruction(Code::const_iterator &i) const
 {
        Instruction instr = static_cast<Instruction>(*i++);
 
-       ostringstream result;
        switch(instr)
        {
        case JUMP:
                {
                        Offset offset = read_int<Offset>(i);
-                       result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
+                       return format("JUMP %+d (%d)", offset, (i-code.begin())+offset);
                }
-               break;
        case ND_JUMP:
                {
                        Offset offset = read_int<Offset>(i);
-                       result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
+                       return format("ND_JUMP %+d (%d)", offset, (i-code.begin())+offset);
                }
-               break;
        case GROUP_BEGIN:
-               result<<"GROUP_BEGIN "<<read_int<Index>(i);
-               break;
+               return format("GROUP_BEGIN %d", read_int<Index>(i));
        case GROUP_END:
-               result<<"GROUP_END "<<read_int<Index>(i);
-               break;
+               return format("GROUP_END %d", read_int<Index>(i));
        case NEGATE:
-               result<<"NEGATE";
-               break;
+               return "NEGATE";
        case MATCH_BEGIN:
-               result<<"MATCH_BEGIN";
-               break;
+               return "MATCH_BEGIN";
        case MATCH_END:
-               result<<"MATCH_END";
-               break;
+               return "MATCH_END";
        case MATCH_CHAR:
                {
-                       char c = *i++;
-                       result<<"MATCH_CHAR ";
+                       unsigned char c = *i++;
                        if(c>=0x20 && c<=0x7E)
-                               result<<'\''<<c<<'\'';
+                               return format("MATCH_CHAR '%c'", c);
                        else
-                               result<<(static_cast<int>(c)&0xFF);
+                               return format("MATCH_CHAR %d", c);
                }
                break;
        case MATCH_RANGE:
-               result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
-               result<<'-'<<(static_cast<int>(*i++)&0xFF);
-               break;
+               {
+                       int begin = *i++;
+                       int end = *i++;
+                       return format("MATCH_RANGE %d-%d", begin, end);
+               }
        case MATCH_MASK:
-               result<<"MATCH_MASK";
-               for(unsigned j=0; j<32; ++j)
-                       result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
-               break;
+               {
+                       string result = "MATCH_MASK";
+                       for(unsigned j=0; j<32; ++j)
+                               result += format(" %02X", *i++);
+                       return result;
+               }
        case MATCH_ANY:
-               result<<"MATCH_ANY";
-               break;
+               return "MATCH_ANY";
        default:
-               result<<"UNKNOWN "<<instr;
+               return format("UNKNOWN %d", instr);
        }
-
-       return result.str();
 }
 
 } // namespace Msp