-/* $Id$
-
-This file is part of libmspstrings
-Copyright © 2007 Mikko Rasa
-Distributed under the LGPL
-*/
-
-#include <stack>
#include <limits>
-#include <msp/core/except.h>
+#include <list>
+#include <stack>
#include "format.h"
#include "regex.h"
void write_int(T n, Msp::Regex::Code &code)
{
for(unsigned i=0; i<sizeof(T); ++i)
- code += (n>>i*8)&0xFF;
+ code += (n>>(i*8))&0xFF;
}
-/** Reads an integer from a Regex code stream, in little-endian order. */
+/**
+Reads an integer of type T from a Regex code string, in little-endian order.
+Consumes sizeof(T) elements and leaves the iterator just past the last one.
+*/
template<typename T>
T read_int(Msp::Regex::Code::const_iterator &c)
{
T result = 0;
for(unsigned i=0; i<sizeof(T); ++i)
- result += static_cast<unsigned char>(*c++)<<i*8;
+ // Reduce each element to an unsigned byte so that a signed element type
+ // cannot sign-extend into the higher bytes, and widen to T before shifting
+ // so the shift is performed in the result type.
+ result += static_cast<T>(static_cast<unsigned char>(*c++))<<(i*8);
return result;
}
namespace Msp {
+/**
+Creates an exception whose message is the description w, followed on a new
+line by the excerpt-and-caret marker produced by make_where for expression e
+and position i.
+*/
+bad_regex::bad_regex(const string &w, const string &e, const string::const_iterator &i):
+	logic_error(w+"\n"+make_where(e, i))
+{ }
+
+/**
+Builds a two-line marker for position i within expression e.  The first line
+is an excerpt of at most 60 characters of e; the second line is a '^' padded
+with spaces so that it sits under the character at i.  When i lies more than
+40 characters into e, the excerpt starts 40 characters before i.
+*/
+string bad_regex::make_where(const string &e, const string::const_iterator &i)
+{
+	// Column of the caret, initially the distance of i from the start of e.
+	string::size_type caret = i-e.begin();
+	string::size_type first = 0;
+	if(caret>40)
+	{
+		// Drop the beginning of a long expression, pinning the caret to column 40.
+		first = caret-40;
+		caret = 40;
+	}
+
+	string where = e.substr(first, 60);
+	where += '\n';
+	where += string(caret, ' ');
+	where += '^';
+	return where;
+}
+
+
Regex::Regex(const string &expr)
{
n_groups = 0;
Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
{
bool has_branches = false;
- unsigned level = 0;
+ stack<string::const_iterator> parens;
bool escape = false;
unsigned bracket = 0;
string::const_iterator end;
else if(*end=='\\')
escape = true;
else if(*end=='(')
- ++level;
+ parens.push(end);
else if(*end==')')
{
- if(level==0)
+ if(parens.empty())
{
if(group==0)
- throw InvalidParameterValue("Unexpected )");
+ throw bad_regex("unmatched ')'", expr, end);
else
break;
}
- --level;
+ parens.pop();
}
- else if(*end=='|' && level==0)
+ else if(*end=='|' && parens.empty())
{
if(branch)
break;
bracket = 1;
}
- if(level>0)
- throw InvalidParameterValue("Unmatched (");
+ if(!parens.empty())
+ throw bad_regex("unmatched '('", expr, parens.top());
Code result;
Count repeat_min = 1;
Count repeat_max = 1;
- parse_repeat(i, repeat_min, repeat_max);
+ parse_repeat(expr, i, repeat_min, repeat_max);
for(unsigned j=0; j<repeat_min; ++j)
result += atom;
if(*i=='\\')
{
if(++i==expr.end())
- throw InvalidParameterValue("Stray backslash");
+ throw bad_regex("stray backslash", expr, i-1);
flag = true;
}
if(!flag)
{
if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
- throw InvalidParameterValue("Invalid atom");
+ throw bad_regex("invalid atom", expr, i);
else if(*i=='[')
return parse_brackets(expr, i);
else if(*i=='.')
return result;
}
-bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
+bool Regex::parse_repeat(const string &expr, string::const_iterator &i, Count &rmin, Count &rmax)
{
if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
return false;
rmin = 0;
if(*i=='{')
{
+ string::const_iterator begin = i;
+
rmin = 0;
for(++i; isdigit(*i); ++i)
rmin = rmin*10+(*i-'0');
for(; isdigit(*i); ++i)
rmax = rmax*10+(*i-'0');
if(rmax<rmin)
- throw InvalidParameterValue("Invalid bound");
+ throw bad_regex("invalid bound", expr, begin);
}
else
rmax = numeric_limits<Count>::max();
else
rmax = rmin;
if(*i!='}')
- throw InvalidParameterValue("Invalid bound");
+ throw bad_regex("invalid bound", expr, begin);
}
++i;
Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
{
+ string::const_iterator begin = iter;
Code result;
++iter;
string::const_iterator end = iter;
for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
if(end==str.end())
- throw InvalidParameterValue("Unmatched '['");
+ throw bad_regex("unmatched '['", str, begin);
unsigned char mask[32] = {0};
unsigned type = 0;
else
{
result += MATCH_MASK;
- result.append(reinterpret_cast<char *>(mask), 32);
+ result.append(mask, 32);
}
iter = end;
{
int c;
if(i!=str.end())
- c = static_cast<unsigned char>(*i);
+ c = *i;
else
c = -1;
input_consumed = true;
}
else
- throw Exception("Invalid instruction");
+ throw logic_error("invalid instruction in regex bytecode");
if(match_result==negate_match)
terminate = true;
string Regex::disassemble() const
{
- ostringstream ss;
+ string result;
for(Code::const_iterator i=code.begin(); i!=code.end();)
{
string decompiled = disassemble_instruction(i);
string bytes;
for(; j!=i; ++j)
- bytes += format(" %02X", static_cast<int>(*j)&0xFF);
- ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
+ bytes += format(" %02X", *j);
+ result += format("%3d:%-9s ", offset, bytes);
if(bytes.size()>9)
- ss<<"\n"<<Fmt("%15s");
- ss<<" "<<decompiled<<'\n';
+ result += "\n ";
+ result += decompiled;
+ result += '\n';
}
- return ss.str();
+ return result;
}
+/**
+Decodes the single instruction at i into a human-readable string.  The
+iterator is advanced past the opcode and any operands that were read.  An
+opcode that matches none of the cases is reported as "UNKNOWN" followed by
+its numeric value.  Every case returns directly, so nothing follows the
+switch.
+*/
string Regex::disassemble_instruction(Code::const_iterator &i) const
{
Instruction instr = static_cast<Instruction>(*i++);
- ostringstream result;
switch(instr)
{
case JUMP:
{
Offset offset = read_int<Offset>(i);
- result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
+ return format("JUMP %+d (%d)", offset, (i-code.begin())+offset);
}
- break;
case ND_JUMP:
{
Offset offset = read_int<Offset>(i);
- result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
+ return format("ND_JUMP %+d (%d)", offset, (i-code.begin())+offset);
}
- break;
case GROUP_BEGIN:
- result<<"GROUP_BEGIN "<<read_int<Index>(i);
- break;
+ return format("GROUP_BEGIN %d", read_int<Index>(i));
case GROUP_END:
- result<<"GROUP_END "<<read_int<Index>(i);
- break;
+ return format("GROUP_END %d", read_int<Index>(i));
case NEGATE:
- result<<"NEGATE";
- break;
+ return "NEGATE";
case MATCH_BEGIN:
- result<<"MATCH_BEGIN";
- break;
+ return "MATCH_BEGIN";
case MATCH_END:
- result<<"MATCH_END";
- break;
+ return "MATCH_END";
case MATCH_CHAR:
{
- char c = *i++;
- result<<"MATCH_CHAR ";
+ unsigned char c = *i++;
if(c>=0x20 && c<=0x7E)
- result<<'\''<<c<<'\'';
+ return format("MATCH_CHAR '%c'", c);
else
- result<<(static_cast<int>(c)&0xFF);
+ return format("MATCH_CHAR %d", c);
}
- break;
case MATCH_RANGE:
- result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
- result<<'-'<<(static_cast<int>(*i++)&0xFF);
- break;
+ {
+ int begin = *i++;
+ int end = *i++;
+ return format("MATCH_RANGE %d-%d", begin, end);
+ }
case MATCH_MASK:
- result<<"MATCH_MASK";
- for(unsigned j=0; j<32; ++j)
- result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
- break;
+ {
+ string result = "MATCH_MASK";
+ for(unsigned j=0; j<32; ++j)
+ result += format(" %02X", *i++);
+ return result;
+ }
case MATCH_ANY:
- result<<"MATCH_ANY";
- break;
+ return "MATCH_ANY";
default:
- result<<"UNKNOWN "<<instr;
+ return format("UNKNOWN %d", instr);
}
-
- return result.str();
}
} // namespace Msp