-/* $Id$
-
-This file is part of libmspstrings
-Copyright © 2007 Mikko Rasa
-Distributed under the LGPL
-*/
-
-#include <stack>
#include <limits>
+#include <list>
+#include <stack>
+#include <vector>
#include <msp/core/except.h>
-#include "formatter.h"
+#include "format.h"
#include "regex.h"
using namespace std;
/** Writes an integer to a Regex code string, in little-endian order. */
template<typename T>
-void write_int(T n, Msp::Regex::Code &code)
+void write_int(T n, basic_string<unsigned char> &code)
{
for(unsigned i=0; i<sizeof(T); ++i)
- code += (n>>i*8)&0xFF;
+ code += (n>>(i*8))&0xFF;
}
-/** Reads an integer from a Regex code stream, in little-endian order. */
+/** Reads an integer from a Regex code string, in little-endian order. */
template<typename T>
-T read_int(Msp::Regex::Code::const_iterator &c)
+T read_int(basic_string<unsigned char>::const_iterator &c)
{
T result = 0;
for(unsigned i=0; i<sizeof(T); ++i)
- result += static_cast<unsigned char>(*c++)<<i*8;
+ result += (*c++)<<(i*8);
return result;
}
namespace Msp {
+bad_regex::bad_regex(const string &w, const string &e, const string::const_iterator &i):
+ logic_error(w+"\n"+make_where(e, i))
+{ }
+
+string bad_regex::make_where(const string &e, const string::const_iterator &i)
+{
+ string result;
+ string::size_type offset = i-e.begin();
+ if(offset>40)
+ {
+ result = e.substr(offset-40, 60);
+ offset = 40;
+ }
+ else
+ result = e.substr(0, 60);
+ result += '\n';
+ result.append(offset, ' ');
+ result += '^';
+ return result;
+}
+
+
Regex::Regex(const string &expr)
{
- n_groups = 0;
- string::const_iterator iter = expr.begin();
+ auto iter = expr.begin();
code = compile(expr, iter, n_groups, false);
++n_groups;
}
Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
{
bool has_branches = false;
- unsigned level = 0;
+ stack<string::const_iterator> parens;
bool escape = false;
unsigned bracket = 0;
string::const_iterator end;
else if(*end=='\\')
escape = true;
else if(*end=='(')
- ++level;
+ parens.push(end);
else if(*end==')')
{
- if(level==0)
+ if(parens.empty())
{
if(group==0)
- throw InvalidParameterValue("Unexpected )");
+ throw bad_regex("unmatched ')'", expr, end);
else
break;
}
- --level;
+ parens.pop();
}
- else if(*end=='|' && level==0)
+ else if(*end=='|' && parens.empty())
{
if(branch)
break;
bracket = 1;
}
- if(level>0)
- throw InvalidParameterValue("Unmatched (");
+ if(!parens.empty())
+ throw bad_regex("unmatched '('", expr, parens.top());
Code result;
if(!has_branches)
{
- for(string::const_iterator i=iter; i!=end;)
+ for(auto i=iter; i!=end;)
{
Code atom = parse_atom(expr, i, group);
Count repeat_min = 1;
Count repeat_max = 1;
- parse_repeat(i, repeat_min, repeat_max);
+ if(i!=end)
+ parse_repeat(expr, i, repeat_min, repeat_max);
for(unsigned j=0; j<repeat_min; ++j)
result += atom;
result += atom;
}
result += ND_JUMP;
- write_int<Offset>(-(atom.size()+jump_size), result);
+ write_int<Offset>(-static_cast<Offset>(atom.size()+jump_size), result);
}
else if(repeat_max>repeat_min)
{
}
else
{
- list<Code> branches;
- for(string::const_iterator i=iter;;)
+ vector<Code> branches;
+ for(auto i=iter;;)
{
branches.push_back(compile(expr, i, group, true));
if(i==end)
unsigned n_branches = branches.size();
Offset offset = (n_branches-1)*jump_size+branches.front().size();
- for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
+ for(auto i=++branches.begin(); i!=branches.end(); ++i)
{
result += ND_JUMP;
write_int<Offset>(offset, result);
offset += i->size();
}
- for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
+ for(auto i=branches.begin(); i!=branches.end();)
{
result += *i;
offset -= i->size()+jump_size;
if(*i=='\\')
{
if(++i==expr.end())
- throw InvalidParameterValue("Stray backslash");
+ throw bad_regex("stray backslash", expr, i-1);
flag = true;
}
if(!flag)
{
if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
- throw InvalidParameterValue("Invalid atom");
+ throw bad_regex("invalid atom", expr, i);
else if(*i=='[')
return parse_brackets(expr, i);
else if(*i=='.')
return result;
}
-bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
+bool Regex::parse_repeat(const string &expr, string::const_iterator &i, Count &rmin, Count &rmax)
{
if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
return false;
rmin = 0;
if(*i=='{')
{
+ auto begin = i;
+
rmin = 0;
for(++i; isdigit(*i); ++i)
rmin = rmin*10+(*i-'0');
for(; isdigit(*i); ++i)
rmax = rmax*10+(*i-'0');
if(rmax<rmin)
- throw InvalidParameterValue("Invalid bound");
+ throw bad_regex("invalid bound", expr, begin);
}
else
rmax = numeric_limits<Count>::max();
else
rmax = rmin;
if(*i!='}')
- throw InvalidParameterValue("Invalid bound");
+ throw bad_regex("invalid bound", expr, begin);
}
++i;
Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
{
+ auto begin = iter;
Code result;
++iter;
++iter;
}
- string::const_iterator end = iter;
+ auto end = iter;
for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
if(end==str.end())
- throw InvalidParameterValue("Unmatched '['");
+ throw bad_regex("unmatched '['", str, begin);
unsigned char mask[32] = {0};
unsigned type = 0;
bool range = false;
- unsigned char first=0, last = 0;
- for(string::const_iterator i=iter; i!=end; ++i)
+ unsigned char first = 0, last = 0;
+ for(auto i=iter; i!=end; ++i)
{
unsigned char c = *i;
if(range)
else
{
result += MATCH_MASK;
- result.append(reinterpret_cast<char *>(mask), 32);
+ result.append(mask, 32);
}
iter = end;
RegMatch Regex::match(const string &str) const
{
- RegMatch::GroupArray groups(n_groups);
+ vector<RegMatch::Group> groups(n_groups);
- for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
+ for(auto i=str.begin(); i!=str.end(); ++i)
if(run(str, i, groups))
return RegMatch(str, groups);
return RegMatch();
}
-bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
+bool Regex::run(const string &str, const string::const_iterator &begin, vector<RegMatch::Group> &groups) const
{
bool result = false;
list<RunContext> ctx;
ctx.front().citer = code.begin();
ctx.front().groups.resize(groups.size());
- for(string::const_iterator i=begin;;)
+ for(auto i=begin;;)
{
int c;
if(i!=str.end())
- c = static_cast<unsigned char>(*i);
+ c = *i;
else
c = -1;
- for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
+ for(auto j=ctx.begin(); j!=ctx.end();)
{
bool terminate = false;
bool negate_match = false;
input_consumed = true;
}
else
- throw Exception("Invalid instruction");
+ throw internal_error("invalid instruction in regex bytecode");
if(match_result==negate_match)
terminate = true;
// Earlier match is better
if(g1.begin<g2.begin)
return true;
- if(g2.begin>g2.begin)
+ if(g1.begin>g2.begin)
return false;
// Longer match at same position is better
string Regex::disassemble() const
{
- ostringstream ss;
+ string result;
- for(Code::const_iterator i=code.begin(); i!=code.end();)
+ for(auto i=code.begin(); i!=code.end();)
{
- Code::const_iterator j = i;
+ auto j = i;
Offset offset = i-code.begin();
string decompiled = disassemble_instruction(i);
string bytes;
for(; j!=i; ++j)
- bytes += format(" %02X", static_cast<int>(*j)&0xFF);
- ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
+ bytes += format(" %02X", *j);
+ result += format("%3d:%-9s ", offset, bytes);
if(bytes.size()>9)
- ss<<"\n"<<Fmt("%15s");
- ss<<" "<<decompiled<<'\n';
+ result += "\n ";
+ result += decompiled;
+ result += '\n';
}
- return ss.str();
+ return result;
}
string Regex::disassemble_instruction(Code::const_iterator &i) const
{
Instruction instr = static_cast<Instruction>(*i++);
- ostringstream result;
switch(instr)
{
case JUMP:
{
Offset offset = read_int<Offset>(i);
- result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
+ return format("JUMP %+d (%d)", offset, (i-code.begin())+offset);
}
- break;
case ND_JUMP:
{
Offset offset = read_int<Offset>(i);
- result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
+ return format("ND_JUMP %+d (%d)", offset, (i-code.begin())+offset);
}
- break;
case GROUP_BEGIN:
- result<<"GROUP_BEGIN "<<read_int<Index>(i);
- break;
+ return format("GROUP_BEGIN %d", read_int<Index>(i));
case GROUP_END:
- result<<"GROUP_END "<<read_int<Index>(i);
- break;
+ return format("GROUP_END %d", read_int<Index>(i));
case NEGATE:
- result<<"NEGATE";
- break;
+ return "NEGATE";
case MATCH_BEGIN:
- result<<"MATCH_BEGIN";
- break;
+ return "MATCH_BEGIN";
case MATCH_END:
- result<<"MATCH_END";
- break;
+ return "MATCH_END";
case MATCH_CHAR:
{
- char c = *i++;
- result<<"MATCH_CHAR ";
+ unsigned char c = *i++;
if(c>=0x20 && c<=0x7E)
- result<<'\''<<c<<'\'';
+ return format("MATCH_CHAR '%c'", c);
else
- result<<(static_cast<int>(c)&0xFF);
+ return format("MATCH_CHAR %d", c);
}
break;
case MATCH_RANGE:
- result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
- result<<'-'<<(static_cast<int>(*i++)&0xFF);
- break;
+ {
+ int begin = *i++;
+ int end = *i++;
+ return format("MATCH_RANGE %d-%d", begin, end);
+ }
case MATCH_MASK:
- result<<"MATCH_MASK";
- for(unsigned j=0; j<32; ++j)
- result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
- break;
+ {
+ string result = "MATCH_MASK";
+ for(unsigned j=0; j<32; ++j)
+ result += format(" %02X", *i++);
+ return result;
+ }
case MATCH_ANY:
- result<<"MATCH_ANY";
- break;
+ return "MATCH_ANY";
default:
- result<<"UNKNOWN "<<instr;
+ return format("UNKNOWN %d", instr);
}
-
- return result.str();
}
} // namespace Msp