Copyright © 2007 Mikko Rasa
Distributed under the LGPL
*/
+
#include <stack>
-#include <msp/core/error.h>
+#include <limits>
+#include <msp/core/except.h>
#include "formatter.h"
#include "regex.h"
namespace {
-/**
-Writes an integer to a Regex code string, in little-endian order.
-*/
+/** Writes an integer to a Regex code string, in little-endian order. */
template<typename T>
void write_int(T n, Msp::Regex::Code &code)
{
for(unsigned i=0; i<sizeof(T); ++i)
- code+=(n>>i*8)&0xFF;
+ code += (n>>i*8)&0xFF;
}
-/**
-Reads an integer from a Regex code stream, in little-endian order.
-*/
+/** Reads an integer from a Regex code stream, in little-endian order. */
template<typename T>
T read_int(Msp::Regex::Code::const_iterator &c)
{
- T result=0;
+ T result = 0;
for(unsigned i=0; i<sizeof(T); ++i)
- result+=static_cast<unsigned char>(*c++)<<i*8;
+ result += static_cast<unsigned char>(*c++)<<i*8;
return result;
}
}
+
namespace Msp {
Regex::Regex(const string &expr)
{
- n_groups=0;
- string::const_iterator iter=expr.begin();
- code=compile(expr, iter, n_groups, false);
+ n_groups = 0;
+ string::const_iterator iter = expr.begin();
+ code = compile(expr, iter, n_groups, false);
++n_groups;
}
-RegMatch Regex::match(const string &str) const
-{
- RegMatch::GroupArray groups(n_groups);
-
- for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
- if(run(str, i, groups))
- return RegMatch(str, groups);
-
- return RegMatch();
-}
-
-string Regex::disassemble() const
-{
- ostringstream ss;
-
- for(Code::const_iterator i=code.begin(); i!=code.end();)
- {
- Code::const_iterator j=i;
- Offset offset=i-code.begin();
- string decompiled=disassemble_instruction(i);
- string bytes;
- for(; j!=i; ++j)
- bytes+=format(" %02X", static_cast<int>(*j)&0xFF);
- ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
- if(bytes.size()>9)
- ss<<"\n"<<Fmt("%15s");
- ss<<" "<<decompiled<<'\n';
- }
-
- return ss.str();
-}
-
Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
{
- bool has_branches=false;
- unsigned level=0;
- bool escape=false;
+ bool has_branches = false;
+ unsigned level = 0;
+ bool escape = false;
+ unsigned bracket = 0;
string::const_iterator end;
for(end=iter; end!=expr.end(); ++end)
{
if(escape)
- escape=false;
+ escape = false;
+ else if(bracket)
+ {
+ if(bracket==3 && *end==']')
+ bracket = 0;
+ else if(bracket==1 && *end=='^')
+ bracket = 2;
+ else
+ bracket = 3;
+ }
else if(*end=='\\')
- escape=true;
+ escape = true;
else if(*end=='(')
++level;
else if(*end==')')
}
--level;
}
- else if(*end=='|')
+ else if(*end=='|' && level==0)
{
if(branch)
break;
- else if(level==0)
- has_branches=true;
+ else
+ has_branches = true;
}
+ else if(*end=='[')
+ bracket = 1;
}
if(level>0)
Code result;
- unsigned this_group=group;
+ unsigned this_group = group;
if(!branch)
{
- result+=GROUP_BEGIN;
+ result += GROUP_BEGIN;
write_int<Index>(this_group, result);
}
- const unsigned jump_size=1+sizeof(Offset);
+ const unsigned jump_size = 1+sizeof(Offset);
if(!has_branches)
{
for(string::const_iterator i=iter; i!=end;)
{
- Code atom=parse_atom(expr, i, group);
+ Code atom = parse_atom(expr, i, group);
- Count repeat_min=1;
- Count repeat_max=1;
+ Count repeat_min = 1;
+ Count repeat_max = 1;
parse_repeat(i, repeat_min, repeat_max);
for(unsigned j=0; j<repeat_min; ++j)
- result+=atom;
+ result += atom;
if(repeat_max==numeric_limits<Count>::max())
{
if(repeat_min==0)
{
- result+=ND_JUMP;
+ result += ND_JUMP;
write_int<Offset>(atom.size()+jump_size, result);
- result+=atom;
+ result += atom;
}
- result+=ND_JUMP;
+ result += ND_JUMP;
write_int<Offset>(-(atom.size()+jump_size), result);
}
else if(repeat_max>repeat_min)
{
for(unsigned j=repeat_min; j<repeat_max; ++j)
{
- result+=ND_JUMP;
+ result += ND_JUMP;
write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
- result+=atom;
+ result += atom;
}
}
}
++i;
}
- unsigned n_branches=branches.size();
+ unsigned n_branches = branches.size();
- Offset offset=(n_branches-1)*jump_size+branches.front().size();
+ Offset offset = (n_branches-1)*jump_size+branches.front().size();
for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
{
- result+=ND_JUMP;
+ result += ND_JUMP;
write_int<Offset>(offset, result);
- offset+=i->size();
+ offset += i->size();
}
for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
{
- result+=*i;
- offset-=i->size()+jump_size;
+ result += *i;
+ offset -= i->size()+jump_size;
++i;
if(i!=branches.end())
{
- result+=JUMP;
+ result += JUMP;
write_int<Offset>(offset, result);
}
}
if(!branch)
{
- result+=GROUP_END;
+ result += GROUP_END;
write_int<Index>(this_group, result);
}
- iter=end;
+ iter = end;
return result;
}
if(i==expr.end())
return result;
- bool flag=false;
+ bool flag = false;
if(*i=='\\')
{
if(++i==expr.end())
throw InvalidParameterValue("Stray backslash");
- flag=true;
+ flag = true;
}
if(!flag)
else if(*i=='[')
return parse_brackets(expr, i);
else if(*i=='.')
- result+=MATCH_ANY;
+ result += MATCH_ANY;
else if(*i=='^')
- result+=MATCH_BEGIN;
+ result += MATCH_BEGIN;
else if(*i=='$')
- result+=MATCH_END;
+ result += MATCH_END;
else if(*i=='(')
{
++group;
- result=compile(expr, ++i, group, false);
+ result = compile(expr, ++i, group, false);
}
else
- flag=true;
+ flag = true;
}
if(flag)
{
- if(static_cast<unsigned char>(*i)<=LAST_INSTRUCTION_)
- result+=MATCH_CHAR;
- result+=*i;
+ result += MATCH_CHAR;
+ result += *i;
}
++i;
return false;
if(*i=='*' || *i=='+')
- rmax=numeric_limits<Count>::max();
+ rmax = numeric_limits<Count>::max();
if(*i=='*' || *i=='?')
- rmin=0;
+ rmin = 0;
if(*i=='{')
{
- rmin=0;
+ rmin = 0;
for(++i; isdigit(*i); ++i)
- rmin=rmin*10+(*i-'0');
+ rmin = rmin*10+(*i-'0');
if(*i==',')
{
++i;
if(*i!='}')
{
- rmax=0;
+ rmax = 0;
for(; isdigit(*i); ++i)
- rmax=rmax*10+(*i-'0');
+ rmax = rmax*10+(*i-'0');
if(rmax<rmin)
throw InvalidParameterValue("Invalid bound");
}
else
- rmax=numeric_limits<Count>::max();
+ rmax = numeric_limits<Count>::max();
}
else
- rmax=rmin;
+ rmax = rmin;
if(*i!='}')
throw InvalidParameterValue("Invalid bound");
}
Code result;
++iter;
- bool neg=false;
+ bool neg = false;
if(*iter=='^')
{
- neg=true;
+ neg = true;
++iter;
}
- string::const_iterator end=iter;
- for(; (end!=str.end() && (end==iter || *end!=']')); ++end);
+ string::const_iterator end = iter;
+ for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
if(end==str.end())
throw InvalidParameterValue("Unmatched '['");
- uint8_t mask[32]={0};
- unsigned type=0;
- bool range=false;
- unsigned char first=0, last=0;
+ unsigned char mask[32] = {0};
+ unsigned type = 0;
+ bool range = false;
+ unsigned char first=0, last = 0;
for(string::const_iterator i=iter; i!=end; ++i)
{
- unsigned char c=*i;
+ unsigned char c = *i;
if(range)
{
- last=c;
+ last = c;
for(unsigned j=first; j<=c; ++j)
- mask[j>>3]|=1<<(j&7);
- range=false;
+ mask[j>>3] |= 1<<(j&7);
+ range = false;
if(type<2)
- type=2;
+ type = 2;
}
else if(c=='-' && i!=iter && end-i>1)
- range=true;
+ range = true;
else
{
- first=c;
- mask[c>>3]|=1<<(c&7);
+ first = c;
+ mask[c>>3] |= 1<<(c&7);
if(type==0)
- type=1;
+ type = 1;
else
- type=3;
+ type = 3;
}
}
if(neg)
- result+=NEGATE;
+ result += NEGATE;
if(type==1)
{
- result+=MATCH_CHAR;
- result+=first;
+ result += MATCH_CHAR;
+ result += first;
}
else if(type==2)
{
- result+=MATCH_RANGE;
- result+=first;
- result+=last;
+ result += MATCH_RANGE;
+ result += first;
+ result += last;
}
else
{
- result+=MATCH_MASK;
+ result += MATCH_MASK;
result.append(reinterpret_cast<char *>(mask), 32);
}
- iter=end;
+ iter = end;
++iter;
return result;
}
+RegMatch Regex::match(const string &str) const
+{
+ RegMatch::GroupArray groups(n_groups);
+
+ for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
+ if(run(str, i, groups))
+ return RegMatch(str, groups);
+
+ return RegMatch();
+}
+
bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
{
- bool result=false;
+ bool result = false;
list<RunContext> ctx;
ctx.push_back(RunContext());
- ctx.front().citer=code.begin();
+ ctx.front().citer = code.begin();
ctx.front().groups.resize(groups.size());
for(string::const_iterator i=begin;;)
{
int c;
if(i!=str.end())
- c=static_cast<unsigned char>(*i);
+ c = static_cast<unsigned char>(*i);
else
- c=-1;
+ c = -1;
for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
{
- bool terminate=false;
- bool negate_match=false;
+ bool terminate = false;
+ bool negate_match = false;
for(; j->citer!=code.end();)
{
- Instruction instr=static_cast<Instruction>(*j->citer);
- if(instr>LAST_INSTRUCTION_)
- instr=MATCH_CHAR;
- else
- ++j->citer;
+ Instruction instr = static_cast<Instruction>(*j->citer++);
if(instr==NEGATE)
- negate_match=true;
+ negate_match = true;
else if(instr==JUMP)
{
- Offset offset=read_int<Offset>(j->citer);
- j->citer+=offset;
+ Offset offset = read_int<Offset>(j->citer);
+ j->citer += offset;
}
else if(instr==ND_JUMP)
{
- Offset offset=read_int<Offset>(j->citer);
+ Offset offset = read_int<Offset>(j->citer);
ctx.push_back(*j);
- ctx.back().citer+=offset;
+ ctx.back().citer += offset;
}
else if(instr==GROUP_BEGIN)
{
- Index n=read_int<Index>(j->citer);
+ Index n = read_int<Index>(j->citer);
if(!j->groups[n].match)
- j->groups[n].begin=i-str.begin();
+ j->groups[n].begin = i-str.begin();
}
else if(instr==GROUP_END)
{
- Index n=read_int<Index>(j->citer);
+ Index n = read_int<Index>(j->citer);
if(!j->groups[n].match)
{
- j->groups[n].match=true;
- j->groups[n].end=i-str.begin();
- j->groups[n].length=j->groups[n].end-j->groups[n].begin;
+ j->groups[n].match = true;
+ j->groups[n].end = i-str.begin();
+ j->groups[n].length = j->groups[n].end-j->groups[n].begin;
}
if(n==0)
{
- result=true;
- bool better=false;
+ result = true;
+ bool better = false;
for(unsigned k=0; (k<groups.size() && !better); ++k)
{
- better=group_compare(j->groups[k], groups[k]);
+ better = group_compare(j->groups[k], groups[k]);
if(group_compare(groups[k], j->groups[k]))
break;
}
if(better)
- groups=j->groups;
+ groups = j->groups;
}
}
else
{
- bool match_result=false;
- bool input_consumed=false;
+ bool match_result = false;
+ bool input_consumed = false;
if(instr==MATCH_BEGIN)
- match_result=(i==str.begin());
+ match_result = (i==str.begin());
else if(instr==MATCH_END)
- match_result=(i==str.end());
+ match_result = (i==str.end());
else if(instr==MATCH_CHAR)
{
- match_result=(c==*j->citer++);
- input_consumed=true;
+ match_result = (c==*j->citer++);
+ input_consumed = true;
}
else if(instr==MATCH_RANGE)
{
- unsigned char first=*j->citer++;
- unsigned char last=*j->citer++;
- match_result=(c>=first && c<=last);
- input_consumed=true;
+ unsigned char first = *j->citer++;
+ unsigned char last = *j->citer++;
+ match_result = (c>=first && c<=last);
+ input_consumed = true;
}
else if(instr==MATCH_MASK)
{
- uint8_t mask[32];
- for(unsigned k=0; k<32; ++k)
- mask[k]=*j->citer++;
- match_result=mask[c>>3]&(1<<(c&7));
- input_consumed=true;
+ if(c>=0 && c<=0xFF)
+ {
+ unsigned char m = *(j->citer+(c>>3));
+ match_result = m&(1<<(c&7));
+ }
+ input_consumed = true;
+ j->citer += 32;
}
else if(instr==MATCH_ANY)
{
- match_result=true;
- input_consumed=true;
+ match_result = true;
+ input_consumed = true;
}
else
throw Exception("Invalid instruction");
if(match_result==negate_match)
- terminate=true;
- negate_match=false;
+ terminate = true;
+ negate_match = false;
if(input_consumed || terminate)
break;
}
if(terminate || j->citer==code.end())
- j=ctx.erase(j);
+ j = ctx.erase(j);
else
++j;
}
return g1.end>g2.end;
}
+string Regex::disassemble() const
+{
+ ostringstream ss;
+
+ for(Code::const_iterator i=code.begin(); i!=code.end();)
+ {
+ Code::const_iterator j = i;
+ Offset offset = i-code.begin();
+ string decompiled = disassemble_instruction(i);
+ string bytes;
+ for(; j!=i; ++j)
+ bytes += format(" %02X", static_cast<int>(*j)&0xFF);
+ ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
+ if(bytes.size()>9)
+ ss<<"\n"<<Fmt("%15s");
+ ss<<" "<<decompiled<<'\n';
+ }
+
+ return ss.str();
+}
+
string Regex::disassemble_instruction(Code::const_iterator &i) const
{
- Instruction instr=static_cast<Instruction>(*i);
- if(instr>=LAST_INSTRUCTION_)
- instr=MATCH_CHAR;
- else
- ++i;
+ Instruction instr = static_cast<Instruction>(*i++);
ostringstream result;
switch(instr)
{
case JUMP:
{
- Offset offset=read_int<Offset>(i);
+ Offset offset = read_int<Offset>(i);
result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
}
break;
case ND_JUMP:
{
- Offset offset=read_int<Offset>(i);
+ Offset offset = read_int<Offset>(i);
result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
}
break;
break;
case MATCH_CHAR:
{
- char c=*i++;
+ char c = *i++;
result<<"MATCH_CHAR ";
if(c>=0x20 && c<=0x7E)
result<<'\''<<c<<'\'';
case MATCH_ANY:
result<<"MATCH_ANY";
break;
- case FIRST_INSTRUCTION_:
- case LAST_INSTRUCTION_:
+ default:
result<<"UNKNOWN "<<instr;
}