3 #include <msp/core/except.h>
11 /** Writes an integer to a Regex code string, in little-endian order. */
13 void write_int(T n, Msp::Regex::Code &code)
15 for(unsigned i=0; i<sizeof(T); ++i)
16 code += (n>>i*8)&0xFF;
19 /** Reads an integer from a Regex code stream, in little-endian order. */
21 T read_int(Msp::Regex::Code::const_iterator &c)
24 for(unsigned i=0; i<sizeof(T); ++i)
25 result += static_cast<unsigned char>(*c++)<<i*8;
34 Regex::Regex(const string &expr)
37 string::const_iterator iter = expr.begin();
38 code = compile(expr, iter, n_groups, false);
42 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
44 bool has_branches = false;
48 string::const_iterator end;
49 for(end=iter; end!=expr.end(); ++end)
55 if(bracket==3 && *end==']')
57 else if(bracket==1 && *end=='^')
71 throw InvalidParameterValue("Unexpected )");
77 else if(*end=='|' && level==0)
89 throw InvalidParameterValue("Unmatched (");
93 unsigned this_group = group;
96 result += GROUP_BEGIN;
97 write_int<Index>(this_group, result);
100 const unsigned jump_size = 1+sizeof(Offset);
104 for(string::const_iterator i=iter; i!=end;)
106 Code atom = parse_atom(expr, i, group);
108 Count repeat_min = 1;
109 Count repeat_max = 1;
110 parse_repeat(i, repeat_min, repeat_max);
112 for(unsigned j=0; j<repeat_min; ++j)
114 if(repeat_max==numeric_limits<Count>::max())
119 write_int<Offset>(atom.size()+jump_size, result);
123 write_int<Offset>(-(atom.size()+jump_size), result);
125 else if(repeat_max>repeat_min)
127 for(unsigned j=repeat_min; j<repeat_max; ++j)
130 write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
139 for(string::const_iterator i=iter;;)
141 branches.push_back(compile(expr, i, group, true));
147 unsigned n_branches = branches.size();
149 Offset offset = (n_branches-1)*jump_size+branches.front().size();
150 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
153 write_int<Offset>(offset, result);
157 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
160 offset -= i->size()+jump_size;
162 if(i!=branches.end())
165 write_int<Offset>(offset, result);
173 write_int<Index>(this_group, result);
181 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
192 throw InvalidParameterValue("Stray backslash");
198 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
199 throw InvalidParameterValue("Invalid atom");
201 return parse_brackets(expr, i);
205 result += MATCH_BEGIN;
211 result = compile(expr, ++i, group, false);
219 result += MATCH_CHAR;
228 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
230 if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
233 if(*i=='*' || *i=='+')
234 rmax = numeric_limits<Count>::max();
235 if(*i=='*' || *i=='?')
240 for(++i; isdigit(*i); ++i)
241 rmin = rmin*10+(*i-'0');
249 for(; isdigit(*i); ++i)
250 rmax = rmax*10+(*i-'0');
252 throw InvalidParameterValue("Invalid bound");
255 rmax = numeric_limits<Count>::max();
260 throw InvalidParameterValue("Invalid bound");
268 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
280 string::const_iterator end = iter;
281 for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
283 throw InvalidParameterValue("Unmatched '['");
285 unsigned char mask[32] = {0};
288 unsigned char first=0, last = 0;
289 for(string::const_iterator i=iter; i!=end; ++i)
291 unsigned char c = *i;
295 for(unsigned j=first; j<=c; ++j)
296 mask[j>>3] |= 1<<(j&7);
301 else if(c=='-' && i!=iter && end-i>1)
306 mask[c>>3] |= 1<<(c&7);
319 result += MATCH_CHAR;
324 result += MATCH_RANGE;
330 result += MATCH_MASK;
331 result.append(reinterpret_cast<char *>(mask), 32);
340 RegMatch Regex::match(const string &str) const
342 RegMatch::GroupArray groups(n_groups);
344 for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
345 if(run(str, i, groups))
346 return RegMatch(str, groups);
351 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
354 list<RunContext> ctx;
355 ctx.push_back(RunContext());
356 ctx.front().citer = code.begin();
357 ctx.front().groups.resize(groups.size());
359 for(string::const_iterator i=begin;;)
363 c = static_cast<unsigned char>(*i);
367 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
369 bool terminate = false;
370 bool negate_match = false;
371 for(; j->citer!=code.end();)
373 Instruction instr = static_cast<Instruction>(*j->citer++);
379 Offset offset = read_int<Offset>(j->citer);
382 else if(instr==ND_JUMP)
384 Offset offset = read_int<Offset>(j->citer);
386 ctx.back().citer += offset;
388 else if(instr==GROUP_BEGIN)
390 Index n = read_int<Index>(j->citer);
391 if(!j->groups[n].match)
392 j->groups[n].begin = i-str.begin();
394 else if(instr==GROUP_END)
396 Index n = read_int<Index>(j->citer);
397 if(!j->groups[n].match)
399 j->groups[n].match = true;
400 j->groups[n].end = i-str.begin();
401 j->groups[n].length = j->groups[n].end-j->groups[n].begin;
408 for(unsigned k=0; (k<groups.size() && !better); ++k)
410 better = group_compare(j->groups[k], groups[k]);
411 if(group_compare(groups[k], j->groups[k]))
420 bool match_result = false;
421 bool input_consumed = false;
422 if(instr==MATCH_BEGIN)
423 match_result = (i==str.begin());
424 else if(instr==MATCH_END)
425 match_result = (i==str.end());
426 else if(instr==MATCH_CHAR)
428 match_result = (c==*j->citer++);
429 input_consumed = true;
431 else if(instr==MATCH_RANGE)
433 unsigned char first = *j->citer++;
434 unsigned char last = *j->citer++;
435 match_result = (c>=first && c<=last);
436 input_consumed = true;
438 else if(instr==MATCH_MASK)
442 unsigned char m = *(j->citer+(c>>3));
443 match_result = m&(1<<(c&7));
445 input_consumed = true;
448 else if(instr==MATCH_ANY)
451 input_consumed = true;
454 throw Exception("Invalid instruction");
456 if(match_result==negate_match)
458 negate_match = false;
460 if(input_consumed || terminate)
465 if(terminate || j->citer==code.end())
471 if(i==str.end() || ctx.empty())
479 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
484 // Any match is better than no match
488 // Earlier match is better
489 if(g1.begin<g2.begin)
491 if(g2.begin>g2.begin)
494 // Longer match at same position is better
495 return g1.end>g2.end;
498 string Regex::disassemble() const
502 for(Code::const_iterator i=code.begin(); i!=code.end();)
504 Code::const_iterator j = i;
505 Offset offset = i-code.begin();
506 string decompiled = disassemble_instruction(i);
509 bytes += format(" %02X", static_cast<int>(*j)&0xFF);
510 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
512 ss<<"\n"<<Fmt("%15s");
513 ss<<" "<<decompiled<<'\n';
519 string Regex::disassemble_instruction(Code::const_iterator &i) const
521 Instruction instr = static_cast<Instruction>(*i++);
523 ostringstream result;
528 Offset offset = read_int<Offset>(i);
529 result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
534 Offset offset = read_int<Offset>(i);
535 result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
539 result<<"GROUP_BEGIN "<<read_int<Index>(i);
542 result<<"GROUP_END "<<read_int<Index>(i);
548 result<<"MATCH_BEGIN";
556 result<<"MATCH_CHAR ";
557 if(c>=0x20 && c<=0x7E)
558 result<<'\''<<c<<'\'';
560 result<<(static_cast<int>(c)&0xFF);
564 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
565 result<<'-'<<(static_cast<int>(*i++)&0xFF);
568 result<<"MATCH_MASK";
569 for(unsigned j=0; j<32; ++j)
570 result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
576 result<<"UNKNOWN "<<instr;