5 #include <msp/core/except.h>
13 /** Writes an integer to a Regex code string, in little-endian order. */
15 void write_int(T n, basic_string<unsigned char> &code)
17 for(unsigned i=0; i<sizeof(T); ++i)
18 code += (n>>(i*8))&0xFF;
21 /** Reads an integer from a Regex code string, in little-endian order. */
23 T read_int(basic_string<unsigned char>::const_iterator &c)
26 for(unsigned i=0; i<sizeof(T); ++i)
27 result += (*c++)<<(i*8);
36 bad_regex::bad_regex(const string &w, const string &e, const string::const_iterator &i):
37 logic_error(w+"\n"+make_where(e, i))
40 string bad_regex::make_where(const string &e, const string::const_iterator &i)
43 string::size_type offset = i-e.begin();
46 result = e.substr(offset-40, 60);
50 result = e.substr(0, 60);
52 result.append(offset, ' ');
58 Regex::Regex(const string &expr)
60 auto iter = expr.begin();
61 code = compile(expr, iter, n_groups, false);
65 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
67 bool has_branches = false;
68 stack<string::const_iterator> parens;
71 string::const_iterator end;
72 for(end=iter; end!=expr.end(); ++end)
78 if(bracket==3 && *end==']')
80 else if(bracket==1 && *end=='^')
94 throw bad_regex("unmatched ')'", expr, end);
100 else if(*end=='|' && parens.empty())
112 throw bad_regex("unmatched '('", expr, parens.top());
116 unsigned this_group = group;
119 result += GROUP_BEGIN;
120 write_int<Index>(this_group, result);
123 const unsigned jump_size = 1+sizeof(Offset);
127 for(auto i=iter; i!=end;)
129 Code atom = parse_atom(expr, i, group);
131 Count repeat_min = 1;
132 Count repeat_max = 1;
134 parse_repeat(expr, i, repeat_min, repeat_max);
136 for(unsigned j=0; j<repeat_min; ++j)
138 if(repeat_max==numeric_limits<Count>::max())
143 write_int<Offset>(atom.size()+jump_size, result);
147 write_int<Offset>(-static_cast<Offset>(atom.size()+jump_size), result);
149 else if(repeat_max>repeat_min)
151 for(unsigned j=repeat_min; j<repeat_max; ++j)
154 write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
162 vector<Code> branches;
165 branches.push_back(compile(expr, i, group, true));
171 unsigned n_branches = branches.size();
173 Offset offset = (n_branches-1)*jump_size+branches.front().size();
174 for(auto i=++branches.begin(); i!=branches.end(); ++i)
177 write_int<Offset>(offset, result);
181 for(auto i=branches.begin(); i!=branches.end();)
184 offset -= i->size()+jump_size;
186 if(i!=branches.end())
189 write_int<Offset>(offset, result);
197 write_int<Index>(this_group, result);
205 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
216 throw bad_regex("stray backslash", expr, i-1);
222 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
223 throw bad_regex("invalid atom", expr, i);
225 return parse_brackets(expr, i);
229 result += MATCH_BEGIN;
235 result = compile(expr, ++i, group, false);
243 result += MATCH_CHAR;
252 bool Regex::parse_repeat(const string &expr, string::const_iterator &i, Count &rmin, Count &rmax)
254 if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
257 if(*i=='*' || *i=='+')
258 rmax = numeric_limits<Count>::max();
259 if(*i=='*' || *i=='?')
266 for(++i; isdigit(*i); ++i)
267 rmin = rmin*10+(*i-'0');
275 for(; isdigit(*i); ++i)
276 rmax = rmax*10+(*i-'0');
278 throw bad_regex("invalid bound", expr, begin);
281 rmax = numeric_limits<Count>::max();
286 throw bad_regex("invalid bound", expr, begin);
294 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
308 for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
310 throw bad_regex("unmatched '['", str, begin);
312 unsigned char mask[32] = {0};
315 unsigned char first = 0, last = 0;
316 for(auto i=iter; i!=end; ++i)
318 unsigned char c = *i;
322 for(unsigned j=first; j<=c; ++j)
323 mask[j>>3] |= 1<<(j&7);
328 else if(c=='-' && i!=iter && end-i>1)
333 mask[c>>3] |= 1<<(c&7);
346 result += MATCH_CHAR;
351 result += MATCH_RANGE;
357 result += MATCH_MASK;
358 result.append(mask, 32);
367 RegMatch Regex::match(const string &str) const
369 vector<RegMatch::Group> groups(n_groups);
371 for(auto i=str.begin(); i!=str.end(); ++i)
372 if(run(str, i, groups))
373 return RegMatch(str, groups);
378 bool Regex::run(const string &str, const string::const_iterator &begin, vector<RegMatch::Group> &groups) const
381 list<RunContext> ctx;
382 ctx.push_back(RunContext());
383 ctx.front().citer = code.begin();
384 ctx.front().groups.resize(groups.size());
394 for(auto j=ctx.begin(); j!=ctx.end();)
396 bool terminate = false;
397 bool negate_match = false;
398 for(; j->citer!=code.end();)
400 Instruction instr = static_cast<Instruction>(*j->citer++);
406 Offset offset = read_int<Offset>(j->citer);
409 else if(instr==ND_JUMP)
411 Offset offset = read_int<Offset>(j->citer);
413 ctx.back().citer += offset;
415 else if(instr==GROUP_BEGIN)
417 Index n = read_int<Index>(j->citer);
418 if(!j->groups[n].match)
419 j->groups[n].begin = i-str.begin();
421 else if(instr==GROUP_END)
423 Index n = read_int<Index>(j->citer);
424 if(!j->groups[n].match)
426 j->groups[n].match = true;
427 j->groups[n].end = i-str.begin();
428 j->groups[n].length = j->groups[n].end-j->groups[n].begin;
435 for(unsigned k=0; (k<groups.size() && !better); ++k)
437 better = group_compare(j->groups[k], groups[k]);
438 if(group_compare(groups[k], j->groups[k]))
447 bool match_result = false;
448 bool input_consumed = false;
449 if(instr==MATCH_BEGIN)
450 match_result = (i==str.begin());
451 else if(instr==MATCH_END)
452 match_result = (i==str.end());
453 else if(instr==MATCH_CHAR)
455 match_result = (c==*j->citer++);
456 input_consumed = true;
458 else if(instr==MATCH_RANGE)
460 unsigned char first = *j->citer++;
461 unsigned char last = *j->citer++;
462 match_result = (c>=first && c<=last);
463 input_consumed = true;
465 else if(instr==MATCH_MASK)
469 unsigned char m = *(j->citer+(c>>3));
470 match_result = m&(1<<(c&7));
472 input_consumed = true;
475 else if(instr==MATCH_ANY)
478 input_consumed = true;
481 throw internal_error("invalid instruction in regex bytecode");
483 if(match_result==negate_match)
485 negate_match = false;
487 if(input_consumed || terminate)
492 if(terminate || j->citer==code.end())
498 if(i==str.end() || ctx.empty())
506 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
511 // Any match is better than no match
515 // Earlier match is better
516 if(g1.begin<g2.begin)
518 if(g1.begin>g2.begin)
521 // Longer match at same position is better
522 return g1.end>g2.end;
525 string Regex::disassemble() const
529 for(auto i=code.begin(); i!=code.end();)
532 Offset offset = i-code.begin();
533 string decompiled = disassemble_instruction(i);
536 bytes += format(" %02X", *j);
537 result += format("%3d:%-9s ", offset, bytes);
540 result += decompiled;
547 string Regex::disassemble_instruction(Code::const_iterator &i) const
549 Instruction instr = static_cast<Instruction>(*i++);
555 Offset offset = read_int<Offset>(i);
556 return format("JUMP %+d (%d)", offset, (i-code.begin())+offset);
560 Offset offset = read_int<Offset>(i);
561 return format("ND_JUMP %+d (%d)", offset, (i-code.begin())+offset);
564 return format("GROUP_BEGIN %d", read_int<Index>(i));
566 return format("GROUP_END %d", read_int<Index>(i));
570 return "MATCH_BEGIN";
575 unsigned char c = *i++;
576 if(c>=0x20 && c<=0x7E)
577 return format("MATCH_CHAR '%c'", c);
579 return format("MATCH_CHAR %d", c);
586 return format("MATCH_RANGE %d-%d", begin, end);
590 string result = "MATCH_MASK";
591 for(unsigned j=0; j<32; ++j)
592 result += format(" %02X", *i++);
598 return format("UNKNOWN %d", instr);