12 /** Writes an integer to a Regex code string, in little-endian order. */
14 void write_int(T n, basic_string<unsigned char> &code)
16 for(unsigned i=0; i<sizeof(T); ++i)
17 code += (n>>(i*8))&0xFF;
20 /** Reads an integer from a Regex code string, in little-endian order. */
22 T read_int(basic_string<unsigned char>::const_iterator &c)
25 for(unsigned i=0; i<sizeof(T); ++i)
26 result += (*c++)<<(i*8);
35 bad_regex::bad_regex(const string &w, const string &e, const string::const_iterator &i):
36 logic_error(w+"\n"+make_where(e, i))
39 string bad_regex::make_where(const string &e, const string::const_iterator &i)
42 string::size_type offset = i-e.begin();
45 result = e.substr(offset-40, 60);
49 result = e.substr(0, 60);
51 result.append(offset, ' ');
57 Regex::Regex(const string &expr)
59 auto iter = expr.begin();
60 code = compile(expr, iter, n_groups, false);
64 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
66 bool has_branches = false;
67 stack<string::const_iterator> parens;
70 string::const_iterator end;
71 for(end=iter; end!=expr.end(); ++end)
77 if(bracket==3 && *end==']')
79 else if(bracket==1 && *end=='^')
93 throw bad_regex("unmatched ')'", expr, end);
99 else if(*end=='|' && parens.empty())
111 throw bad_regex("unmatched '('", expr, parens.top());
115 unsigned this_group = group;
118 result += GROUP_BEGIN;
119 write_int<Index>(this_group, result);
122 const unsigned jump_size = 1+sizeof(Offset);
126 for(auto i=iter; i!=end;)
128 Code atom = parse_atom(expr, i, group);
130 Count repeat_min = 1;
131 Count repeat_max = 1;
133 parse_repeat(expr, i, repeat_min, repeat_max);
135 for(unsigned j=0; j<repeat_min; ++j)
137 if(repeat_max==numeric_limits<Count>::max())
142 write_int<Offset>(atom.size()+jump_size, result);
146 write_int<Offset>(-static_cast<Offset>(atom.size()+jump_size), result);
148 else if(repeat_max>repeat_min)
150 for(unsigned j=repeat_min; j<repeat_max; ++j)
153 write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
161 vector<Code> branches;
164 branches.push_back(compile(expr, i, group, true));
170 unsigned n_branches = branches.size();
172 Offset offset = (n_branches-1)*jump_size+branches.front().size();
173 for(auto i=++branches.begin(); i!=branches.end(); ++i)
176 write_int<Offset>(offset, result);
180 for(auto i=branches.begin(); i!=branches.end();)
183 offset -= i->size()+jump_size;
185 if(i!=branches.end())
188 write_int<Offset>(offset, result);
196 write_int<Index>(this_group, result);
204 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
215 throw bad_regex("stray backslash", expr, i-1);
221 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
222 throw bad_regex("invalid atom", expr, i);
224 return parse_brackets(expr, i);
228 result += MATCH_BEGIN;
234 result = compile(expr, ++i, group, false);
242 result += MATCH_CHAR;
251 bool Regex::parse_repeat(const string &expr, string::const_iterator &i, Count &rmin, Count &rmax)
253 if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
256 if(*i=='*' || *i=='+')
257 rmax = numeric_limits<Count>::max();
258 if(*i=='*' || *i=='?')
265 for(++i; isdigit(*i); ++i)
266 rmin = rmin*10+(*i-'0');
274 for(; isdigit(*i); ++i)
275 rmax = rmax*10+(*i-'0');
277 throw bad_regex("invalid bound", expr, begin);
280 rmax = numeric_limits<Count>::max();
285 throw bad_regex("invalid bound", expr, begin);
293 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
307 for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
309 throw bad_regex("unmatched '['", str, begin);
311 unsigned char mask[32] = {0};
314 unsigned char first = 0, last = 0;
315 for(auto i=iter; i!=end; ++i)
317 unsigned char c = *i;
321 for(unsigned j=first; j<=c; ++j)
322 mask[j>>3] |= 1<<(j&7);
327 else if(c=='-' && i!=iter && end-i>1)
332 mask[c>>3] |= 1<<(c&7);
345 result += MATCH_CHAR;
350 result += MATCH_RANGE;
356 result += MATCH_MASK;
357 result.append(mask, 32);
366 RegMatch Regex::match(const string &str) const
368 vector<RegMatch::Group> groups(n_groups);
370 for(auto i=str.begin(); i!=str.end(); ++i)
371 if(run(str, i, groups))
372 return RegMatch(str, groups);
377 bool Regex::run(const string &str, const string::const_iterator &begin, vector<RegMatch::Group> &groups) const
380 list<RunContext> ctx;
381 ctx.push_back(RunContext());
382 ctx.front().citer = code.begin();
383 ctx.front().groups.resize(groups.size());
393 for(auto j=ctx.begin(); j!=ctx.end();)
395 bool terminate = false;
396 bool negate_match = false;
397 for(; j->citer!=code.end();)
399 Instruction instr = static_cast<Instruction>(*j->citer++);
405 Offset offset = read_int<Offset>(j->citer);
408 else if(instr==ND_JUMP)
410 Offset offset = read_int<Offset>(j->citer);
412 ctx.back().citer += offset;
414 else if(instr==GROUP_BEGIN)
416 Index n = read_int<Index>(j->citer);
417 if(!j->groups[n].match)
418 j->groups[n].begin = i-str.begin();
420 else if(instr==GROUP_END)
422 Index n = read_int<Index>(j->citer);
423 if(!j->groups[n].match)
425 j->groups[n].match = true;
426 j->groups[n].end = i-str.begin();
427 j->groups[n].length = j->groups[n].end-j->groups[n].begin;
434 for(unsigned k=0; (k<groups.size() && !better); ++k)
436 better = group_compare(j->groups[k], groups[k]);
437 if(group_compare(groups[k], j->groups[k]))
446 bool match_result = false;
447 bool input_consumed = false;
448 if(instr==MATCH_BEGIN)
449 match_result = (i==str.begin());
450 else if(instr==MATCH_END)
451 match_result = (i==str.end());
452 else if(instr==MATCH_CHAR)
454 match_result = (c==*j->citer++);
455 input_consumed = true;
457 else if(instr==MATCH_RANGE)
459 unsigned char first = *j->citer++;
460 unsigned char last = *j->citer++;
461 match_result = (c>=first && c<=last);
462 input_consumed = true;
464 else if(instr==MATCH_MASK)
468 unsigned char m = *(j->citer+(c>>3));
469 match_result = m&(1<<(c&7));
471 input_consumed = true;
474 else if(instr==MATCH_ANY)
477 input_consumed = true;
480 throw logic_error("invalid instruction in regex bytecode");
482 if(match_result==negate_match)
484 negate_match = false;
486 if(input_consumed || terminate)
491 if(terminate || j->citer==code.end())
497 if(i==str.end() || ctx.empty())
505 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
510 // Any match is better than no match
514 // Earlier match is better
515 if(g1.begin<g2.begin)
517 if(g1.begin>g2.begin)
520 // Longer match at same position is better
521 return g1.end>g2.end;
524 string Regex::disassemble() const
528 for(auto i=code.begin(); i!=code.end();)
531 Offset offset = i-code.begin();
532 string decompiled = disassemble_instruction(i);
535 bytes += format(" %02X", *j);
536 result += format("%3d:%-9s ", offset, bytes);
539 result += decompiled;
546 string Regex::disassemble_instruction(Code::const_iterator &i) const
548 Instruction instr = static_cast<Instruction>(*i++);
554 Offset offset = read_int<Offset>(i);
555 return format("JUMP %+d (%d)", offset, (i-code.begin())+offset);
559 Offset offset = read_int<Offset>(i);
560 return format("ND_JUMP %+d (%d)", offset, (i-code.begin())+offset);
563 return format("GROUP_BEGIN %d", read_int<Index>(i));
565 return format("GROUP_END %d", read_int<Index>(i));
569 return "MATCH_BEGIN";
574 unsigned char c = *i++;
575 if(c>=0x20 && c<=0x7E)
576 return format("MATCH_CHAR '%c'", c);
578 return format("MATCH_CHAR %d", c);
585 return format("MATCH_RANGE %d-%d", begin, end);
589 string result = "MATCH_MASK";
590 for(unsigned j=0; j<32; ++j)
591 result += format(" %02X", *i++);
597 return format("UNKNOWN %d", instr);