11 /** Writes an integer to a Regex code string, in little-endian order. */
13 void write_int(T n, basic_string<unsigned char> &code)
15 for(unsigned i=0; i<sizeof(T); ++i)
16 code += (n>>(i*8))&0xFF;
19 /** Reads an integer from a Regex code string, in little-endian order. */
21 T read_int(basic_string<unsigned char>::const_iterator &c)
24 for(unsigned i=0; i<sizeof(T); ++i)
25 result += (*c++)<<(i*8);
34 bad_regex::bad_regex(const string &w, const string &e, const string::const_iterator &i):
35 logic_error(w+"\n"+make_where(e, i))
38 string bad_regex::make_where(const string &e, const string::const_iterator &i)
41 string::size_type offset = i-e.begin();
44 result = e.substr(offset-40, 60);
48 result = e.substr(0, 60);
50 result.append(offset, ' ');
56 Regex::Regex(const string &expr)
59 string::const_iterator iter = expr.begin();
60 code = compile(expr, iter, n_groups, false);
64 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
66 bool has_branches = false;
67 stack<string::const_iterator> parens;
70 string::const_iterator end;
71 for(end=iter; end!=expr.end(); ++end)
77 if(bracket==3 && *end==']')
79 else if(bracket==1 && *end=='^')
93 throw bad_regex("unmatched ')'", expr, end);
99 else if(*end=='|' && parens.empty())
111 throw bad_regex("unmatched '('", expr, parens.top());
115 unsigned this_group = group;
118 result += GROUP_BEGIN;
119 write_int<Index>(this_group, result);
122 const unsigned jump_size = 1+sizeof(Offset);
126 for(string::const_iterator i=iter; i!=end;)
128 Code atom = parse_atom(expr, i, group);
130 Count repeat_min = 1;
131 Count repeat_max = 1;
132 parse_repeat(expr, i, repeat_min, repeat_max);
134 for(unsigned j=0; j<repeat_min; ++j)
136 if(repeat_max==numeric_limits<Count>::max())
141 write_int<Offset>(atom.size()+jump_size, result);
145 write_int<Offset>(-(atom.size()+jump_size), result);
147 else if(repeat_max>repeat_min)
149 for(unsigned j=repeat_min; j<repeat_max; ++j)
152 write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
161 for(string::const_iterator i=iter;;)
163 branches.push_back(compile(expr, i, group, true));
169 unsigned n_branches = branches.size();
171 Offset offset = (n_branches-1)*jump_size+branches.front().size();
172 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
175 write_int<Offset>(offset, result);
179 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
182 offset -= i->size()+jump_size;
184 if(i!=branches.end())
187 write_int<Offset>(offset, result);
195 write_int<Index>(this_group, result);
203 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
214 throw bad_regex("stray backslash", expr, i-1);
220 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
221 throw bad_regex("invalid atom", expr, i);
223 return parse_brackets(expr, i);
227 result += MATCH_BEGIN;
233 result = compile(expr, ++i, group, false);
241 result += MATCH_CHAR;
250 bool Regex::parse_repeat(const string &expr, string::const_iterator &i, Count &rmin, Count &rmax)
252 if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
255 if(*i=='*' || *i=='+')
256 rmax = numeric_limits<Count>::max();
257 if(*i=='*' || *i=='?')
261 string::const_iterator begin = i;
264 for(++i; isdigit(*i); ++i)
265 rmin = rmin*10+(*i-'0');
273 for(; isdigit(*i); ++i)
274 rmax = rmax*10+(*i-'0');
276 throw bad_regex("invalid bound", expr, begin);
279 rmax = numeric_limits<Count>::max();
284 throw bad_regex("invalid bound", expr, begin);
292 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
294 string::const_iterator begin = iter;
305 string::const_iterator end = iter;
306 for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
308 throw bad_regex("unmatched '['", str, begin);
310 unsigned char mask[32] = {0};
313 unsigned char first = 0, last = 0;
314 for(string::const_iterator i=iter; i!=end; ++i)
316 unsigned char c = *i;
320 for(unsigned j=first; j<=c; ++j)
321 mask[j>>3] |= 1<<(j&7);
326 else if(c=='-' && i!=iter && end-i>1)
331 mask[c>>3] |= 1<<(c&7);
344 result += MATCH_CHAR;
349 result += MATCH_RANGE;
355 result += MATCH_MASK;
356 result.append(mask, 32);
365 RegMatch Regex::match(const string &str) const
367 RegMatch::GroupArray groups(n_groups);
369 for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
370 if(run(str, i, groups))
371 return RegMatch(str, groups);
376 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
379 list<RunContext> ctx;
380 ctx.push_back(RunContext());
381 ctx.front().citer = code.begin();
382 ctx.front().groups.resize(groups.size());
384 for(string::const_iterator i=begin;;)
392 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
394 bool terminate = false;
395 bool negate_match = false;
396 for(; j->citer!=code.end();)
398 Instruction instr = static_cast<Instruction>(*j->citer++);
404 Offset offset = read_int<Offset>(j->citer);
407 else if(instr==ND_JUMP)
409 Offset offset = read_int<Offset>(j->citer);
411 ctx.back().citer += offset;
413 else if(instr==GROUP_BEGIN)
415 Index n = read_int<Index>(j->citer);
416 if(!j->groups[n].match)
417 j->groups[n].begin = i-str.begin();
419 else if(instr==GROUP_END)
421 Index n = read_int<Index>(j->citer);
422 if(!j->groups[n].match)
424 j->groups[n].match = true;
425 j->groups[n].end = i-str.begin();
426 j->groups[n].length = j->groups[n].end-j->groups[n].begin;
433 for(unsigned k=0; (k<groups.size() && !better); ++k)
435 better = group_compare(j->groups[k], groups[k]);
436 if(group_compare(groups[k], j->groups[k]))
445 bool match_result = false;
446 bool input_consumed = false;
447 if(instr==MATCH_BEGIN)
448 match_result = (i==str.begin());
449 else if(instr==MATCH_END)
450 match_result = (i==str.end());
451 else if(instr==MATCH_CHAR)
453 match_result = (c==*j->citer++);
454 input_consumed = true;
456 else if(instr==MATCH_RANGE)
458 unsigned char first = *j->citer++;
459 unsigned char last = *j->citer++;
460 match_result = (c>=first && c<=last);
461 input_consumed = true;
463 else if(instr==MATCH_MASK)
467 unsigned char m = *(j->citer+(c>>3));
468 match_result = m&(1<<(c&7));
470 input_consumed = true;
473 else if(instr==MATCH_ANY)
476 input_consumed = true;
479 throw logic_error("invalid instruction in regex bytecode");
481 if(match_result==negate_match)
483 negate_match = false;
485 if(input_consumed || terminate)
490 if(terminate || j->citer==code.end())
496 if(i==str.end() || ctx.empty())
504 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
509 // Any match is better than no match
513 // Earlier match is better
514 if(g1.begin<g2.begin)
516 if(g1.begin>g2.begin)
519 // Longer match at same position is better
520 return g1.end>g2.end;
523 string Regex::disassemble() const
527 for(Code::const_iterator i=code.begin(); i!=code.end();)
529 Code::const_iterator j = i;
530 Offset offset = i-code.begin();
531 string decompiled = disassemble_instruction(i);
534 bytes += format(" %02X", *j);
535 result += format("%3d:%-9s ", offset, bytes);
538 result += decompiled;
545 string Regex::disassemble_instruction(Code::const_iterator &i) const
547 Instruction instr = static_cast<Instruction>(*i++);
553 Offset offset = read_int<Offset>(i);
554 return format("JUMP %+d (%d)", offset, (i-code.begin())+offset);
558 Offset offset = read_int<Offset>(i);
559 return format("ND_JUMP %+d (%d)", offset, (i-code.begin())+offset);
562 return format("GROUP_BEGIN %d", read_int<Index>(i));
564 return format("GROUP_END %d", read_int<Index>(i));
568 return "MATCH_BEGIN";
573 unsigned char c = *i++;
574 if(c>=0x20 && c<=0x7E)
575 return format("MATCH_CHAR '%c'", c);
577 return format("MATCH_CHAR %d", c);
584 return format("MATCH_RANGE %d-%d", begin, end);
588 string result = "MATCH_MASK";
589 for(unsigned j=0; j<32; ++j)
590 result += format(" %02X", *i++);
596 return format("UNKNOWN %d", instr);