3 This file is part of libmspstrings
4 Copyright © 2007 Mikko Rasa
5 Distributed under the LGPL
10 #include <msp/core/except.h>
18 /** Writes an integer to a Regex code string, in little-endian order. */
20 void write_int(T n, Msp::Regex::Code &code)
22 for(unsigned i=0; i<sizeof(T); ++i)
23 code += (n>>i*8)&0xFF;
26 /** Reads an integer from a Regex code stream, in little-endian order. */
28 T read_int(Msp::Regex::Code::const_iterator &c)
31 for(unsigned i=0; i<sizeof(T); ++i)
32 result += static_cast<unsigned char>(*c++)<<i*8;
41 Regex::Regex(const string &expr)
44 string::const_iterator iter = expr.begin();
45 code = compile(expr, iter, n_groups, false);
49 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
51 bool has_branches = false;
55 string::const_iterator end;
56 for(end=iter; end!=expr.end(); ++end)
62 if(bracket==3 && *end==']')
64 else if(bracket==1 && *end=='^')
78 throw InvalidParameterValue("Unexpected )");
84 else if(*end=='|' && level==0)
96 throw InvalidParameterValue("Unmatched (");
100 unsigned this_group = group;
103 result += GROUP_BEGIN;
104 write_int<Index>(this_group, result);
107 const unsigned jump_size = 1+sizeof(Offset);
111 for(string::const_iterator i=iter; i!=end;)
113 Code atom = parse_atom(expr, i, group);
115 Count repeat_min = 1;
116 Count repeat_max = 1;
117 parse_repeat(i, repeat_min, repeat_max);
119 for(unsigned j=0; j<repeat_min; ++j)
121 if(repeat_max==numeric_limits<Count>::max())
126 write_int<Offset>(atom.size()+jump_size, result);
130 write_int<Offset>(-(atom.size()+jump_size), result);
132 else if(repeat_max>repeat_min)
134 for(unsigned j=repeat_min; j<repeat_max; ++j)
137 write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
146 for(string::const_iterator i=iter;;)
148 branches.push_back(compile(expr, i, group, true));
154 unsigned n_branches = branches.size();
156 Offset offset = (n_branches-1)*jump_size+branches.front().size();
157 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
160 write_int<Offset>(offset, result);
164 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
167 offset -= i->size()+jump_size;
169 if(i!=branches.end())
172 write_int<Offset>(offset, result);
180 write_int<Index>(this_group, result);
188 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
199 throw InvalidParameterValue("Stray backslash");
205 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
206 throw InvalidParameterValue("Invalid atom");
208 return parse_brackets(expr, i);
212 result += MATCH_BEGIN;
218 result = compile(expr, ++i, group, false);
226 result += MATCH_CHAR;
235 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
237 if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
240 if(*i=='*' || *i=='+')
241 rmax = numeric_limits<Count>::max();
242 if(*i=='*' || *i=='?')
247 for(++i; isdigit(*i); ++i)
248 rmin = rmin*10+(*i-'0');
256 for(; isdigit(*i); ++i)
257 rmax = rmax*10+(*i-'0');
259 throw InvalidParameterValue("Invalid bound");
262 rmax = numeric_limits<Count>::max();
267 throw InvalidParameterValue("Invalid bound");
275 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
287 string::const_iterator end = iter;
288 for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
290 throw InvalidParameterValue("Unmatched '['");
292 unsigned char mask[32] = {0};
295 unsigned char first=0, last = 0;
296 for(string::const_iterator i=iter; i!=end; ++i)
298 unsigned char c = *i;
302 for(unsigned j=first; j<=c; ++j)
303 mask[j>>3] |= 1<<(j&7);
308 else if(c=='-' && i!=iter && end-i>1)
313 mask[c>>3] |= 1<<(c&7);
326 result += MATCH_CHAR;
331 result += MATCH_RANGE;
337 result += MATCH_MASK;
338 result.append(reinterpret_cast<char *>(mask), 32);
347 RegMatch Regex::match(const string &str) const
349 RegMatch::GroupArray groups(n_groups);
351 for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
352 if(run(str, i, groups))
353 return RegMatch(str, groups);
358 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
361 list<RunContext> ctx;
362 ctx.push_back(RunContext());
363 ctx.front().citer = code.begin();
364 ctx.front().groups.resize(groups.size());
366 for(string::const_iterator i=begin;;)
370 c = static_cast<unsigned char>(*i);
374 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
376 bool terminate = false;
377 bool negate_match = false;
378 for(; j->citer!=code.end();)
380 Instruction instr = static_cast<Instruction>(*j->citer++);
386 Offset offset = read_int<Offset>(j->citer);
389 else if(instr==ND_JUMP)
391 Offset offset = read_int<Offset>(j->citer);
393 ctx.back().citer += offset;
395 else if(instr==GROUP_BEGIN)
397 Index n = read_int<Index>(j->citer);
398 if(!j->groups[n].match)
399 j->groups[n].begin = i-str.begin();
401 else if(instr==GROUP_END)
403 Index n = read_int<Index>(j->citer);
404 if(!j->groups[n].match)
406 j->groups[n].match = true;
407 j->groups[n].end = i-str.begin();
408 j->groups[n].length = j->groups[n].end-j->groups[n].begin;
415 for(unsigned k=0; (k<groups.size() && !better); ++k)
417 better = group_compare(j->groups[k], groups[k]);
418 if(group_compare(groups[k], j->groups[k]))
427 bool match_result = false;
428 bool input_consumed = false;
429 if(instr==MATCH_BEGIN)
430 match_result = (i==str.begin());
431 else if(instr==MATCH_END)
432 match_result = (i==str.end());
433 else if(instr==MATCH_CHAR)
435 match_result = (c==*j->citer++);
436 input_consumed = true;
438 else if(instr==MATCH_RANGE)
440 unsigned char first = *j->citer++;
441 unsigned char last = *j->citer++;
442 match_result = (c>=first && c<=last);
443 input_consumed = true;
445 else if(instr==MATCH_MASK)
449 unsigned char m = *(j->citer+(c>>3));
450 match_result = m&(1<<(c&7));
452 input_consumed = true;
455 else if(instr==MATCH_ANY)
458 input_consumed = true;
461 throw Exception("Invalid instruction");
463 if(match_result==negate_match)
465 negate_match = false;
467 if(input_consumed || terminate)
472 if(terminate || j->citer==code.end())
478 if(i==str.end() || ctx.empty())
486 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
491 // Any match is better than no match
495 // Earlier match is better
496 if(g1.begin<g2.begin)
498 if(g2.begin>g2.begin)
501 // Longer match at same position is better
502 return g1.end>g2.end;
505 string Regex::disassemble() const
509 for(Code::const_iterator i=code.begin(); i!=code.end();)
511 Code::const_iterator j = i;
512 Offset offset = i-code.begin();
513 string decompiled = disassemble_instruction(i);
516 bytes += format(" %02X", static_cast<int>(*j)&0xFF);
517 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
519 ss<<"\n"<<Fmt("%15s");
520 ss<<" "<<decompiled<<'\n';
526 string Regex::disassemble_instruction(Code::const_iterator &i) const
528 Instruction instr = static_cast<Instruction>(*i++);
530 ostringstream result;
535 Offset offset = read_int<Offset>(i);
536 result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
541 Offset offset = read_int<Offset>(i);
542 result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
546 result<<"GROUP_BEGIN "<<read_int<Index>(i);
549 result<<"GROUP_END "<<read_int<Index>(i);
555 result<<"MATCH_BEGIN";
563 result<<"MATCH_CHAR ";
564 if(c>=0x20 && c<=0x7E)
565 result<<'\''<<c<<'\'';
567 result<<(static_cast<int>(c)&0xFF);
571 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
572 result<<'-'<<(static_cast<int>(*i++)&0xFF);
575 result<<"MATCH_MASK";
576 for(unsigned j=0; j<32; ++j)
577 result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
583 result<<"UNKNOWN "<<instr;