3 This file is part of libmspstrings
4 Copyright © 2007 Mikko Rasa
5 Distributed under the LGPL
8 #include <msp/core/error.h>
17 Writes an integer to a Regex code string, in little-endian order.
20 void write_int(T n, Msp::Regex::Code &code)
22 for(unsigned i=0; i<sizeof(T); ++i)
27 Reads an integer from a Regex code stream, in little-endian order.
30 T read_int(Msp::Regex::Code::const_iterator &c)
33 for(unsigned i=0; i<sizeof(T); ++i)
34 result+=static_cast<unsigned char>(*c++)<<i*8;
42 Regex::Regex(const string &expr)
45 string::const_iterator iter=expr.begin();
46 code=compile(expr, iter, n_groups, false);
50 RegMatch Regex::match(const string &str) const
52 RegMatch::GroupArray groups(n_groups);
54 for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
55 if(run(str, i, groups))
56 return RegMatch(str, groups);
61 string Regex::disassemble() const
65 for(Code::const_iterator i=code.begin(); i!=code.end();)
67 Code::const_iterator j=i;
68 Offset offset=i-code.begin();
69 string decompiled=disassemble_instruction(i);
72 bytes+=format(" %02X", static_cast<int>(*j)&0xFF);
73 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
75 ss<<"\n"<<Fmt("%15s");
76 ss<<" "<<decompiled<<'\n';
82 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
84 bool has_branches=false;
87 string::const_iterator end;
88 for(end=iter; end!=expr.end(); ++end)
101 throw InvalidParameterValue("Unexpected )");
117 throw InvalidParameterValue("Unmatched (");
121 unsigned this_group=group;
125 write_int<Index>(this_group, result);
128 const unsigned jump_size=1+sizeof(Offset);
132 for(string::const_iterator i=iter; i!=end;)
134 Code atom=parse_atom(expr, i, group);
138 parse_repeat(i, repeat_min, repeat_max);
140 for(unsigned j=0; j<repeat_min; ++j)
142 if(repeat_max==numeric_limits<Count>::max())
147 write_int<Offset>(atom.size()+jump_size, result);
151 write_int<Offset>(-(atom.size()+jump_size), result);
153 else if(repeat_max>repeat_min)
155 for(unsigned j=repeat_min; j<repeat_max; ++j)
158 write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
167 for(string::const_iterator i=iter;;)
169 branches.push_back(compile(expr, i, group, true));
175 unsigned n_branches=branches.size();
177 Offset offset=(n_branches-1)*jump_size+branches.front().size();
178 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
181 write_int<Offset>(offset, result);
185 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
188 offset-=i->size()+jump_size;
190 if(i!=branches.end())
193 write_int<Offset>(offset, result);
201 write_int<Index>(this_group, result);
209 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
220 throw InvalidParameterValue("Stray backslash");
226 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
227 throw InvalidParameterValue("Invalid atom");
229 return parse_brackets(expr, i);
239 result=compile(expr, ++i, group, false);
247 if(static_cast<unsigned char>(*i)<=LAST_INSTRUCTION_)
257 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
259 if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
262 if(*i=='*' || *i=='+')
263 rmax=numeric_limits<Count>::max();
264 if(*i=='*' || *i=='?')
269 for(++i; isdigit(*i); ++i)
270 rmin=rmin*10+(*i-'0');
278 for(; isdigit(*i); ++i)
279 rmax=rmax*10+(*i-'0');
281 throw InvalidParameterValue("Invalid bound");
284 rmax=numeric_limits<Count>::max();
289 throw InvalidParameterValue("Invalid bound");
297 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
309 string::const_iterator end=iter;
310 for(; (end!=str.end() && (end==iter || *end!=']')); ++end);
312 throw InvalidParameterValue("Unmatched '['");
314 uint8_t mask[32]={0};
317 unsigned char first=0, last=0;
318 for(string::const_iterator i=iter; i!=end; ++i)
324 for(unsigned j=first; j<=c; ++j)
325 mask[j>>3]|=1<<(j&7);
330 else if(c=='-' && i!=iter && end-i>1)
335 mask[c>>3]|=1<<(c&7);
360 result.append(reinterpret_cast<char *>(mask), 32);
369 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
372 list<RunContext> ctx;
373 ctx.push_back(RunContext());
374 ctx.front().citer=code.begin();
375 ctx.front().groups.resize(groups.size());
377 for(string::const_iterator i=begin;;)
381 c=static_cast<unsigned char>(*i);
385 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
387 bool terminate=false;
388 bool negate_match=false;
389 for(; j->citer!=code.end();)
391 Instruction instr=static_cast<Instruction>(*j->citer);
392 if(instr>LAST_INSTRUCTION_)
401 Offset offset=read_int<Offset>(j->citer);
404 else if(instr==ND_JUMP)
406 Offset offset=read_int<Offset>(j->citer);
408 ctx.back().citer+=offset;
410 else if(instr==GROUP_BEGIN)
412 Index n=read_int<Index>(j->citer);
413 if(!j->groups[n].match)
414 j->groups[n].begin=i-str.begin();
416 else if(instr==GROUP_END)
418 Index n=read_int<Index>(j->citer);
419 if(!j->groups[n].match)
421 j->groups[n].match=true;
422 j->groups[n].end=i-str.begin();
423 j->groups[n].length=j->groups[n].end-j->groups[n].begin;
430 for(unsigned k=0; (k<groups.size() && !better); ++k)
432 better=group_compare(j->groups[k], groups[k]);
433 if(group_compare(groups[k], j->groups[k]))
442 bool match_result=false;
443 bool input_consumed=false;
444 if(instr==MATCH_BEGIN)
445 match_result=(i==str.begin());
446 else if(instr==MATCH_END)
447 match_result=(i==str.end());
448 else if(instr==MATCH_CHAR)
450 match_result=(c==*j->citer++);
453 else if(instr==MATCH_RANGE)
455 unsigned char first=*j->citer++;
456 unsigned char last=*j->citer++;
457 match_result=(c>=first && c<=last);
460 else if(instr==MATCH_MASK)
463 for(unsigned k=0; k<32; ++k)
465 match_result=mask[c>>3]&(1<<(c&7));
468 else if(instr==MATCH_ANY)
474 throw Exception("Invalid instruction");
476 if(match_result==negate_match)
480 if(input_consumed || terminate)
485 if(terminate || j->citer==code.end())
491 if(i==str.end() || ctx.empty())
499 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
504 // Any match is better than no match
508 // Earlier match is better
509 if(g1.begin<g2.begin)
511 if(g2.begin>g2.begin)
514 // Longer match at same position is better
515 return g1.end>g2.end;
518 string Regex::disassemble_instruction(Code::const_iterator &i) const
520 Instruction instr=static_cast<Instruction>(*i);
521 if(instr>=LAST_INSTRUCTION_)
526 ostringstream result;
531 Offset offset=read_int<Offset>(i);
532 result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
537 Offset offset=read_int<Offset>(i);
538 result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
542 result<<"GROUP_BEGIN "<<read_int<Index>(i);
545 result<<"GROUP_END "<<read_int<Index>(i);
551 result<<"MATCH_BEGIN";
559 result<<"MATCH_CHAR ";
560 if(c>=0x20 && c<=0x7E)
561 result<<'\''<<c<<'\'';
563 result<<(static_cast<int>(c)&0xFF);
567 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
568 result<<'-'<<(static_cast<int>(*i++)&0xFF);
571 result<<"MATCH_MASK";
572 for(unsigned j=0; j<32; ++j)
573 result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
578 case FIRST_INSTRUCTION_:
579 case LAST_INSTRUCTION_:
580 result<<"UNKNOWN "<<instr;