X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fstrings%2Fregex.cpp;fp=source%2Fstrings%2Fregex.cpp;h=f45c820c80ddadf87b6e621645f02cd965ac6cdc;hp=0000000000000000000000000000000000000000;hb=b42ed73a1b241c0e93ee03c43c4584b41c549bac;hpb=5b1368cb791cab043f0435628cacbaff36e39b7b diff --git a/source/strings/regex.cpp b/source/strings/regex.cpp new file mode 100644 index 0000000..f45c820 --- /dev/null +++ b/source/strings/regex.cpp @@ -0,0 +1,589 @@ +/* $Id$ + +This file is part of libmspstrings +Copyright © 2007 Mikko Rasa +Distributed under the LGPL +*/ + +#include +#include +#include +#include "formatter.h" +#include "regex.h" + +using namespace std; + +namespace { + +/** Writes an integer to a Regex code string, in little-endian order. */ +template +void write_int(T n, Msp::Regex::Code &code) +{ + for(unsigned i=0; i>i*8)&0xFF; +} + +/** Reads an integer from a Regex code stream, in little-endian order. */ +template +T read_int(Msp::Regex::Code::const_iterator &c) +{ + T result = 0; + for(unsigned i=0; i(*c++)<0) + throw InvalidParameterValue("Unmatched ("); + + Code result; + + unsigned this_group = group; + if(!branch) + { + result += GROUP_BEGIN; + write_int(this_group, result); + } + + const unsigned jump_size = 1+sizeof(Offset); + + if(!has_branches) + { + for(string::const_iterator i=iter; i!=end;) + { + Code atom = parse_atom(expr, i, group); + + Count repeat_min = 1; + Count repeat_max = 1; + parse_repeat(i, repeat_min, repeat_max); + + for(unsigned j=0; j::max()) + { + if(repeat_min==0) + { + result += ND_JUMP; + write_int(atom.size()+jump_size, result); + result += atom; + } + result += ND_JUMP; + write_int(-(atom.size()+jump_size), result); + } + else if(repeat_max>repeat_min) + { + for(unsigned j=repeat_min; j((repeat_max-j)*(atom.size()+jump_size)-jump_size, result); + result += atom; + } + } + } + } + else + { + list branches; + for(string::const_iterator i=iter;;) + { + branches.push_back(compile(expr, i, group, true)); + if(i==end) + break; + ++i; + } + + unsigned n_branches = branches.size(); + + Offset offset = (n_branches-1)*jump_size+branches.front().size(); + for(list::iterator i=++branches.begin(); i!=branches.end(); ++i) + { + result += ND_JUMP; + write_int(offset, result); + offset += i->size(); + } + + for(list::iterator i=branches.begin(); i!=branches.end();) + { + result += *i; + offset -= i->size()+jump_size; + ++i; + if(i!=branches.end()) + { + result += JUMP; + write_int(offset, result); + } + } + } + + if(!branch) + { + result += GROUP_END; + write_int(this_group, result); + } + + iter = end; + + return result; +} + +Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group) +{ + Code result; + + if(i==expr.end()) + return result; + + bool flag = false; + if(*i=='\\') + { + if(++i==expr.end()) + throw InvalidParameterValue("Stray backslash"); + flag = true; + } + + if(!flag) + { + if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')') + throw InvalidParameterValue("Invalid atom"); + else if(*i=='[') + return parse_brackets(expr, i); + else if(*i=='.') + result += MATCH_ANY; + else if(*i=='^') + result += MATCH_BEGIN; + else if(*i=='$') + result += MATCH_END; + else if(*i=='(') + { + ++group; + result = compile(expr, ++i, group, false); + } + else + flag = true; + } + + if(flag) + { + result += MATCH_CHAR; + result += *i; + } + + ++i; + + return result; +} + +bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) +{ + if(*i!='*' && *i!='+' && *i!='?' && *i!='{') + return false; + + if(*i=='*' || *i=='+') + rmax = numeric_limits::max(); + if(*i=='*' || *i=='?') + rmin = 0; + if(*i=='{') + { + rmin = 0; + for(++i; isdigit(*i); ++i) + rmin = rmin*10+(*i-'0'); + + if(*i==',') + { + ++i; + if(*i!='}') + { + rmax = 0; + for(; isdigit(*i); ++i) + rmax = rmax*10+(*i-'0'); + if(rmax::max(); + } + else + rmax = rmin; + if(*i!='}') + throw InvalidParameterValue("Invalid bound"); + } + + ++i; + + return true; +} + +Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter) +{ + Code result; + + ++iter; + bool neg = false; + if(*iter=='^') + { + neg = true; + ++iter; + } + + string::const_iterator end = iter; + for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ; + if(end==str.end()) + throw InvalidParameterValue("Unmatched '['"); + + unsigned char mask[32] = {0}; + unsigned type = 0; + bool range = false; + unsigned char first=0, last = 0; + for(string::const_iterator i=iter; i!=end; ++i) + { + unsigned char c = *i; + if(range) + { + last = c; + for(unsigned j=first; j<=c; ++j) + mask[j>>3] |= 1<<(j&7); + range = false; + if(type<2) + type = 2; + } + else if(c=='-' && i!=iter && end-i>1) + range = true; + else + { + first = c; + mask[c>>3] |= 1<<(c&7); + if(type==0) + type = 1; + else + type = 3; + } + } + + if(neg) + result += NEGATE; + + if(type==1) + { + result += MATCH_CHAR; + result += first; + } + else if(type==2) + { + result += MATCH_RANGE; + result += first; + result += last; + } + else + { + result += MATCH_MASK; + result.append(reinterpret_cast(mask), 32); + } + + iter = end; + ++iter; + + return result; +} + +RegMatch Regex::match(const string &str) const +{ + RegMatch::GroupArray groups(n_groups); + + for(string::const_iterator i=str.begin(); i!=str.end(); ++i) + if(run(str, i, groups)) + return RegMatch(str, groups); + + return RegMatch(); +} + +bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const +{ + bool result = false; + list ctx; + ctx.push_back(RunContext()); + ctx.front().citer = code.begin(); + ctx.front().groups.resize(groups.size()); + + for(string::const_iterator i=begin;;) + { + int c; + if(i!=str.end()) + c = static_cast(*i); + else + c = -1; + + for(list::iterator j=ctx.begin(); j!=ctx.end();) + { + bool terminate = false; + bool negate_match = false; + for(; j->citer!=code.end();) + { + Instruction instr = static_cast(*j->citer++); + + if(instr==NEGATE) + negate_match = true; + else if(instr==JUMP) + { + Offset offset = read_int(j->citer); + j->citer += offset; + } + else if(instr==ND_JUMP) + { + Offset offset = read_int(j->citer); + ctx.push_back(*j); + ctx.back().citer += offset; + } + else if(instr==GROUP_BEGIN) + { + Index n = read_int(j->citer); + if(!j->groups[n].match) + j->groups[n].begin = i-str.begin(); + } + else if(instr==GROUP_END) + { + Index n = read_int(j->citer); + if(!j->groups[n].match) + { + j->groups[n].match = true; + j->groups[n].end = i-str.begin(); + j->groups[n].length = j->groups[n].end-j->groups[n].begin; + } + + if(n==0) + { + result = true; + bool better = false; + for(unsigned k=0; (kgroups[k], groups[k]); + if(group_compare(groups[k], j->groups[k])) + break; + } + if(better) + groups = j->groups; + } + } + else + { + bool match_result = false; + bool input_consumed = false; + if(instr==MATCH_BEGIN) + match_result = (i==str.begin()); + else if(instr==MATCH_END) + match_result = (i==str.end()); + else if(instr==MATCH_CHAR) + { + match_result = (c==*j->citer++); + input_consumed = true; + } + else if(instr==MATCH_RANGE) + { + unsigned char first = *j->citer++; + unsigned char last = *j->citer++; + match_result = (c>=first && c<=last); + input_consumed = true; + } + else if(instr==MATCH_MASK) + { + if(c>=0 && c<=0xFF) + { + unsigned char m = *(j->citer+(c>>3)); + match_result = m&(1<<(c&7)); + } + input_consumed = true; + j->citer += 32; + } + else if(instr==MATCH_ANY) + { + match_result = true; + input_consumed = true; + } + else + throw Exception("Invalid instruction"); + + if(match_result==negate_match) + terminate = true; + negate_match = false; + + if(input_consumed || terminate) + break; + } + } + + if(terminate || j->citer==code.end()) + j = ctx.erase(j); + else + ++j; + } + + if(i==str.end() || ctx.empty()) + break; + ++i; + } + + return result; +} + +bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const +{ + if(!g1.match) + return false; + + // Any match is better than no match + if(!g2.match) + return true; + + // Earlier match is better + if(g1.beging2.begin) + return false; + + // Longer match at same position is better + return g1.end>g2.end; +} + +string Regex::disassemble() const +{ + ostringstream ss; + + for(Code::const_iterator i=code.begin(); i!=code.end();) + { + Code::const_iterator j = i; + Offset offset = i-code.begin(); + string decompiled = disassemble_instruction(i); + string bytes; + for(; j!=i; ++j) + bytes += format(" %02X", static_cast(*j)&0xFF); + ss<9) + ss<<"\n"<(*i++); + + ostringstream result; + switch(instr) + { + case JUMP: + { + Offset offset = read_int(i); + result<<"JUMP "<(i); + result<<"ND_JUMP "<(i); + break; + case GROUP_END: + result<<"GROUP_END "<(i); + break; + case NEGATE: + result<<"NEGATE"; + break; + case MATCH_BEGIN: + result<<"MATCH_BEGIN"; + break; + case MATCH_END: + result<<"MATCH_END"; + break; + case MATCH_CHAR: + { + char c = *i++; + result<<"MATCH_CHAR "; + if(c>=0x20 && c<=0x7E) + result<<'\''<(c)&0xFF); + } + break; + case MATCH_RANGE: + result<<"MATCH_RANGE "<<(static_cast(*i++)&0xFF); + result<<'-'<<(static_cast(*i++)&0xFF); + break; + case MATCH_MASK: + result<<"MATCH_MASK"; + for(unsigned j=0; j<32; ++j) + result<<' '<(*i++)&0xFF); + break; + case MATCH_ANY: + result<<"MATCH_ANY"; + break; + default: + result<<"UNKNOWN "<