X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fregex.cpp;fp=source%2Fregex.cpp;h=0000000000000000000000000000000000000000;hp=f45c820c80ddadf87b6e621645f02cd965ac6cdc;hb=b42ed73a1b241c0e93ee03c43c4584b41c549bac;hpb=5b1368cb791cab043f0435628cacbaff36e39b7b diff --git a/source/regex.cpp b/source/regex.cpp deleted file mode 100644 index f45c820..0000000 --- a/source/regex.cpp +++ /dev/null @@ -1,589 +0,0 @@ -/* $Id$ - -This file is part of libmspstrings -Copyright © 2007 Mikko Rasa -Distributed under the LGPL -*/ - -#include -#include -#include -#include "formatter.h" -#include "regex.h" - -using namespace std; - -namespace { - -/** Writes an integer to a Regex code string, in little-endian order. */ -template -void write_int(T n, Msp::Regex::Code &code) -{ - for(unsigned i=0; i>i*8)&0xFF; -} - -/** Reads an integer from a Regex code stream, in little-endian order. */ -template -T read_int(Msp::Regex::Code::const_iterator &c) -{ - T result = 0; - for(unsigned i=0; i(*c++)<0) - throw InvalidParameterValue("Unmatched ("); - - Code result; - - unsigned this_group = group; - if(!branch) - { - result += GROUP_BEGIN; - write_int(this_group, result); - } - - const unsigned jump_size = 1+sizeof(Offset); - - if(!has_branches) - { - for(string::const_iterator i=iter; i!=end;) - { - Code atom = parse_atom(expr, i, group); - - Count repeat_min = 1; - Count repeat_max = 1; - parse_repeat(i, repeat_min, repeat_max); - - for(unsigned j=0; j::max()) - { - if(repeat_min==0) - { - result += ND_JUMP; - write_int(atom.size()+jump_size, result); - result += atom; - } - result += ND_JUMP; - write_int(-(atom.size()+jump_size), result); - } - else if(repeat_max>repeat_min) - { - for(unsigned j=repeat_min; j((repeat_max-j)*(atom.size()+jump_size)-jump_size, result); - result += atom; - } - } - } - } - else - { - list branches; - for(string::const_iterator i=iter;;) - { - branches.push_back(compile(expr, i, group, true)); - if(i==end) - break; - ++i; - } - - unsigned n_branches = branches.size(); - - Offset offset = (n_branches-1)*jump_size+branches.front().size(); - for(list::iterator i=++branches.begin(); i!=branches.end(); ++i) - { - result += ND_JUMP; - write_int(offset, result); - offset += i->size(); - } - - for(list::iterator i=branches.begin(); i!=branches.end();) - { - result += *i; - offset -= i->size()+jump_size; - ++i; - if(i!=branches.end()) - { - result += JUMP; - write_int(offset, result); - } - } - } - - if(!branch) - { - result += GROUP_END; - write_int(this_group, result); - } - - iter = end; - - return result; -} - -Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group) -{ - Code result; - - if(i==expr.end()) - return result; - - bool flag = false; - if(*i=='\\') - { - if(++i==expr.end()) - throw InvalidParameterValue("Stray backslash"); - flag = true; - } - - if(!flag) - { - if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')') - throw InvalidParameterValue("Invalid atom"); - else if(*i=='[') - return parse_brackets(expr, i); - else if(*i=='.') - result += MATCH_ANY; - else if(*i=='^') - result += MATCH_BEGIN; - else if(*i=='$') - result += MATCH_END; - else if(*i=='(') - { - ++group; - result = compile(expr, ++i, group, false); - } - else - flag = true; - } - - if(flag) - { - result += MATCH_CHAR; - result += *i; - } - - ++i; - - return result; -} - -bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) -{ - if(*i!='*' && *i!='+' && *i!='?' && *i!='{') - return false; - - if(*i=='*' || *i=='+') - rmax = numeric_limits::max(); - if(*i=='*' || *i=='?') - rmin = 0; - if(*i=='{') - { - rmin = 0; - for(++i; isdigit(*i); ++i) - rmin = rmin*10+(*i-'0'); - - if(*i==',') - { - ++i; - if(*i!='}') - { - rmax = 0; - for(; isdigit(*i); ++i) - rmax = rmax*10+(*i-'0'); - if(rmax::max(); - } - else - rmax = rmin; - if(*i!='}') - throw InvalidParameterValue("Invalid bound"); - } - - ++i; - - return true; -} - -Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter) -{ - Code result; - - ++iter; - bool neg = false; - if(*iter=='^') - { - neg = true; - ++iter; - } - - string::const_iterator end = iter; - for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ; - if(end==str.end()) - throw InvalidParameterValue("Unmatched '['"); - - unsigned char mask[32] = {0}; - unsigned type = 0; - bool range = false; - unsigned char first=0, last = 0; - for(string::const_iterator i=iter; i!=end; ++i) - { - unsigned char c = *i; - if(range) - { - last = c; - for(unsigned j=first; j<=c; ++j) - mask[j>>3] |= 1<<(j&7); - range = false; - if(type<2) - type = 2; - } - else if(c=='-' && i!=iter && end-i>1) - range = true; - else - { - first = c; - mask[c>>3] |= 1<<(c&7); - if(type==0) - type = 1; - else - type = 3; - } - } - - if(neg) - result += NEGATE; - - if(type==1) - { - result += MATCH_CHAR; - result += first; - } - else if(type==2) - { - result += MATCH_RANGE; - result += first; - result += last; - } - else - { - result += MATCH_MASK; - result.append(reinterpret_cast(mask), 32); - } - - iter = end; - ++iter; - - return result; -} - -RegMatch Regex::match(const string &str) const -{ - RegMatch::GroupArray groups(n_groups); - - for(string::const_iterator i=str.begin(); i!=str.end(); ++i) - if(run(str, i, groups)) - return RegMatch(str, groups); - - return RegMatch(); -} - -bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const -{ - bool result = false; - list ctx; - ctx.push_back(RunContext()); - ctx.front().citer = code.begin(); - ctx.front().groups.resize(groups.size()); - - for(string::const_iterator i=begin;;) - { - int c; - if(i!=str.end()) - c = static_cast(*i); - else - c = -1; - - for(list::iterator j=ctx.begin(); j!=ctx.end();) - { - bool terminate = false; - bool negate_match = false; - for(; j->citer!=code.end();) - { - Instruction instr = static_cast(*j->citer++); - - if(instr==NEGATE) - negate_match = true; - else if(instr==JUMP) - { - Offset offset = read_int(j->citer); - j->citer += offset; - } - else if(instr==ND_JUMP) - { - Offset offset = read_int(j->citer); - ctx.push_back(*j); - ctx.back().citer += offset; - } - else if(instr==GROUP_BEGIN) - { - Index n = read_int(j->citer); - if(!j->groups[n].match) - j->groups[n].begin = i-str.begin(); - } - else if(instr==GROUP_END) - { - Index n = read_int(j->citer); - if(!j->groups[n].match) - { - j->groups[n].match = true; - j->groups[n].end = i-str.begin(); - j->groups[n].length = j->groups[n].end-j->groups[n].begin; - } - - if(n==0) - { - result = true; - bool better = false; - for(unsigned k=0; (kgroups[k], groups[k]); - if(group_compare(groups[k], j->groups[k])) - break; - } - if(better) - groups = j->groups; - } - } - else - { - bool match_result = false; - bool input_consumed = false; - if(instr==MATCH_BEGIN) - match_result = (i==str.begin()); - else if(instr==MATCH_END) - match_result = (i==str.end()); - else if(instr==MATCH_CHAR) - { - match_result = (c==*j->citer++); - input_consumed = true; - } - else if(instr==MATCH_RANGE) - { - unsigned char first = *j->citer++; - unsigned char last = *j->citer++; - match_result = (c>=first && c<=last); - input_consumed = true; - } - else if(instr==MATCH_MASK) - { - if(c>=0 && c<=0xFF) - { - unsigned char m = *(j->citer+(c>>3)); - match_result = m&(1<<(c&7)); - } - input_consumed = true; - j->citer += 32; - } - else if(instr==MATCH_ANY) - { - match_result = true; - input_consumed = true; - } - else - throw Exception("Invalid instruction"); - - if(match_result==negate_match) - terminate = true; - negate_match = false; - - if(input_consumed || terminate) - break; - } - } - - if(terminate || j->citer==code.end()) - j = ctx.erase(j); - else - ++j; - } - - if(i==str.end() || ctx.empty()) - break; - ++i; - } - - return result; -} - -bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const -{ - if(!g1.match) - return false; - - // Any match is better than no match - if(!g2.match) - return true; - - // Earlier match is better - if(g1.beging2.begin) - return false; - - // Longer match at same position is better - return g1.end>g2.end; -} - -string Regex::disassemble() const -{ - ostringstream ss; - - for(Code::const_iterator i=code.begin(); i!=code.end();) - { - Code::const_iterator j = i; - Offset offset = i-code.begin(); - string decompiled = disassemble_instruction(i); - string bytes; - for(; j!=i; ++j) - bytes += format(" %02X", static_cast(*j)&0xFF); - ss<9) - ss<<"\n"<(*i++); - - ostringstream result; - switch(instr) - { - case JUMP: - { - Offset offset = read_int(i); - result<<"JUMP "<(i); - result<<"ND_JUMP "<(i); - break; - case GROUP_END: - result<<"GROUP_END "<(i); - break; - case NEGATE: - result<<"NEGATE"; - break; - case MATCH_BEGIN: - result<<"MATCH_BEGIN"; - break; - case MATCH_END: - result<<"MATCH_END"; - break; - case MATCH_CHAR: - { - char c = *i++; - result<<"MATCH_CHAR "; - if(c>=0x20 && c<=0x7E) - result<<'\''<(c)&0xFF); - } - break; - case MATCH_RANGE: - result<<"MATCH_RANGE "<<(static_cast(*i++)&0xFF); - result<<'-'<<(static_cast(*i++)&0xFF); - break; - case MATCH_MASK: - result<<"MATCH_MASK"; - for(unsigned j=0; j<32; ++j) - result<<' '<(*i++)&0xFF); - break; - case MATCH_ANY: - result<<"MATCH_ANY"; - break; - default: - result<<"UNKNOWN "<