X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fregex.cpp;h=f45c820c80ddadf87b6e621645f02cd965ac6cdc;hp=7e9c8b0c713c11965f9d5f6b7c8afad4cc696926;hb=5b1368cb791cab043f0435628cacbaff36e39b7b;hpb=36f9e78ae75f5e14b132f37d249340ad3480b8ce diff --git a/source/regex.cpp b/source/regex.cpp index 7e9c8b0..f45c820 100644 --- a/source/regex.cpp +++ b/source/regex.cpp @@ -15,94 +15,59 @@ using namespace std; namespace { -/** -Writes an integer to a Regex code string, in little-endian order. -*/ +/** Writes an integer to a Regex code string, in little-endian order. */ template void write_int(T n, Msp::Regex::Code &code) { for(unsigned i=0; i>i*8)&0xFF; + code += (n>>i*8)&0xFF; } -/** -Reads an integer from a Regex code stream, in little-endian order. -*/ +/** Reads an integer from a Regex code stream, in little-endian order. */ template T read_int(Msp::Regex::Code::const_iterator &c) { - T result=0; + T result = 0; for(unsigned i=0; i(*c++)<(*c++)<(*j)&0xFF); - ss<9) - ss<<"\n"<0) @@ -132,45 +97,45 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns Code result; - unsigned this_group=group; + unsigned this_group = group; if(!branch) { - result+=GROUP_BEGIN; + result += GROUP_BEGIN; write_int(this_group, result); } - const unsigned jump_size=1+sizeof(Offset); + const unsigned jump_size = 1+sizeof(Offset); if(!has_branches) { for(string::const_iterator i=iter; i!=end;) { - Code atom=parse_atom(expr, i, group); + Code atom = parse_atom(expr, i, group); - Count repeat_min=1; - Count repeat_max=1; + Count repeat_min = 1; + Count repeat_max = 1; parse_repeat(i, repeat_min, repeat_max); for(unsigned j=0; j::max()) { if(repeat_min==0) { - result+=ND_JUMP; + result += ND_JUMP; write_int(atom.size()+jump_size, result); - result+=atom; + result += atom; } - result+=ND_JUMP; + result += ND_JUMP; write_int(-(atom.size()+jump_size), result); } else if(repeat_max>repeat_min) { for(unsigned j=repeat_min; j((repeat_max-j)*(atom.size()+jump_size)-jump_size, result); - result+=atom; + result += atom; } } } @@ -186,24 +151,24 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns ++i; } - unsigned n_branches=branches.size(); + unsigned n_branches = branches.size(); - Offset offset=(n_branches-1)*jump_size+branches.front().size(); + Offset offset = (n_branches-1)*jump_size+branches.front().size(); for(list::iterator i=++branches.begin(); i!=branches.end(); ++i) { - result+=ND_JUMP; + result += ND_JUMP; write_int(offset, result); - offset+=i->size(); + offset += i->size(); } for(list::iterator i=branches.begin(); i!=branches.end();) { - result+=*i; - offset-=i->size()+jump_size; + result += *i; + offset -= i->size()+jump_size; ++i; if(i!=branches.end()) { - result+=JUMP; + result += JUMP; write_int(offset, result); } } @@ -211,11 +176,11 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns if(!branch) { - result+=GROUP_END; + result += GROUP_END; write_int(this_group, result); } - iter=end; + iter = end; return result; } @@ -227,12 +192,12 @@ Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, uns if(i==expr.end()) return result; - bool flag=false; + bool flag = false; if(*i=='\\') { if(++i==expr.end()) throw InvalidParameterValue("Stray backslash"); - flag=true; + flag = true; } if(!flag) @@ -242,24 +207,24 @@ Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, uns else if(*i=='[') return parse_brackets(expr, i); else if(*i=='.') - result+=MATCH_ANY; + result += MATCH_ANY; else if(*i=='^') - result+=MATCH_BEGIN; + result += MATCH_BEGIN; else if(*i=='$') - result+=MATCH_END; + result += MATCH_END; else if(*i=='(') { ++group; - result=compile(expr, ++i, group, false); + result = compile(expr, ++i, group, false); } else - flag=true; + flag = true; } if(flag) { - result+=MATCH_CHAR; - result+=*i; + result += MATCH_CHAR; + result += *i; } ++i; @@ -273,31 +238,31 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) return false; if(*i=='*' || *i=='+') - rmax=numeric_limits::max(); + rmax = numeric_limits::max(); if(*i=='*' || *i=='?') - rmin=0; + rmin = 0; if(*i=='{') { - rmin=0; + rmin = 0; for(++i; isdigit(*i); ++i) - rmin=rmin*10+(*i-'0'); + rmin = rmin*10+(*i-'0'); if(*i==',') { ++i; if(*i!='}') { - rmax=0; + rmax = 0; for(; isdigit(*i); ++i) - rmax=rmax*10+(*i-'0'); + rmax = rmax*10+(*i-'0'); if(rmax::max(); + rmax = numeric_limits::max(); } else - rmax=rmin; + rmax = rmin; if(*i!='}') throw InvalidParameterValue("Invalid bound"); } @@ -312,181 +277,192 @@ Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &ite Code result; ++iter; - bool neg=false; + bool neg = false; if(*iter=='^') { - neg=true; + neg = true; ++iter; } - string::const_iterator end=iter; + string::const_iterator end = iter; for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ; if(end==str.end()) throw InvalidParameterValue("Unmatched '['"); - unsigned char mask[32]={0}; - unsigned type=0; - bool range=false; - unsigned char first=0, last=0; + unsigned char mask[32] = {0}; + unsigned type = 0; + bool range = false; + unsigned char first=0, last = 0; for(string::const_iterator i=iter; i!=end; ++i) { - unsigned char c=*i; + unsigned char c = *i; if(range) { - last=c; + last = c; for(unsigned j=first; j<=c; ++j) - mask[j>>3]|=1<<(j&7); - range=false; + mask[j>>3] |= 1<<(j&7); + range = false; if(type<2) - type=2; + type = 2; } else if(c=='-' && i!=iter && end-i>1) - range=true; + range = true; else { - first=c; - mask[c>>3]|=1<<(c&7); + first = c; + mask[c>>3] |= 1<<(c&7); if(type==0) - type=1; + type = 1; else - type=3; + type = 3; } } if(neg) - result+=NEGATE; + result += NEGATE; if(type==1) { - result+=MATCH_CHAR; - result+=first; + result += MATCH_CHAR; + result += first; } else if(type==2) { - result+=MATCH_RANGE; - result+=first; - result+=last; + result += MATCH_RANGE; + result += first; + result += last; } else { - result+=MATCH_MASK; + result += MATCH_MASK; result.append(reinterpret_cast(mask), 32); } - iter=end; + iter = end; ++iter; return result; } +RegMatch Regex::match(const string &str) const +{ + RegMatch::GroupArray groups(n_groups); + + for(string::const_iterator i=str.begin(); i!=str.end(); ++i) + if(run(str, i, groups)) + return RegMatch(str, groups); + + return RegMatch(); +} + bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const { - bool result=false; + bool result = false; list ctx; ctx.push_back(RunContext()); - ctx.front().citer=code.begin(); + ctx.front().citer = code.begin(); ctx.front().groups.resize(groups.size()); for(string::const_iterator i=begin;;) { int c; if(i!=str.end()) - c=static_cast(*i); + c = static_cast(*i); else - c=-1; + c = -1; for(list::iterator j=ctx.begin(); j!=ctx.end();) { - bool terminate=false; - bool negate_match=false; + bool terminate = false; + bool negate_match = false; for(; j->citer!=code.end();) { - Instruction instr=static_cast(*j->citer++); + Instruction instr = static_cast(*j->citer++); if(instr==NEGATE) - negate_match=true; + negate_match = true; else if(instr==JUMP) { - Offset offset=read_int(j->citer); - j->citer+=offset; + Offset offset = read_int(j->citer); + j->citer += offset; } else if(instr==ND_JUMP) { - Offset offset=read_int(j->citer); + Offset offset = read_int(j->citer); ctx.push_back(*j); - ctx.back().citer+=offset; + ctx.back().citer += offset; } else if(instr==GROUP_BEGIN) { - Index n=read_int(j->citer); + Index n = read_int(j->citer); if(!j->groups[n].match) - j->groups[n].begin=i-str.begin(); + j->groups[n].begin = i-str.begin(); } else if(instr==GROUP_END) { - Index n=read_int(j->citer); + Index n = read_int(j->citer); if(!j->groups[n].match) { - j->groups[n].match=true; - j->groups[n].end=i-str.begin(); - j->groups[n].length=j->groups[n].end-j->groups[n].begin; + j->groups[n].match = true; + j->groups[n].end = i-str.begin(); + j->groups[n].length = j->groups[n].end-j->groups[n].begin; } if(n==0) { - result=true; - bool better=false; + result = true; + bool better = false; for(unsigned k=0; (kgroups[k], groups[k]); + better = group_compare(j->groups[k], groups[k]); if(group_compare(groups[k], j->groups[k])) break; } if(better) - groups=j->groups; + groups = j->groups; } } else { - bool match_result=false; - bool input_consumed=false; + bool match_result = false; + bool input_consumed = false; if(instr==MATCH_BEGIN) - match_result=(i==str.begin()); + match_result = (i==str.begin()); else if(instr==MATCH_END) - match_result=(i==str.end()); + match_result = (i==str.end()); else if(instr==MATCH_CHAR) { - match_result=(c==*j->citer++); - input_consumed=true; + match_result = (c==*j->citer++); + input_consumed = true; } else if(instr==MATCH_RANGE) { - unsigned char first=*j->citer++; - unsigned char last=*j->citer++; - match_result=(c>=first && c<=last); - input_consumed=true; + unsigned char first = *j->citer++; + unsigned char last = *j->citer++; + match_result = (c>=first && c<=last); + input_consumed = true; } else if(instr==MATCH_MASK) { if(c>=0 && c<=0xFF) { - unsigned char m=*(j->citer+(c>>3)); - match_result=m&(1<<(c&7)); + unsigned char m = *(j->citer+(c>>3)); + match_result = m&(1<<(c&7)); } - input_consumed=true; - j->citer+=32; + input_consumed = true; + j->citer += 32; } else if(instr==MATCH_ANY) { - match_result=true; - input_consumed=true; + match_result = true; + input_consumed = true; } else throw Exception("Invalid instruction"); if(match_result==negate_match) - terminate=true; - negate_match=false; + terminate = true; + negate_match = false; if(input_consumed || terminate) break; @@ -494,7 +470,7 @@ bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch } if(terminate || j->citer==code.end()) - j=ctx.erase(j); + j = ctx.erase(j); else ++j; } @@ -526,22 +502,43 @@ bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) return g1.end>g2.end; } +string Regex::disassemble() const +{ + ostringstream ss; + + for(Code::const_iterator i=code.begin(); i!=code.end();) + { + Code::const_iterator j = i; + Offset offset = i-code.begin(); + string decompiled = disassemble_instruction(i); + string bytes; + for(; j!=i; ++j) + bytes += format(" %02X", static_cast(*j)&0xFF); + ss<9) + ss<<"\n"<(*i++); + Instruction instr = static_cast(*i++); ostringstream result; switch(instr) { case JUMP: { - Offset offset=read_int(i); + Offset offset = read_int(i); result<<"JUMP "<(i); + Offset offset = read_int(i); result<<"ND_JUMP "<=0x20 && c<=0x7E) result<<'\''<