X-Git-Url: http://git.tdb.fi/?p=libs%2Fcore.git;a=blobdiff_plain;f=source%2Fstrings%2Fregex.cpp;h=9167c6b8a9e6ae0f75fd52206bb96efe45a9424b;hp=f45c820c80ddadf87b6e621645f02cd965ac6cdc;hb=f24e7b96e76b63c9b9b8a6bce4c7a9db64276ea8;hpb=b42ed73a1b241c0e93ee03c43c4584b41c549bac diff --git a/source/strings/regex.cpp b/source/strings/regex.cpp index f45c820..9167c6b 100644 --- a/source/strings/regex.cpp +++ b/source/strings/regex.cpp @@ -1,14 +1,7 @@ -/* $Id$ - -This file is part of libmspstrings -Copyright © 2007 Mikko Rasa -Distributed under the LGPL -*/ - -#include #include -#include -#include "formatter.h" +#include +#include +#include "format.h" #include "regex.h" using namespace std; @@ -17,19 +10,19 @@ namespace { /** Writes an integer to a Regex code string, in little-endian order. */ template -void write_int(T n, Msp::Regex::Code &code) +void write_int(T n, basic_string &code) { for(unsigned i=0; i>i*8)&0xFF; + code += (n>>(i*8))&0xFF; } -/** Reads an integer from a Regex code stream, in little-endian order. */ +/** Reads an integer from a Regex code string, in little-endian order. */ template -T read_int(Msp::Regex::Code::const_iterator &c) +T read_int(basic_string::const_iterator &c) { T result = 0; for(unsigned i=0; i(*c++)<40) + { + result = e.substr(offset-40, 60); + offset = 40; + } + else + result = e.substr(0, 60); + result += '\n'; + result.append(offset, ' '); + result += '^'; + return result; +} + + Regex::Regex(const string &expr) { n_groups = 0; - string::const_iterator iter = expr.begin(); + auto iter = expr.begin(); code = compile(expr, iter, n_groups, false); ++n_groups; } @@ -49,7 +64,7 @@ Regex::Regex(const string &expr) Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch) { bool has_branches = false; - unsigned level = 0; + stack parens; bool escape = false; unsigned bracket = 0; string::const_iterator end; @@ -69,19 +84,19 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns else if(*end=='\\') escape = true; else if(*end=='(') - ++level; + parens.push(end); else if(*end==')') { - if(level==0) + if(parens.empty()) { if(group==0) - throw InvalidParameterValue("Unexpected )"); + throw bad_regex("unmatched ')'", expr, end); else break; } - --level; + parens.pop(); } - else if(*end=='|' && level==0) + else if(*end=='|' && parens.empty()) { if(branch) break; @@ -92,8 +107,8 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns bracket = 1; } - if(level>0) - throw InvalidParameterValue("Unmatched ("); + if(!parens.empty()) + throw bad_regex("unmatched '('", expr, parens.top()); Code result; @@ -108,13 +123,13 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns if(!has_branches) { - for(string::const_iterator i=iter; i!=end;) + for(auto i=iter; i!=end;) { Code atom = parse_atom(expr, i, group); Count repeat_min = 1; Count repeat_max = 1; - parse_repeat(i, repeat_min, repeat_max); + parse_repeat(expr, i, repeat_min, repeat_max); for(unsigned j=0; j branches; - for(string::const_iterator i=iter;;) + for(auto i=iter;;) { branches.push_back(compile(expr, i, group, true)); if(i==end) @@ -154,14 +169,14 @@ Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, uns unsigned n_branches = branches.size(); Offset offset = (n_branches-1)*jump_size+branches.front().size(); - for(list::iterator i=++branches.begin(); i!=branches.end(); ++i) + for(auto i=++branches.begin(); i!=branches.end(); ++i) { result += ND_JUMP; write_int(offset, result); offset += i->size(); } - for(list::iterator i=branches.begin(); i!=branches.end();) + for(auto i=branches.begin(); i!=branches.end();) { result += *i; offset -= i->size()+jump_size; @@ -196,14 +211,14 @@ Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, uns if(*i=='\\') { if(++i==expr.end()) - throw InvalidParameterValue("Stray backslash"); + throw bad_regex("stray backslash", expr, i-1); flag = true; } if(!flag) { if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')') - throw InvalidParameterValue("Invalid atom"); + throw bad_regex("invalid atom", expr, i); else if(*i=='[') return parse_brackets(expr, i); else if(*i=='.') @@ -232,7 +247,7 @@ Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, uns return result; } -bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) +bool Regex::parse_repeat(const string &expr, string::const_iterator &i, Count &rmin, Count &rmax) { if(*i!='*' && *i!='+' && *i!='?' && *i!='{') return false; @@ -243,6 +258,8 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) rmin = 0; if(*i=='{') { + auto begin = i; + rmin = 0; for(++i; isdigit(*i); ++i) rmin = rmin*10+(*i-'0'); @@ -256,7 +273,7 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) for(; isdigit(*i); ++i) rmax = rmax*10+(*i-'0'); if(rmax::max(); @@ -264,7 +281,7 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) else rmax = rmin; if(*i!='}') - throw InvalidParameterValue("Invalid bound"); + throw bad_regex("invalid bound", expr, begin); } ++i; @@ -274,6 +291,7 @@ bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter) { + auto begin = iter; Code result; ++iter; @@ -284,16 +302,16 @@ Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &ite ++iter; } - string::const_iterator end = iter; + auto end = iter; for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ; if(end==str.end()) - throw InvalidParameterValue("Unmatched '['"); + throw bad_regex("unmatched '['", str, begin); unsigned char mask[32] = {0}; unsigned type = 0; bool range = false; - unsigned char first=0, last = 0; - for(string::const_iterator i=iter; i!=end; ++i) + unsigned char first = 0, last = 0; + for(auto i=iter; i!=end; ++i) { unsigned char c = *i; if(range) @@ -335,7 +353,7 @@ Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &ite else { result += MATCH_MASK; - result.append(reinterpret_cast(mask), 32); + result.append(mask, 32); } iter = end; @@ -346,16 +364,16 @@ Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &ite RegMatch Regex::match(const string &str) const { - RegMatch::GroupArray groups(n_groups); + vector groups(n_groups); - for(string::const_iterator i=str.begin(); i!=str.end(); ++i) + for(auto i=str.begin(); i!=str.end(); ++i) if(run(str, i, groups)) return RegMatch(str, groups); return RegMatch(); } -bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const +bool Regex::run(const string &str, const string::const_iterator &begin, vector &groups) const { bool result = false; list ctx; @@ -363,15 +381,15 @@ bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch ctx.front().citer = code.begin(); ctx.front().groups.resize(groups.size()); - for(string::const_iterator i=begin;;) + for(auto i=begin;;) { int c; if(i!=str.end()) - c = static_cast(*i); + c = *i; else c = -1; - for(list::iterator j=ctx.begin(); j!=ctx.end();) + for(auto j=ctx.begin(); j!=ctx.end();) { bool terminate = false; bool negate_match = false; @@ -458,7 +476,7 @@ bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch input_consumed = true; } else - throw Exception("Invalid instruction"); + throw logic_error("invalid instruction in regex bytecode"); if(match_result==negate_match) terminate = true; @@ -495,7 +513,7 @@ bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) // Earlier match is better if(g1.beging2.begin) + if(g1.begin>g2.begin) return false; // Longer match at same position is better @@ -504,86 +522,79 @@ bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) string Regex::disassemble() const { - ostringstream ss; + string result; - for(Code::const_iterator i=code.begin(); i!=code.end();) + for(auto i=code.begin(); i!=code.end();) { - Code::const_iterator j = i; + auto j = i; Offset offset = i-code.begin(); string decompiled = disassemble_instruction(i); string bytes; for(; j!=i; ++j) - bytes += format(" %02X", static_cast(*j)&0xFF); - ss<9) - ss<<"\n"<(*i++); - ostringstream result; switch(instr) { case JUMP: { Offset offset = read_int(i); - result<<"JUMP "<(i); - result<<"ND_JUMP "<(i); - break; + return format("GROUP_BEGIN %d", read_int(i)); case GROUP_END: - result<<"GROUP_END "<(i); - break; + return format("GROUP_END %d", read_int(i)); case NEGATE: - result<<"NEGATE"; - break; + return "NEGATE"; case MATCH_BEGIN: - result<<"MATCH_BEGIN"; - break; + return "MATCH_BEGIN"; case MATCH_END: - result<<"MATCH_END"; - break; + return "MATCH_END"; case MATCH_CHAR: { - char c = *i++; - result<<"MATCH_CHAR "; + unsigned char c = *i++; if(c>=0x20 && c<=0x7E) - result<<'\''<(c)&0xFF); + return format("MATCH_CHAR %d", c); } break; case MATCH_RANGE: - result<<"MATCH_RANGE "<<(static_cast(*i++)&0xFF); - result<<'-'<<(static_cast(*i++)&0xFF); - break; + { + int begin = *i++; + int end = *i++; + return format("MATCH_RANGE %d-%d", begin, end); + } case MATCH_MASK: - result<<"MATCH_MASK"; - for(unsigned j=0; j<32; ++j) - result<<' '<(*i++)&0xFF); - break; + { + string result = "MATCH_MASK"; + for(unsigned j=0; j<32; ++j) + result += format(" %02X", *i++); + return result; + } case MATCH_ANY: - result<<"MATCH_ANY"; - break; + return "MATCH_ANY"; default: - result<<"UNKNOWN "<