From: Mikko Rasa Date: Fri, 25 May 2007 17:05:52 +0000 (+0000) Subject: Add glob and regex thingies X-Git-Tag: strings-1.0~18 X-Git-Url: http://git.tdb.fi/?a=commitdiff_plain;h=381c89769c0ffaa14e6a92c662323b5c45e1eba3;p=libs%2Fcore.git Add glob and regex thingies error.h is now in mspcore --- diff --git a/source/codec.h b/source/codec.h index 1f01271..b194c3f 100644 --- a/source/codec.h +++ b/source/codec.h @@ -9,7 +9,7 @@ Distributed under the LGPL #define MSP_STRINGS_CODEC_H_ #include -#include +#include namespace Msp { diff --git a/source/fmt.cpp b/source/fmt.cpp index 05f9be5..de0d810 100644 --- a/source/fmt.cpp +++ b/source/fmt.cpp @@ -4,8 +4,7 @@ This file is part of libmspstrings Copyright © 2006-2007 Mikko Rasa Distributed under the LGPL */ - -#include +#include #include "fmt.h" using namespace std; diff --git a/source/glob.cpp b/source/glob.cpp new file mode 100644 index 0000000..0d77c8d --- /dev/null +++ b/source/glob.cpp @@ -0,0 +1,61 @@ +#include "glob.h" + +using namespace std; + +namespace { + +template +bool cmp(char c, char h); + +template<> +bool cmp(char c, char h) +{ return tolower(c)==tolower(h); } + +template<> +bool cmp(char c, char h) +{ return c==h; } + +template +bool globmatch(string::const_iterator pat_i, const string::const_iterator &pat_e, string::const_iterator str_i, const string::const_iterator &str_e) +{ + while(pat_i!=pat_e && str_i!=str_e) + { + if(*pat_i=='?' || cmp(*str_i, *pat_i)) + { + ++pat_i; + ++str_i; + } + else if(*pat_i=='*') + { + ++pat_i; + if(pat_i==pat_e) + return true; + + for(; str_i!=str_e; ++str_i) + if(cmp(*str_i, *pat_i) && globmatch(pat_i, pat_e, str_i, str_e)) + return true; + + return false; + } + else + return false; + } + + return pat_i==pat_e && str_i==str_e; +} + +} + +namespace Msp { + +bool globmatch(const string &pat, const string &str) +{ + return ::globmatch(pat.begin(), pat.end(), str.begin(), str.end()); +} + +bool globcasematch(const string &pat, const string &str) +{ + return ::globmatch(pat.begin(), pat.end(), str.begin(), str.end()); +} + +} // namespace Msp diff --git a/source/glob.h b/source/glob.h new file mode 100644 index 0000000..a58d0b1 --- /dev/null +++ b/source/glob.h @@ -0,0 +1,13 @@ +#ifndef MSP_STRINGS_GLOB_H_ +#define MSP_STRINGS_GLOB_H_ + +#include + +namespace Msp { + +bool globmatch(const std::string &, const std::string &); +bool globcasematch(const std::string &, const std::string &); + +} // namespace Msp + +#endif diff --git a/source/lexicalcast.h b/source/lexicalcast.h index 776eaab..c73856b 100644 --- a/source/lexicalcast.h +++ b/source/lexicalcast.h @@ -10,7 +10,7 @@ Distributed under the LGPL #include #include -#include +#include #include "fmt.h" namespace Msp { diff --git a/source/regex.cpp b/source/regex.cpp new file mode 100644 index 0000000..839435e --- /dev/null +++ b/source/regex.cpp @@ -0,0 +1,582 @@ +#include +#include +#include "formatter.h" +#include "regex.h" + +using namespace std; + +#include + +namespace { + +/** +Writes an integer to a Regex code string, in little-endian order. +*/ +template +void write_int(T n, Msp::Regex::Code &code) +{ + for(unsigned i=0; i>i*8)&0xFF; +} + +/** +Reads an integer from a Regex code stream, in little-endian order. +*/ +template +T read_int(Msp::Regex::Code::const_iterator &c) +{ + T result=0; + for(unsigned i=0; i(*c++)<(*j)&0xFF); + ss<9) + ss<<"\n"<0) + throw InvalidParameterValue("Unmatched ("); + + Code result; + + unsigned this_group=group; + if(!branch) + { + result+=GROUP_BEGIN; + write_int(this_group, result); + } + + const unsigned jump_size=1+sizeof(Offset); + + if(!has_branches) + { + for(string::const_iterator i=iter; i!=end;) + { + Code atom=parse_atom(expr, i, group); + + Count repeat_min=1; + Count repeat_max=1; + parse_repeat(i, repeat_min, repeat_max); + + for(unsigned j=0; j::max()) + { + if(repeat_min==0) + { + result+=ND_JUMP; + write_int(atom.size()+jump_size, result); + result+=atom; + } + result+=ND_JUMP; + write_int(-(atom.size()+jump_size), result); + } + else if(repeat_max>repeat_min) + { + for(unsigned j=repeat_min; j((repeat_max-j)*(atom.size()+jump_size)-jump_size, result); + result+=atom; + } + } + } + } + else + { + list branches; + for(string::const_iterator i=iter;;) + { + branches.push_back(compile(expr, i, group, true)); + if(i==end) + break; + ++i; + } + + unsigned n_branches=branches.size(); + + Offset offset=(n_branches-1)*jump_size+branches.front().size(); + for(list::iterator i=++branches.begin(); i!=branches.end(); ++i) + { + result+=ND_JUMP; + write_int(offset, result); + offset+=i->size(); + } + + for(list::iterator i=branches.begin(); i!=branches.end();) + { + result+=*i; + offset-=i->size()+jump_size; + ++i; + if(i!=branches.end()) + { + result+=JUMP; + write_int(offset, result); + } + } + } + + if(!branch) + { + result+=GROUP_END; + write_int(this_group, result); + } + + iter=end; + + return result; +} + +Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group) +{ + Code result; + + if(i==expr.end()) + return result; + + bool flag=false; + if(*i=='\\') + { + if(++i==expr.end()) + throw InvalidParameterValue("Stray backslash"); + flag=true; + } + + if(!flag) + { + if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')') + throw InvalidParameterValue("Invalid atom"); + else if(*i=='[') + return parse_brackets(expr, i); + else if(*i=='.') + result+=MATCH_ANY; + else if(*i=='^') + result+=MATCH_BEGIN; + else if(*i=='$') + result+=MATCH_END; + else if(*i=='(') + { + ++group; + result=compile(expr, ++i, group, false); + } + else + flag=true; + } + + if(flag) + { + if(static_cast(*i)<=LAST_INSTRUCTION_) + result+=MATCH_CHAR; + result+=*i; + } + + ++i; + + return result; +} + +bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax) +{ + if(*i!='*' && *i!='+' && *i!='?' && *i!='{') + return false; + + if(*i=='*' || *i=='+') + rmax=numeric_limits::max(); + if(*i=='*' || *i=='?') + rmin=0; + if(*i=='{') + { + rmin=0; + for(++i; isdigit(*i); ++i) + rmin=rmin*10+(*i-'0'); + + if(*i==',') + { + ++i; + if(*i!='}') + { + rmax=0; + for(; isdigit(*i); ++i) + rmax=rmax*10+(*i-'0'); + if(rmax::max(); + } + else + rmax=rmin; + if(*i!='}') + throw InvalidParameterValue("Invalid bound"); + } + + ++i; + + return true; +} + +Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter) +{ + Code result; + + ++iter; + bool neg=false; + if(*iter=='^') + { + neg=true; + ++iter; + } + + string::const_iterator end=iter; + for(; (end!=str.end() && *end!=']'); ++end); + if(end==str.end()) + throw InvalidParameterValue("Unmatched '['"); + + uint8_t mask[32]={0}; + unsigned type=0; + bool range=false; + unsigned char first, last; + for(string::const_iterator i=iter; i!=end; ++i) + { + unsigned char c=*i; + if(range) + { + last=c; + for(unsigned j=first; j<=c; ++j) + mask[j>>3]|=1<<(j&7); + range=false; + if(type<2) + type=2; + } + else if(c=='-' && i!=iter && end-i>1) + range=true; + else + { + first=c; + mask[c>>3]|=1<<(c&7); + if(type==0) + type=1; + else + type=3; + } + } + + if(neg) + result+=NEGATE; + + if(type==1) + { + result+=MATCH_CHAR; + result+=first; + } + else if(type==2) + { + result+=MATCH_RANGE; + result+=first; + result+=last; + } + else + { + result+=MATCH_MASK; + result.append(reinterpret_cast(mask), 32); + } + + iter=end; + ++iter; + + return result; +} + +bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const +{ + bool result=false; + list ctx; + ctx.push_back(RunContext()); + ctx.front().citer=code.begin(); + ctx.front().groups.resize(groups.size()); + + for(string::const_iterator i=begin;;) + { + int c; + if(i!=str.end()) + c=static_cast(*i); + else + c=-1; + + for(list::iterator j=ctx.begin(); j!=ctx.end();) + { + bool terminate=false; + bool negate_match=false; + for(; j->citer!=code.end();) + { + Instruction instr=static_cast(*j->citer); + if(instr>LAST_INSTRUCTION_) + instr=MATCH_CHAR; + else + ++j->citer; + + if(instr==NEGATE) + negate_match=true; + else if(instr==JUMP) + { + Offset offset=read_int(j->citer); + j->citer+=offset; + } + else if(instr==ND_JUMP) + { + Offset offset=read_int(j->citer); + ctx.push_back(*j); + ctx.back().citer+=offset; + } + else if(instr==GROUP_BEGIN) + { + Index n=read_int(j->citer); + if(!j->groups[n].match) + j->groups[n].begin=i-str.begin(); + } + else if(instr==GROUP_END) + { + Index n=read_int(j->citer); + if(!j->groups[n].match) + { + j->groups[n].match=true; + j->groups[n].end=i-str.begin(); + j->groups[n].length=j->groups[n].end-j->groups[n].begin; + } + + if(n==0) + { + result=true; + bool better=false; + for(unsigned k=0; (kgroups[k], groups[k]); + if(group_compare(groups[k], j->groups[k])) + break; + } + if(better) + groups=j->groups; + } + } + else + { + bool match_result=false; + bool input_consumed=false; + if(instr==MATCH_BEGIN) + match_result=(i==str.begin()); + else if(instr==MATCH_END) + match_result=(i==str.end()); + else if(instr==MATCH_CHAR) + { + match_result=(c==*j->citer++); + input_consumed=true; + } + else if(instr==MATCH_RANGE) + { + unsigned char first=*j->citer++; + unsigned char last=*j->citer++; + match_result=(c>=first && c<=last); + input_consumed=true; + } + else if(instr==MATCH_MASK) + { + uint8_t mask[32]; + for(unsigned k=0; k<32; ++k) + mask[k]=*j->citer++; + match_result=mask[c>>3]&(1<<(c&7)); + input_consumed=true; + } + else if(instr==MATCH_ANY) + { + match_result=true; + input_consumed=true; + } + else + throw Exception("Invalid instruction"); + + if(match_result==negate_match) + terminate=true; + negate_match=false; + + if(input_consumed || terminate) + break; + } + } + + if(terminate || j->citer==code.end()) + j=ctx.erase(j); + else + ++j; + } + + if(i==str.end() || ctx.empty()) + break; + ++i; + } + + return result; +} + +bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const +{ + if(!g1.match) + return false; + + // Any match is better than no match + if(!g2.match) + return true; + + // Earlier match is better + if(g1.beging2.begin) + return false; + + // Longer match at same position is better + return g1.end>g2.end; +} + +string Regex::disassemble_instruction(Code::const_iterator &i) const +{ + Instruction instr=static_cast(*i); + if(instr>=LAST_INSTRUCTION_) + instr=MATCH_CHAR; + else + ++i; + + ostringstream result; + switch(instr) + { + case JUMP: + { + Offset offset=read_int(i); + result<<"JUMP "<(i); + result<<"ND_JUMP "<(i); + break; + case GROUP_END: + result<<"GROUP_END "<(i); + break; + case NEGATE: + result<<"NEGATE"; + break; + case MATCH_BEGIN: + result<<"MATCH_BEGIN"; + break; + case MATCH_END: + result<<"MATCH_END"; + break; + case MATCH_CHAR: + { + char c=*i++; + result<<"MATCH_CHAR "; + if(c>=0x20 && c<=0x7E) + result<<'\''<(c)&0xFF); + } + break; + case MATCH_RANGE: + result<<"MATCH_RANGE "<<(static_cast(*i++)&0xFF); + result<<'-'<<(static_cast(*i++)&0xFF); + break; + case MATCH_MASK: + result<<"MATCH_MASK"; + for(unsigned j=0; j<32; ++j) + result<<' '<(*i++)&0xFF); + break; + case MATCH_ANY: + result<<"MATCH_ANY"; + break; + case FIRST_INSTRUCTION_: + case LAST_INSTRUCTION_: + result<<"UNKNOWN "< +#include "regmatch.h" + +namespace Msp { + +/** +This class provides regular expression matching. It supports a subset of +POSIX.2 extended regex syntax. Character classes, equivalence classes and +collating elements are not supported. Refer to the regex(7) manpage for more +details. + +A description of the internal workings of the class follows. You may skip this +if you are not a developer or otherwise interested. + +The core of this class is a virtual machine running a non-deterministic finite +automaton (NFA). The state of the automaton is represented by an iterator into +the code. Transitions between states are represented by instructions. +Instructions may take multiple bytes, so a valid code iterator may not be a +valid NFA state. + +The virtual machine may have any number of execution contexts at any one time. +On every cycle, each context is advanced until an input-consuming instruction +is encountered, keeping the input positions in sync with each other. Execution +continues as long as there's at least one context remaining. + +The GROUP_BEGIN and GROUP_END instructions record the begin and end offset of a +match group, respectively. The GROUP_END instruction also marks the group as +successfully matched. If the target group was already matched, these +instructions do nothing. + +The JUMP instruction causes the execution iterator to be modified by an offset. + +The ND_JUMP instruction causes the execution context to be split in two. One +continues directly after the instruction and the other continues at an offset. + +The NEGATE instruction causes the result of the next match instruction to be +negated. + +Match instructions compare the input against a condition. If the match +succeeds, execution continues at the next instruction. If the match fails, +execution of that context is terminated. + +The MATCH_BEGIN and MATCH_END instructions match the beginning and end of +the input string, respectively. They do not consume input. + +The MATCH_CHAR instruction consumes the input character and matches it against +a single character. Since regexes often match sequences of printable character, +a match for such a character may be encoded as the character itself. + +The MATCH_RANGE instruction consumes the input character and matches it against +an inclusive character range. + +The MATCH_MASK instruction consumes the input character and matches it against +a bitmask. + +The MATCH_ANY instruction consumes the input character and always succeeds. +*/ +class Regex +{ +public: + /** + Constructs a new Regex object. + */ + Regex(const std::string &expr); + + /** + Matches the regex against a string. Refer to RegMatch documentation for + more information on the resulting object. + */ + RegMatch match(const std::string &str) const; + + /** + Returns a disassembled representation of the NFA bytecode. For debugging + purposes. + */ + std::string disassemble() const; +private: + typedef std::string Code; + typedef unsigned short Count; + typedef short Offset; + typedef unsigned short Index; + + enum Instruction + { + FIRST_INSTRUCTION_=0, + + JUMP, + ND_JUMP, + + GROUP_BEGIN, + GROUP_END, + + NEGATE, + + MATCH_BEGIN, + MATCH_END, + MATCH_CHAR, + MATCH_RANGE, + MATCH_MASK, + MATCH_ANY, + + LAST_INSTRUCTION_=31 + }; + + struct RunContext + { + Code::const_iterator citer; + RegMatch::GroupArray groups; + }; + + Code code; + unsigned n_groups; + + /** + Compiles a regular expression into NFA bytecode. When compiling a whole + regex, \a group should be set to 0. When the function returns, \a group will + be the index of the last subexpression and \a iter will point to the first + unused character in the expression. + + \param expr Expression to be compiled + \param begin Iterator into the expression + \param group Group counter, gets incremented for each subregex + \param branch Whether we are compiling a branch + + \return Compiled NFA bytecode + */ + Code compile(const std::string &expr, std::string::const_iterator &iter, unsigned &group, bool branch); + Code parse_atom(const std::string &, std::string::const_iterator &i, unsigned &); + Code parse_brackets(const std::string &, std::string::const_iterator &); + bool parse_repeat(std::string::const_iterator &, Count &, Count &); + bool run(const std::string &, const std::string::const_iterator &, RegMatch::GroupArray &) const; + bool group_compare(const RegMatch::Group &, const RegMatch::Group &) const; + std::string disassemble_instruction(Code::const_iterator &) const; +}; + +} // namespace Msp + +#endif diff --git a/source/regmatch.cpp b/source/regmatch.cpp new file mode 100644 index 0000000..f08f08a --- /dev/null +++ b/source/regmatch.cpp @@ -0,0 +1,26 @@ +#include +#include "regmatch.h" + +using namespace std; + +namespace Msp { + +RegMatch::RegMatch(const string &str, const GroupArray &g): + groups(g) +{ + for(GroupArray::iterator i=groups.begin(); i!=groups.end(); ++i) + if(i->match) + { + i->length=i->end-i->begin; + i->str=str.substr(i->begin, i->length); + } +} + +const RegMatch::Group &RegMatch::group(unsigned i) const +{ + if(i>=groups.size()) + throw InvalidParameterValue("Group index out of range"); + return groups[i]; +} + +} // namespace Msp diff --git a/source/regmatch.h b/source/regmatch.h new file mode 100644 index 0000000..f5a774e --- /dev/null +++ b/source/regmatch.h @@ -0,0 +1,87 @@ +#ifndef MSP_STRINGS_REGMATCH_H_ +#define MSP_STRINGS_REGMATCH_H_ + +#include +#include + +namespace Msp { + +/** +This class stores the result of a Regex being matched against a string. If the +match was successful, the RegMatch object evaluates to true, allowing it to be +used in constructs like \code if(RegMatch match=regex.match("foo")) \endcode. + +A RegMatch representing a successful match has one or more groups, indicating +matching parts of the string. The first group (with index 0) indicates the +part matched by the whol regex. Further groups, if present, indicate parts +matched by subregexes. These are ordered from left to right, by the opening +parenthesis of the subregex. +*/ +class RegMatch +{ +public: + /** + A single subregex of the match. + */ + struct Group + { + bool match; //< Whether or not this group matched + unsigned begin; //< First offset of the match + unsigned end; //< One-past-last offset + unsigned length; //< Length of the match (end-begin) + std::string str; //< The part of the string that matched + + Group(): match(false) { } + }; + typedef std::vector GroupArray; + + /** + Constructs a RegMatch representig a non-match. Used by Regex. + */ + RegMatch() { } + + /** + Constructs a new RegMatch from a string and groups. The length and str members + of each group are computed and need not be set. Used by Regex. + */ + RegMatch(const std::string &, const std::vector &); + + /** + Returns a reference to a single group in the match. An exception is thrown + if the requested group does not exist. + */ + const Group &group(unsigned) const; + + /** + Returns true if the RegMatch object represents a non-match. + */ + bool empty() const { return groups.empty(); } + + /** + Returns the number of groups in this match. + */ + unsigned size() const { return groups.size(); } + + /** + Returns the begin offset of the whole match. + */ + unsigned begin() const { return groups.empty()?0:groups[0].begin; } + + /** + Returns the end offset of the whole match. + */ + unsigned end() const { return groups.empty()?0:groups[0].end; } + + /** + Shortcut for the group() function. + */ + const Group &operator[](unsigned i) const { return group(i); } + + operator bool() const { return !empty(); } +private: + std::vector groups; +}; + +} // namespace Msp + +#endif