Add glob and regex thingies

author Mikko Rasa <tdb@tdb.fi>

Fri, 25 May 2007 17:05:52 +0000 (17:05 +0000)

committer Mikko Rasa <tdb@tdb.fi>

Fri, 25 May 2007 17:05:52 +0000 (17:05 +0000)
author Mikko Rasa <tdb@tdb.fi>
Fri, 25 May 2007 17:05:52 +0000 (17:05 +0000)
committer Mikko Rasa <tdb@tdb.fi>
Fri, 25 May 2007 17:05:52 +0000 (17:05 +0000)
diff --git a/source/codec.h b/source/codec.h

index 1f012710f636f0acc208aff0bc87ab3edad8ab5e..b194c3f46b56b4be503ff63e2741f8243f43c3fa 100644 (file)
--- a/source/codec.h
+++ b/source/codec.h
@@ -9,7 +9,7 @@ Distributed under the LGPL
  #define MSP_STRINGS_CODEC_H_
  
  #include <string>
  #define MSP_STRINGS_CODEC_H_
  
  #include <string>
-#include <msp/error.h>
+#include <msp/core/error.h>
  
  namespace Msp {
  
  
  namespace Msp {
  
diff --git a/source/fmt.cpp b/source/fmt.cpp

index 05f9be5a2decaaa558a5ba30c61b806d41eb5639..de0d810b8e91d307d2ba844695ebf405e2d1c905 100644 (file)
--- a/source/fmt.cpp
+++ b/source/fmt.cpp
@@ -4,8 +4,7 @@ This file is part of libmspstrings
  Copyright © 2006-2007 Mikko Rasa
  Distributed under the LGPL
  */
  Copyright © 2006-2007 Mikko Rasa
  Distributed under the LGPL
  */
-
-#include <msp/error.h>
+#include <msp/core/error.h>
  #include "fmt.h"
  
  using namespace std;
  #include "fmt.h"
  
  using namespace std;
diff --git a/source/glob.cpp b/source/glob.cpp

new file mode 100644 (file)

index 0000000..0d77c8d
--- /dev/null
+++ b/source/glob.cpp
@@ -0,0 +1,61 @@
+#include "glob.h"
+
+using namespace std;
+
+namespace {
+
+template<bool icase>
+bool cmp(char c, char h);
+
+template<>
+bool cmp<true>(char c, char h)
+{ return tolower(c)==tolower(h); }
+
+template<>
+bool cmp<false>(char c, char h)
+{ return c==h; }
+
+template<bool icase>
+bool globmatch(string::const_iterator pat_i, const string::const_iterator &pat_e, string::const_iterator str_i, const string::const_iterator &str_e)
+{
+       while(pat_i!=pat_e && str_i!=str_e)
+       {
+               if(*pat_i=='?' || cmp<icase>(*str_i, *pat_i))
+               {
+                       ++pat_i;
+                       ++str_i;
+               }
+               else if(*pat_i=='*')
+               {
+                       ++pat_i;
+                       if(pat_i==pat_e)
+                               return true;
+                       
+                       for(; str_i!=str_e; ++str_i)
+                               if(cmp<icase>(*str_i, *pat_i) && globmatch<icase>(pat_i, pat_e, str_i, str_e))
+                                       return true;
+                       
+                       return false;
+               }
+               else
+                       return false;
+       }
+
+       return pat_i==pat_e && str_i==str_e;
+}
+
+}
+
+namespace Msp {
+
+bool globmatch(const string &pat, const string &str)
+{
+       return ::globmatch<false>(pat.begin(), pat.end(), str.begin(), str.end());
+}
+
+bool globcasematch(const string &pat, const string &str)
+{
+       return ::globmatch<true>(pat.begin(), pat.end(), str.begin(), str.end());
+}
+
+} // namespace Msp
diff --git a/source/glob.h b/source/glob.h

new file mode 100644 (file)

index 0000000..a58d0b1
--- /dev/null
+++ b/source/glob.h
@@ -0,0 +1,13 @@
+#ifndef MSP_STRINGS_GLOB_H_
+#define MSP_STRINGS_GLOB_H_
+
+#include <string>
+
+namespace Msp {
+
+bool globmatch(const std::string &, const std::string &);
+bool globcasematch(const std::string &, const std::string &);
+
+} // namespace Msp
+
+#endif
diff --git a/source/lexicalcast.h b/source/lexicalcast.h

index 776eaab351561a1d0d2b8af704e95bfff9df3337..c73856b8228894374f00b97e2f81dcbcd4532ca2 100644 (file)
--- a/source/lexicalcast.h
+++ b/source/lexicalcast.h
@@ -10,7 +10,7 @@ Distributed under the LGPL
  
  #include <sstream>
  #include <string>
  
  #include <sstream>
  #include <string>
-#include <msp/error.h>
+#include <msp/core/error.h>
  #include "fmt.h"
  
  namespace Msp {
  #include "fmt.h"
  
  namespace Msp {
diff --git a/source/regex.cpp b/source/regex.cpp

new file mode 100644 (file)

index 0000000..839435e
--- /dev/null
+++ b/source/regex.cpp
@@ -0,0 +1,582 @@
+#include <stack>
+#include <msp/core/error.h>
+#include "formatter.h"
+#include "regex.h"
+
+using namespace std;
+
+#include <iostream>
+
+namespace {
+
+/**
+Writes an integer to a Regex code string, in little-endian order.
+*/
+template<typename T>
+void write_int(T n, Msp::Regex::Code &code)
+{
+       for(unsigned i=0; i<sizeof(T); ++i)
+               code+=(n>>i*8)&0xFF;
+}
+
+/**
+Reads an integer from a Regex code stream, in little-endian order.
+*/
+template<typename T>
+T read_int(Msp::Regex::Code::const_iterator &c)
+{
+       T result=0;
+       for(unsigned i=0; i<sizeof(T); ++i)
+               result+=static_cast<unsigned char>(*c++)<<i*8;
+       return result;
+}
+
+}
+
+namespace Msp {
+
+Regex::Regex(const string &expr)
+{
+       n_groups=0;
+       string::const_iterator iter=expr.begin();
+       code=compile(expr, iter, n_groups, false);
+       ++n_groups;
+}
+
+RegMatch Regex::match(const string &str) const
+{
+       RegMatch::GroupArray groups(n_groups);
+
+       for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
+               if(run(str, i, groups))
+                       return RegMatch(str, groups);
+
+       return RegMatch();
+}
+
+string Regex::disassemble() const
+{
+       ostringstream ss;
+
+       for(Code::const_iterator i=code.begin(); i!=code.end();)
+       {
+               Code::const_iterator j=i;
+               Offset offset=i-code.begin();
+               string decompiled=disassemble_instruction(i);
+               string bytes;
+               for(; j!=i; ++j)
+                       bytes+=format(" %02X", static_cast<int>(*j)&0xFF);
+               ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
+               if(bytes.size()>9)
+                       ss<<"\n"<<Fmt("%15s");
+               ss<<"  "<<decompiled<<'\n';
+       }
+
+       return ss.str();
+}
+
+Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
+{
+       bool has_branches=false;
+       unsigned level=0;
+       bool escape=false;
+       string::const_iterator end;
+       for(end=iter; end!=expr.end(); ++end)
+       {
+               if(escape)
+                       escape=false;
+               else if(*end=='\\')
+                       escape=true;
+               else if(*end=='(')
+                       ++level;
+               else if(*end==')')
+               {
+                       if(level==0)
+                       {
+                               if(group==0)
+                                       throw InvalidParameterValue("Unexpected )");
+                               else
+                                       break;
+                       }
+                       --level;
+               }
+               else if(*end=='|')
+               {
+                       if(branch)
+                               break;
+                       else if(level==0)
+                               has_branches=true;
+               }
+       }
+
+       if(level>0)
+               throw InvalidParameterValue("Unmatched (");
+
+       Code result;
+
+       unsigned this_group=group;
+       if(!branch)
+       {
+               result+=GROUP_BEGIN;
+               write_int<Index>(this_group, result);
+       }
+
+       const unsigned jump_size=1+sizeof(Offset);
+
+       if(!has_branches)
+       {
+               for(string::const_iterator i=iter; i!=end;)
+               {
+                       Code atom=parse_atom(expr, i, group);
+
+                       Count repeat_min=1;
+                       Count repeat_max=1;
+                       parse_repeat(i, repeat_min, repeat_max);
+
+                       for(unsigned j=0; j<repeat_min; ++j)
+                               result+=atom;
+                       if(repeat_max==numeric_limits<Count>::max())
+                       {
+                               if(repeat_min==0)
+                               {
+                                       result+=ND_JUMP;
+                                       write_int<Offset>(atom.size()+jump_size, result);
+                                       result+=atom;
+                               }
+                               result+=ND_JUMP;
+                               write_int<Offset>(-(atom.size()+jump_size), result);
+                       }
+                       else if(repeat_max>repeat_min)
+                       {
+                               for(unsigned j=repeat_min; j<repeat_max; ++j)
+                               {
+                                       result+=ND_JUMP;
+                                       write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
+                                       result+=atom;
+                               }
+                       }
+               }
+       }
+       else
+       {
+               list<Code> branches;
+               for(string::const_iterator i=iter;;)
+               {
+                       branches.push_back(compile(expr, i, group, true));
+                       if(i==end)
+                               break;
+                       ++i;
+               }
+
+               unsigned n_branches=branches.size();
+
+               Offset offset=(n_branches-1)*jump_size+branches.front().size();
+               for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
+               {
+                       result+=ND_JUMP;
+                       write_int<Offset>(offset, result);
+                       offset+=i->size();
+               }
+
+               for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
+               {
+                       result+=*i;
+                       offset-=i->size()+jump_size;
+                       ++i;
+                       if(i!=branches.end())
+                       {
+                               result+=JUMP;
+                               write_int<Offset>(offset, result);
+                       }
+               }
+       }
+
+       if(!branch)
+       {
+               result+=GROUP_END;
+               write_int<Index>(this_group, result);
+       }
+
+       iter=end;
+
+       return result;
+}
+
+Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
+{
+       Code result;
+
+       if(i==expr.end())
+               return result;
+
+       bool flag=false;
+       if(*i=='\\')
+       {
+               if(++i==expr.end())
+                       throw InvalidParameterValue("Stray backslash");
+               flag=true;
+       }
+
+       if(!flag)
+       {
+               if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
+                       throw InvalidParameterValue("Invalid atom");
+               else if(*i=='[')
+                       return parse_brackets(expr, i);
+               else if(*i=='.')
+                       result+=MATCH_ANY;
+               else if(*i=='^')
+                       result+=MATCH_BEGIN;
+               else if(*i=='$')
+                       result+=MATCH_END;
+               else if(*i=='(')
+               {
+                       ++group;
+                       result=compile(expr, ++i, group, false);
+               }
+               else
+                       flag=true;
+       }
+
+       if(flag)
+       {
+               if(static_cast<unsigned char>(*i)<=LAST_INSTRUCTION_)
+                       result+=MATCH_CHAR;
+               result+=*i;
+       }
+
+       ++i;
+
+       return result;
+}
+
+bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
+{
+       if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
+               return false;
+
+       if(*i=='*' || *i=='+')
+               rmax=numeric_limits<Count>::max();
+       if(*i=='*' || *i=='?')
+               rmin=0;
+       if(*i=='{')
+       {
+               rmin=0;
+               for(++i; isdigit(*i); ++i)
+                       rmin=rmin*10+(*i-'0');
+
+               if(*i==',')
+               {
+                       ++i;
+                       if(*i!='}')
+                       {
+                               rmax=0;
+                               for(; isdigit(*i); ++i)
+                                       rmax=rmax*10+(*i-'0');
+                               if(rmax<rmin)
+                                       throw InvalidParameterValue("Invalid bound");
+                       }
+                       else
+                               rmax=numeric_limits<Count>::max();
+               }
+               else
+                       rmax=rmin;
+               if(*i!='}')
+                       throw InvalidParameterValue("Invalid bound");
+       }
+
+       ++i;
+
+       return true;
+}
+
+Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
+{
+       Code result;
+
+       ++iter;
+       bool neg=false;
+       if(*iter=='^')
+       {
+               neg=true;
+               ++iter;
+       }
+
+       string::const_iterator end=iter;
+       for(; (end!=str.end() && *end!=']'); ++end);
+       if(end==str.end())
+               throw InvalidParameterValue("Unmatched '['");
+
+       uint8_t mask[32]={0};
+       unsigned type=0;
+       bool range=false;
+       unsigned char first, last;
+       for(string::const_iterator i=iter; i!=end; ++i)
+       {
+               unsigned char c=*i;
+               if(range)
+               {
+                       last=c;
+                       for(unsigned j=first; j<=c; ++j)
+                               mask[j>>3]|=1<<(j&7);
+                       range=false;
+                       if(type<2)
+                               type=2;
+               }
+               else if(c=='-' && i!=iter && end-i>1)
+                       range=true;
+               else
+               {
+                       first=c;
+                       mask[c>>3]|=1<<(c&7);
+                       if(type==0)
+                               type=1;
+                       else
+                               type=3;
+               }
+       }
+
+       if(neg)
+               result+=NEGATE;
+
+       if(type==1)
+       {
+               result+=MATCH_CHAR;
+               result+=first;
+       }
+       else if(type==2)
+       {
+               result+=MATCH_RANGE;
+               result+=first;
+               result+=last;
+       }
+       else
+       {
+               result+=MATCH_MASK;
+               result.append(reinterpret_cast<char *>(mask), 32);
+       }
+
+       iter=end;
+       ++iter;
+
+       return result;
+}
+
+bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
+{
+       bool result=false;
+       list<RunContext> ctx;
+       ctx.push_back(RunContext());
+       ctx.front().citer=code.begin();
+       ctx.front().groups.resize(groups.size());
+
+       for(string::const_iterator i=begin;;)
+       {
+               int c;
+               if(i!=str.end())
+                       c=static_cast<unsigned char>(*i);
+               else
+                       c=-1;
+
+               for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
+               {
+                       bool terminate=false;
+                       bool negate_match=false;
+                       for(; j->citer!=code.end();)
+                       {
+                               Instruction instr=static_cast<Instruction>(*j->citer);
+                               if(instr>LAST_INSTRUCTION_)
+                                       instr=MATCH_CHAR;
+                               else
+                                       ++j->citer;
+
+                               if(instr==NEGATE)
+                                       negate_match=true;
+                               else if(instr==JUMP)
+                               {
+                                       Offset offset=read_int<Offset>(j->citer);
+                                       j->citer+=offset;
+                               }
+                               else if(instr==ND_JUMP)
+                               {
+                                       Offset offset=read_int<Offset>(j->citer);
+                                       ctx.push_back(*j);
+                                       ctx.back().citer+=offset;
+                               }
+                               else if(instr==GROUP_BEGIN)
+                               {
+                                       Index n=read_int<Index>(j->citer);
+                                       if(!j->groups[n].match)
+                                               j->groups[n].begin=i-str.begin();
+                               }
+                               else if(instr==GROUP_END)
+                               {
+                                       Index n=read_int<Index>(j->citer);
+                                       if(!j->groups[n].match)
+                                       {
+                                               j->groups[n].match=true;
+                                               j->groups[n].end=i-str.begin();
+                                               j->groups[n].length=j->groups[n].end-j->groups[n].begin;
+                                       }
+
+                                       if(n==0)
+                                       {
+                                               result=true;
+                                               bool better=false;
+                                               for(unsigned k=0; (k<groups.size() && !better); ++k)
+                                               {
+                                                       better=group_compare(j->groups[k], groups[k]);
+                                                       if(group_compare(groups[k], j->groups[k]))
+                                                               break;
+                                               }
+                                               if(better)
+                                                       groups=j->groups;
+                                       }
+                               }
+                               else
+                               {
+                                       bool match_result=false;
+                                       bool input_consumed=false;
+                                       if(instr==MATCH_BEGIN)
+                                               match_result=(i==str.begin());
+                                       else if(instr==MATCH_END)
+                                               match_result=(i==str.end());
+                                       else if(instr==MATCH_CHAR)
+                                       {
+                                               match_result=(c==*j->citer++);
+                                               input_consumed=true;
+                                       }
+                                       else if(instr==MATCH_RANGE)
+                                       {
+                                               unsigned char first=*j->citer++;
+                                               unsigned char last=*j->citer++;
+                                               match_result=(c>=first && c<=last);
+                                               input_consumed=true;
+                                       }
+                                       else if(instr==MATCH_MASK)
+                                       {
+                                               uint8_t mask[32];
+                                               for(unsigned k=0; k<32; ++k)
+                                                       mask[k]=*j->citer++;
+                                               match_result=mask[c>>3]&(1<<(c&7));
+                                               input_consumed=true;
+                                       }
+                                       else if(instr==MATCH_ANY)
+                                       {
+                                               match_result=true;
+                                               input_consumed=true;
+                                       }
+                                       else
+                                               throw Exception("Invalid instruction");
+
+                                       if(match_result==negate_match)
+                                               terminate=true;
+                                       negate_match=false;
+
+                                       if(input_consumed || terminate)
+                                               break;
+                               }
+                       }
+
+                       if(terminate || j->citer==code.end())
+                               j=ctx.erase(j);
+                       else
+                               ++j;
+               }
+
+               if(i==str.end() || ctx.empty())
+                       break;
+               ++i;
+       }
+
+       return result;
+}
+
+bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
+{
+       if(!g1.match)
+               return false;
+       
+       // Any match is better than no match
+       if(!g2.match)
+               return true;
+
+       // Earlier match is better
+       if(g1.begin<g2.begin)
+               return true;
+       if(g2.begin>g2.begin)
+               return false;
+
+       // Longer match at same position is better
+       return g1.end>g2.end;
+}
+
+string Regex::disassemble_instruction(Code::const_iterator &i) const
+{
+       Instruction instr=static_cast<Instruction>(*i);
+       if(instr>=LAST_INSTRUCTION_)
+               instr=MATCH_CHAR;
+       else
+               ++i;
+
+       ostringstream result;
+       switch(instr)
+       {
+       case JUMP:
+               {
+                       Offset offset=read_int<Offset>(i);
+                       result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
+               }
+               break;
+       case ND_JUMP:
+               {
+                       Offset offset=read_int<Offset>(i);
+                       result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
+               }
+               break;
+       case GROUP_BEGIN:
+               result<<"GROUP_BEGIN "<<read_int<Index>(i);
+               break;
+       case GROUP_END:
+               result<<"GROUP_END "<<read_int<Index>(i);
+               break;
+       case NEGATE:
+               result<<"NEGATE";
+               break;
+       case MATCH_BEGIN:
+               result<<"MATCH_BEGIN";
+               break;
+       case MATCH_END:
+               result<<"MATCH_END";
+               break;
+       case MATCH_CHAR:
+               {
+                       char c=*i++;
+                       result<<"MATCH_CHAR ";
+                       if(c>=0x20 && c<=0x7E)
+                               result<<'\''<<c<<'\'';
+                       else
+                               result<<(static_cast<int>(c)&0xFF);
+               }
+               break;
+       case MATCH_RANGE:
+               result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
+               result<<'-'<<(static_cast<int>(*i++)&0xFF);
+               break;
+       case MATCH_MASK:
+               result<<"MATCH_MASK";
+               for(unsigned j=0; j<32; ++j)
+                       result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
+               break;
+       case MATCH_ANY:
+               result<<"MATCH_ANY";
+               break;
+       case FIRST_INSTRUCTION_:
+       case LAST_INSTRUCTION_:
+               result<<"UNKNOWN "<<instr;
+       }
+
+       return result.str();
+}
+
+} // namespace Msp
diff --git a/source/regex.h b/source/regex.h

new file mode 100644 (file)

index 0000000..38cb621
--- /dev/null
+++ b/source/regex.h
@@ -0,0 +1,141 @@
+#ifndef MSP_STRINGS_REGEX_H_
+#define MSP_STRINGS_REGEX_H_
+
+#include <string>
+#include "regmatch.h"
+
+namespace Msp {
+
+/**
+This class provides regular expression matching.  It supports a subset of
+POSIX.2 extended regex syntax.  Character classes, equivalence classes and
+collating elements are not supported.  Refer to the regex(7) manpage for more
+details.
+
+A description of the internal workings of the class follows.  You may skip this
+if you are not a developer or otherwise interested.
+
+The core of this class is a virtual machine running a non-deterministic finite
+automaton (NFA).  The state of the automaton is represented by an iterator into
+the code.  Transitions between states are represented by instructions.
+Instructions may take multiple bytes, so a valid code iterator may not be a
+valid NFA state.
+
+The virtual machine may have any number of execution contexts at any one time.
+On every cycle, each context is advanced until an input-consuming instruction
+is encountered, keeping the input positions in sync with each other.  Execution
+continues as long as there's at least one context remaining.
+
+The GROUP_BEGIN and GROUP_END instructions record the begin and end offset of a
+match group, respectively.  The GROUP_END instruction also marks the group as
+successfully matched.  If the target group was already matched, these
+instructions do nothing.
+
+The JUMP instruction causes the execution iterator to be modified by an offset.
+
+The ND_JUMP instruction causes the execution context to be split in two.  One
+continues directly after the instruction and the other continues at an offset.
+
+The NEGATE instruction causes the result of the next match instruction to be
+negated.
+
+Match instructions compare the input against a condition.  If the match
+succeeds, execution continues at the next instruction.  If the match fails,
+execution of that context is terminated.
+
+The MATCH_BEGIN and MATCH_END instructions match the beginning and end of
+the input string, respectively.  They do not consume input.
+
+The MATCH_CHAR instruction consumes the input character and matches it against
+a single character.  Since regexes often match sequences of printable character,
+a match for such a character may be encoded as the character itself.
+
+The MATCH_RANGE instruction consumes the input character and matches it against
+an inclusive character range.
+
+The MATCH_MASK instruction consumes the input character and matches it against
+a bitmask.
+
+The MATCH_ANY instruction consumes the input character and always succeeds.
+*/
+class Regex
+{
+public:
+       /**
+       Constructs a new Regex object.
+       */
+       Regex(const std::string &expr);
+
+       /**
+       Matches the regex against a string.  Refer to RegMatch documentation for
+       more information on the resulting object.
+       */
+       RegMatch match(const std::string &str) const;
+
+       /**
+       Returns a disassembled representation of the NFA bytecode.  For debugging
+       purposes.
+       */
+       std::string disassemble() const;
+private:
+       typedef std::string Code;
+       typedef unsigned short Count;
+       typedef short Offset;
+       typedef unsigned short Index;
+
+       enum Instruction
+       {
+               FIRST_INSTRUCTION_=0,
+
+               JUMP,
+               ND_JUMP,
+
+               GROUP_BEGIN,
+               GROUP_END,
+
+               NEGATE,
+
+               MATCH_BEGIN,
+               MATCH_END,
+               MATCH_CHAR,
+               MATCH_RANGE,
+               MATCH_MASK,
+               MATCH_ANY,
+
+               LAST_INSTRUCTION_=31
+       };
+
+       struct RunContext
+       {
+               Code::const_iterator citer;
+               RegMatch::GroupArray groups;
+       };
+
+       Code code;
+       unsigned n_groups;
+
+       /**
+       Compiles a regular expression into NFA bytecode.  When compiling a whole
+       regex, \a group should be set to 0.  When the function returns, \a group will
+       be the index of the last subexpression and \a iter will point to the first
+       unused character in the expression.
+
+       \param   expr    Expression to be compiled
+       \param   begin   Iterator into the expression
+       \param   group   Group counter, gets incremented for each subregex
+       \param   branch  Whether we are compiling a branch
+
+       \return  Compiled NFA bytecode
+       */
+       Code compile(const std::string &expr, std::string::const_iterator &iter, unsigned &group, bool branch);
+       Code parse_atom(const std::string &, std::string::const_iterator &i, unsigned &);
+       Code parse_brackets(const std::string &, std::string::const_iterator &);
+       bool parse_repeat(std::string::const_iterator &, Count &, Count &);
+       bool run(const std::string &, const std::string::const_iterator &, RegMatch::GroupArray &) const;
+       bool group_compare(const RegMatch::Group &, const RegMatch::Group &) const;
+       std::string disassemble_instruction(Code::const_iterator &) const;
+};
+
+} // namespace Msp
+
+#endif
diff --git a/source/regmatch.cpp b/source/regmatch.cpp

new file mode 100644 (file)

index 0000000..f08f08a
--- /dev/null
+++ b/source/regmatch.cpp
@@ -0,0 +1,26 @@
+#include <msp/core/error.h>
+#include "regmatch.h"
+
+using namespace std;
+
+namespace Msp {
+
+RegMatch::RegMatch(const string &str, const GroupArray &g):
+       groups(g)
+{
+       for(GroupArray::iterator i=groups.begin(); i!=groups.end(); ++i)
+               if(i->match)
+               {
+                       i->length=i->end-i->begin;
+                       i->str=str.substr(i->begin, i->length);
+               }
+}
+
+const RegMatch::Group &RegMatch::group(unsigned i) const
+{
+       if(i>=groups.size())
+               throw InvalidParameterValue("Group index out of range");
+       return groups[i];
+}
+
+} // namespace Msp
diff --git a/source/regmatch.h b/source/regmatch.h

new file mode 100644 (file)

index 0000000..f5a774e
--- /dev/null
+++ b/source/regmatch.h
@@ -0,0 +1,87 @@
+#ifndef MSP_STRINGS_REGMATCH_H_
+#define MSP_STRINGS_REGMATCH_H_
+
+#include <string>
+#include <vector>
+
+namespace Msp {
+
+/**
+This class stores the result of a Regex being matched against a string.  If the
+match was successful, the RegMatch object evaluates to true, allowing it to be
+used in constructs like \code if(RegMatch match=regex.match("foo")) \endcode.
+
+A RegMatch representing a successful match has one or more groups, indicating
+matching parts of the string.  The first group (with index 0) indicates the
+part matched by the whol regex.  Further groups, if present, indicate parts
+matched by subregexes.  These are ordered from left to right, by the opening
+parenthesis of the subregex.
+*/
+class RegMatch
+{
+public:
+       /**
+       A single subregex of the match.
+       */
+       struct Group
+       {
+               bool match;       //< Whether or not this group matched
+               unsigned begin;   //< First offset of the match
+               unsigned end;     //< One-past-last offset
+               unsigned length;  //< Length of the match (end-begin)
+               std::string str;  //< The part of the string that matched
+
+               Group(): match(false) { }
+       };
+       typedef std::vector<Group> GroupArray;
+
+       /**
+       Constructs a RegMatch representig a non-match.  Used by Regex.
+       */
+       RegMatch() { }
+
+       /**
+       Constructs a new RegMatch from a string and groups.  The length and str members
+       of each group are computed and need not be set.  Used by Regex.
+       */
+       RegMatch(const std::string &, const std::vector<Group> &);
+
+       /**
+       Returns a reference to a single group in the match.  An exception is thrown
+       if the requested group does not exist.
+       */
+       const Group &group(unsigned) const;
+
+       /**
+       Returns true if the RegMatch object represents a non-match.
+       */
+       bool empty() const { return groups.empty(); }
+
+       /**
+       Returns the number of groups in this match.
+       */
+       unsigned size() const { return groups.size(); }
+
+       /**
+       Returns the begin offset of the whole match.
+       */
+       unsigned begin() const { return groups.empty()?0:groups[0].begin; }
+
+       /**
+       Returns the end offset of the whole match.
+       */
+       unsigned end() const { return groups.empty()?0:groups[0].end; }
+
+       /**
+       Shortcut for the group() function.
+       */
+       const Group &operator[](unsigned i) const { return group(i); }
+
+       operator bool() const { return !empty(); }
+private:
+       std::vector<Group> groups;
+};
+
+} // namespace Msp
+
+#endif
author	Mikko Rasa <tdb@tdb.fi>
	Fri, 25 May 2007 17:05:52 +0000 (17:05 +0000)
committer	Mikko Rasa <tdb@tdb.fi>
	Fri, 25 May 2007 17:05:52 +0000 (17:05 +0000)
source/codec.h		patch \| blob \| history
source/fmt.cpp		patch \| blob \| history
source/glob.cpp	[new file with mode: 0644]	patch \| blob
source/glob.h	[new file with mode: 0644]	patch \| blob
source/lexicalcast.h		patch \| blob \| history
source/regex.cpp	[new file with mode: 0644]	patch \| blob
source/regex.h	[new file with mode: 0644]	patch \| blob
source/regmatch.cpp	[new file with mode: 0644]	patch \| blob
source/regmatch.h	[new file with mode: 0644]	patch \| blob