--- /dev/null
+/* $Id$
+
+This file is part of libmspstrings
+Copyright © 2007 Mikko Rasa
+Distributed under the LGPL
+*/
+#ifndef MSP_STRINGS_REGEX_H_
+#define MSP_STRINGS_REGEX_H_
+
+#include <string>
+#include "regmatch.h"
+
+namespace Msp {
+
+/**
+This class provides regular expression matching. It supports a subset of
+POSIX.2 extended regex syntax. Character classes, equivalence classes and
+collating elements are not supported. Refer to the regex(7) manpage for more
+details.
+
+A description of the internal workings of the class follows. You may skip this
+if you are not a developer or otherwise interested.
+
+The core of this class is a virtual machine running a non-deterministic finite
+automaton (NFA). The state of the automaton is represented by an iterator into
+the code. Transitions between states are represented by instructions.
+Instructions may take multiple bytes, so a valid code iterator may not be a
+valid NFA state.
+
+The virtual machine may have any number of execution contexts at any one time.
+On every cycle, each context is advanced until an input-consuming instruction
+is encountered, keeping the input positions in sync with each other. Execution
+continues as long as there's at least one context remaining.
+
+The GROUP_BEGIN and GROUP_END instructions record the begin and end offset of a
+match group, respectively. The GROUP_END instruction also marks the group as
+successfully matched. If the target group was already matched, these
+instructions do nothing.
+
+The JUMP instruction causes the execution iterator to be modified by an offset.
+
+The ND_JUMP instruction causes the execution context to be split in two. One
+continues directly after the instruction and the other continues at an offset.
+
+The NEGATE instruction causes the result of the next match instruction to be
+negated.
+
+Match instructions compare the input against a condition. If the match
+succeeds, execution continues at the next instruction. If the match fails,
+execution of that context is terminated.
+
+The MATCH_BEGIN and MATCH_END instructions match the beginning and end of
+the input string, respectively. They do not consume input.
+
+The MATCH_CHAR instruction consumes the input character and matches it against
+a single character. Since regexes often match sequences of printable character,
+a match for a non-opcode character may be encoded as the character itself.
+
+The MATCH_RANGE instruction consumes the input character and matches it against
+an inclusive character range.
+
+The MATCH_MASK instruction consumes the input character and matches it against
+a bitmask.
+
+The MATCH_ANY instruction consumes the input character and always succeeds.
+*/
+class Regex
+{
+private:
+ typedef std::string Code;
+ typedef unsigned short Count;
+ typedef short Offset;
+ typedef unsigned short Index;
+
+ enum Instruction
+ {
+ FIRST_INSTRUCTION_ = 0,
+
+ JUMP,
+ ND_JUMP,
+
+ GROUP_BEGIN,
+ GROUP_END,
+
+ NEGATE,
+
+ MATCH_BEGIN,
+ MATCH_END,
+ MATCH_CHAR,
+ MATCH_RANGE,
+ MATCH_MASK,
+ MATCH_ANY,
+
+ LAST_INSTRUCTION_ = 31
+ };
+
+ struct RunContext
+ {
+ Code::const_iterator citer;
+ RegMatch::GroupArray groups;
+ };
+
+ Code code;
+ unsigned n_groups;
+
+public:
+ /** Constructs a new Regex object from a string representation. */
+ Regex(const std::string &expr);
+
+private:
+ /** Compiles a regular expression into NFA bytecode. , 2011The iterator will be
+ advanced to the first unused character in the string. */
+ Code compile(const std::string &expr, std::string::const_iterator &iter, unsigned &group, bool branch);
+
+ Code parse_atom(const std::string &, std::string::const_iterator &i, unsigned &);
+ Code parse_brackets(const std::string &, std::string::const_iterator &);
+ bool parse_repeat(std::string::const_iterator &, Count &, Count &);
+
+public:
+ /** Matches the regex against a string. Refer to RegMatch documentation for
+ more information on the resulting object. */
+ RegMatch match(const std::string &str) const;
+
+private:
+ bool run(const std::string &, const std::string::const_iterator &, RegMatch::GroupArray &) const;
+ bool group_compare(const RegMatch::Group &, const RegMatch::Group &) const;
+
+public:
+ /** Returns a disassembled representation of the NFA bytecode. For debugging
+ purposes. */
+ std::string disassemble() const;
+private:
+ std::string disassemble_instruction(Code::const_iterator &) const;
+};
+
+} // namespace Msp
+
+#endif