git.tdb.fi Git - libs/core.git/commitdiff
Optimize memory accesses in Regex::run
author Mikko Rasa <tdb@tdb.fi>
Fri, 29 Dec 2023 20:36:05 +0000 (22:36 +0200)
committer Mikko Rasa <tdb@tdb.fi>
Fri, 29 Dec 2023 22:27:10 +0000 (00:27 +0200)
Use a vector instead of a linked list to store execution contexts, and
store matched groups in contiguous memory as well.

source/strings/regex.cpp
source/strings/regex.h

index 5307c6990286ba373291281fb4ee230ff81d2d07..c61fdfef6a09aff6ad305bbabe3e5d25109e8ab5 100644 (file)
@@ -375,15 +375,16 @@ RegMatch Regex::match(const string &str) const
        return RegMatch();
 }
 
-bool Regex::run(const string &str, const string::const_iterator &begin, vector<RegMatch::Group> &groups) const
+bool Regex::run(const string &str, const string::const_iterator &begin, vector<RegMatch::Group> &out_groups) const
 {
        bool result = false;
-       list<RunContext> ctx;
-       ctx.push_back(RunContext());
-       ctx.front().citer = code.begin();
-       ctx.front().groups.resize(groups.size());
+       vector<RunContext> ctx(1);
+       ctx.front().code_iter = code.begin();
+       vector<RegMatch::Group> groups(out_groups.size());
+       size_t ctx_count = 1;
+       size_t best_groups = 0;
 
-       for(auto i=begin;;)
+       for(auto i=begin;; ++i)
        {
                int c;
                if(i!=str.end())
@@ -391,55 +392,65 @@ bool Regex::run(const string &str, const string::const_iterator &begin, vector<R
                else
                        c = -1;
 
-               for(auto j=ctx.begin(); j!=ctx.end();)
+               for(size_t j=0; j<ctx_count; ++j)
                {
-                       bool terminate = false;
                        bool negate_match = false;
-                       for(; j->citer!=code.end();)
+                       while(ctx[j].code_iter!=code.end())
                        {
-                               Instruction instr = static_cast<Instruction>(*j->citer++);
+                               Instruction instr = static_cast<Instruction>(*ctx[j].code_iter++);
 
                                if(instr==NEGATE)
                                        negate_match = true;
                                else if(instr==JUMP)
                                {
-                                       Offset offset = read_int<Offset>(j->citer);
-                                       j->citer += offset;
+                                       Offset offset = read_int<Offset>(ctx[j].code_iter);
+                                       ctx[j].code_iter += offset;
                                }
                                else if(instr==ND_JUMP)
                                {
-                                       Offset offset = read_int<Offset>(j->citer);
-                                       ctx.push_back(*j);
-                                       ctx.back().citer += offset;
+                                       Offset offset = read_int<Offset>(ctx[j].code_iter);
+                                       if(ctx_count>=ctx.size())
+                                       {
+                                               ctx.emplace_back();
+                                               ctx[ctx_count].groups_index = groups.size();
+                                               groups.resize(groups.size()+out_groups.size());
+                                       }
+                                       ctx[ctx_count].code_iter = ctx[j].code_iter+offset;
+                                       RegMatch::Group *groups_ptr = groups.data()+ctx[j].groups_index;
+                                       copy(groups_ptr, groups_ptr+out_groups.size(), groups.data()+ctx[ctx_count].groups_index);
+                                       ++ctx_count;
                                }
                                else if(instr==GROUP_BEGIN)
                                {
-                                       Index n = read_int<Index>(j->citer);
-                                       if(!j->groups[n].match)
-                                               j->groups[n].begin = i-str.begin();
+                                       RegMatch::Group *groups_ptr = groups.data()+ctx[j].groups_index;
+                                       Index n = read_int<Index>(ctx[j].code_iter);
+                                       if(!groups_ptr[n].match)
+                                               groups_ptr[n].begin = i-str.begin();
                                }
                                else if(instr==GROUP_END)
                                {
-                                       Index n = read_int<Index>(j->citer);
-                                       if(!j->groups[n].match)
+                                       RegMatch::Group *groups_ptr = groups.data()+ctx[j].groups_index;
+                                       Index n = read_int<Index>(ctx[j].code_iter);
+                                       if(!groups_ptr[n].match)
                                        {
-                                               j->groups[n].match = true;
-                                               j->groups[n].end = i-str.begin();
-                                               j->groups[n].length = j->groups[n].end-j->groups[n].begin;
+                                               groups_ptr[n].match = true;
+                                               groups_ptr[n].end = i-str.begin();
+                                               groups_ptr[n].length = groups_ptr[n].end-groups_ptr[n].begin;
                                        }
 
                                        if(n==0)
                                        {
                                                result = true;
                                                bool better = false;
-                                               for(unsigned k=0; (k<groups.size() && !better); ++k)
+                                               const RegMatch::Group *best_ptr = groups.data()+best_groups;
+                                               for(unsigned k=0; (k<out_groups.size() && !better); ++k)
                                                {
-                                                       better = group_compare(j->groups[k], groups[k]);
-                                                       if(group_compare(groups[k], j->groups[k]))
+                                                       better = group_compare(groups_ptr[k], best_ptr[k]);
+                                                       if(group_compare(best_ptr[k], groups_ptr[k]))
                                                                break;
                                                }
                                                if(better)
-                                                       groups = j->groups;
+                                                       best_groups = ctx[j].groups_index;
                                        }
                                }
                                else
@@ -452,13 +463,13 @@ bool Regex::run(const string &str, const string::const_iterator &begin, vector<R
                                                match_result = (i==str.end());
                                        else if(instr==MATCH_CHAR)
                                        {
-                                               match_result = (c==*j->citer++);
+                                               match_result = (c==*ctx[j].code_iter++);
                                                input_consumed = true;
                                        }
                                        else if(instr==MATCH_RANGE)
                                        {
-                                               unsigned char first = *j->citer++;
-                                               unsigned char last = *j->citer++;
+                                               unsigned char first = *ctx[j].code_iter++;
+                                               unsigned char last = *ctx[j].code_iter++;
                                                match_result = (c>=first && c<=last);
                                                input_consumed = true;
                                        }
@@ -466,11 +477,11 @@ bool Regex::run(const string &str, const string::const_iterator &begin, vector<R
                                        {
                                                if(c>=0 && c<=0xFF)
                                                {
-                                                       unsigned char m = *(j->citer+(c>>3));
+                                                       unsigned char m = *(ctx[j].code_iter+(c>>3));
                                                        match_result = m&(1<<(c&7));
                                                }
                                                input_consumed = true;
-                                               j->citer += 32;
+                                               ctx[j].code_iter += 32;
                                        }
                                        else if(instr==MATCH_ANY)
                                        {
@@ -481,25 +492,34 @@ bool Regex::run(const string &str, const string::const_iterator &begin, vector<R
                                                throw internal_error("invalid instruction in regex bytecode");
 
                                        if(match_result==negate_match)
-                                               terminate = true;
+                                               ctx[j].code_iter = code.end();
                                        negate_match = false;
 
-                                       if(input_consumed || terminate)
+                                       if(input_consumed)
                                                break;
                                }
                        }
+               }
 
-                       if(terminate || j->citer==code.end())
-                               j = ctx.erase(j);
+               for(size_t j=0; j<ctx_count; )
+               {
+                       if(ctx[j].code_iter==code.end())
+                       {
+                               if(j!=ctx_count-1)
+                                       swap(ctx[j], ctx[ctx_count-1]);
+                               --ctx_count;
+                       }
                        else
                                ++j;
                }
 
-               if(i==str.end() || ctx.empty())
+               if(i==str.end() || !ctx_count)
                        break;
-               ++i;
        }
 
+       const RegMatch::Group *best_ptr = groups.data()+best_groups;
+       copy(best_ptr, best_ptr+out_groups.size(), out_groups.begin());
+
        return result;
 }
 
index 42c1e01a6951a4c3229c4d5b9eadad18bc119d0d..e0ef08d328f9cc467e95aad4f5d9998374ce6e9e 100644 (file)
@@ -102,8 +102,8 @@ private:
 
        struct RunContext
        {
-               Code::const_iterator citer;
-               std::vector<RegMatch::Group> groups;
+               Code::const_iterator code_iter;
+               std::size_t groups_index = 0;
        };
 
        Code code;