From 5b5776a76b24e26a46d3bacefffa57485cb94abc Mon Sep 17 00:00:00 2001 From: Mikko Rasa Date: Fri, 29 Dec 2023 22:36:05 +0200 Subject: [PATCH] Optimize memory accesses in Regex::run Use a vector instead of a linked list to store execution contexts, and store matched groups in contiguous memory as well. --- source/strings/regex.cpp | 96 ++++++++++++++++++++++++---------------- source/strings/regex.h | 4 +- 2 files changed, 60 insertions(+), 40 deletions(-) diff --git a/source/strings/regex.cpp b/source/strings/regex.cpp index 5307c69..c61fdfe 100644 --- a/source/strings/regex.cpp +++ b/source/strings/regex.cpp @@ -375,15 +375,16 @@ RegMatch Regex::match(const string &str) const return RegMatch(); } -bool Regex::run(const string &str, const string::const_iterator &begin, vector &groups) const +bool Regex::run(const string &str, const string::const_iterator &begin, vector &out_groups) const { bool result = false; - list ctx; - ctx.push_back(RunContext()); - ctx.front().citer = code.begin(); - ctx.front().groups.resize(groups.size()); + vector ctx(1); + ctx.front().code_iter = code.begin(); + vector groups(out_groups.size()); + size_t ctx_count = 1; + size_t best_groups = 0; - for(auto i=begin;;) + for(auto i=begin;; ++i) { int c; if(i!=str.end()) @@ -391,55 +392,65 @@ bool Regex::run(const string &str, const string::const_iterator &begin, vectorciter!=code.end();) + while(ctx[j].code_iter!=code.end()) { - Instruction instr = static_cast(*j->citer++); + Instruction instr = static_cast(*ctx[j].code_iter++); if(instr==NEGATE) negate_match = true; else if(instr==JUMP) { - Offset offset = read_int(j->citer); - j->citer += offset; + Offset offset = read_int(ctx[j].code_iter); + ctx[j].code_iter += offset; } else if(instr==ND_JUMP) { - Offset offset = read_int(j->citer); - ctx.push_back(*j); - ctx.back().citer += offset; + Offset offset = read_int(ctx[j].code_iter); + if(ctx_count>=ctx.size()) + { + ctx.emplace_back(); + ctx[ctx_count].groups_index = groups.size(); + groups.resize(groups.size()+out_groups.size()); + } + ctx[ctx_count].code_iter = ctx[j].code_iter+offset; + RegMatch::Group *groups_ptr = groups.data()+ctx[j].groups_index; + copy(groups_ptr, groups_ptr+out_groups.size(), groups.data()+ctx[ctx_count].groups_index); + ++ctx_count; } else if(instr==GROUP_BEGIN) { - Index n = read_int(j->citer); - if(!j->groups[n].match) - j->groups[n].begin = i-str.begin(); + RegMatch::Group *groups_ptr = groups.data()+ctx[j].groups_index; + Index n = read_int(ctx[j].code_iter); + if(!groups_ptr[n].match) + groups_ptr[n].begin = i-str.begin(); } else if(instr==GROUP_END) { - Index n = read_int(j->citer); - if(!j->groups[n].match) + RegMatch::Group *groups_ptr = groups.data()+ctx[j].groups_index; + Index n = read_int(ctx[j].code_iter); + if(!groups_ptr[n].match) { - j->groups[n].match = true; - j->groups[n].end = i-str.begin(); - j->groups[n].length = j->groups[n].end-j->groups[n].begin; + groups_ptr[n].match = true; + groups_ptr[n].end = i-str.begin(); + groups_ptr[n].length = groups_ptr[n].end-groups_ptr[n].begin; } if(n==0) { result = true; bool better = false; - for(unsigned k=0; (kgroups[k], groups[k]); - if(group_compare(groups[k], j->groups[k])) + better = group_compare(groups_ptr[k], best_ptr[k]); + if(group_compare(best_ptr[k], groups_ptr[k])) break; } if(better) - groups = j->groups; + best_groups = ctx[j].groups_index; } } else @@ -452,13 +463,13 @@ bool Regex::run(const string &str, const string::const_iterator &begin, vectorciter++); + match_result = (c==*ctx[j].code_iter++); input_consumed = true; } else if(instr==MATCH_RANGE) { - unsigned char first = *j->citer++; - unsigned char last = *j->citer++; + unsigned char first = *ctx[j].code_iter++; + unsigned char last = *ctx[j].code_iter++; match_result = (c>=first && c<=last); input_consumed = true; } @@ -466,11 +477,11 @@ bool Regex::run(const string &str, const string::const_iterator &begin, vector=0 && c<=0xFF) { - unsigned char m = *(j->citer+(c>>3)); + unsigned char m = *(ctx[j].code_iter+(c>>3)); match_result = m&(1<<(c&7)); } input_consumed = true; - j->citer += 32; + ctx[j].code_iter += 32; } else if(instr==MATCH_ANY) { @@ -481,25 +492,34 @@ bool Regex::run(const string &str, const string::const_iterator &begin, vectorciter==code.end()) - j = ctx.erase(j); + for(size_t j=0; j groups; + Code::const_iterator code_iter; + std::size_t groups_index = 0; }; Code code; -- 2.45.2