]> git.tdb.fi Git - libs/core.git/blob - source/regex.cpp
839435ef3bf62fd9c95134efd005e97d307dc011
[libs/core.git] / source / regex.cpp
1 #include <stack>
2 #include <msp/core/error.h>
3 #include "formatter.h"
4 #include "regex.h"
5
6 using namespace std;
7
8 #include <iostream>
9
10 namespace {
11
12 /**
13 Writes an integer to a Regex code string, in little-endian order.
14 */
15 template<typename T>
16 void write_int(T n, Msp::Regex::Code &code)
17 {
18         for(unsigned i=0; i<sizeof(T); ++i)
19                 code+=(n>>i*8)&0xFF;
20 }
21
22 /**
23 Reads an integer from a Regex code stream, in little-endian order.
24 */
25 template<typename T>
26 T read_int(Msp::Regex::Code::const_iterator &c)
27 {
28         T result=0;
29         for(unsigned i=0; i<sizeof(T); ++i)
30                 result+=static_cast<unsigned char>(*c++)<<i*8;
31         return result;
32 }
33
34 }
35
36 namespace Msp {
37
38 Regex::Regex(const string &expr)
39 {
40         n_groups=0;
41         string::const_iterator iter=expr.begin();
42         code=compile(expr, iter, n_groups, false);
43         ++n_groups;
44 }
45
46 RegMatch Regex::match(const string &str) const
47 {
48         RegMatch::GroupArray groups(n_groups);
49
50         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
51                 if(run(str, i, groups))
52                         return RegMatch(str, groups);
53
54         return RegMatch();
55 }
56
57 string Regex::disassemble() const
58 {
59         ostringstream ss;
60
61         for(Code::const_iterator i=code.begin(); i!=code.end();)
62         {
63                 Code::const_iterator j=i;
64                 Offset offset=i-code.begin();
65                 string decompiled=disassemble_instruction(i);
66                 string bytes;
67                 for(; j!=i; ++j)
68                         bytes+=format(" %02X", static_cast<int>(*j)&0xFF);
69                 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
70                 if(bytes.size()>9)
71                         ss<<"\n"<<Fmt("%15s");
72                 ss<<"  "<<decompiled<<'\n';
73         }
74
75         return ss.str();
76 }
77
78 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
79 {
80         bool has_branches=false;
81         unsigned level=0;
82         bool escape=false;
83         string::const_iterator end;
84         for(end=iter; end!=expr.end(); ++end)
85         {
86                 if(escape)
87                         escape=false;
88                 else if(*end=='\\')
89                         escape=true;
90                 else if(*end=='(')
91                         ++level;
92                 else if(*end==')')
93                 {
94                         if(level==0)
95                         {
96                                 if(group==0)
97                                         throw InvalidParameterValue("Unexpected )");
98                                 else
99                                         break;
100                         }
101                         --level;
102                 }
103                 else if(*end=='|')
104                 {
105                         if(branch)
106                                 break;
107                         else if(level==0)
108                                 has_branches=true;
109                 }
110         }
111
112         if(level>0)
113                 throw InvalidParameterValue("Unmatched (");
114
115         Code result;
116
117         unsigned this_group=group;
118         if(!branch)
119         {
120                 result+=GROUP_BEGIN;
121                 write_int<Index>(this_group, result);
122         }
123
124         const unsigned jump_size=1+sizeof(Offset);
125
126         if(!has_branches)
127         {
128                 for(string::const_iterator i=iter; i!=end;)
129                 {
130                         Code atom=parse_atom(expr, i, group);
131
132                         Count repeat_min=1;
133                         Count repeat_max=1;
134                         parse_repeat(i, repeat_min, repeat_max);
135
136                         for(unsigned j=0; j<repeat_min; ++j)
137                                 result+=atom;
138                         if(repeat_max==numeric_limits<Count>::max())
139                         {
140                                 if(repeat_min==0)
141                                 {
142                                         result+=ND_JUMP;
143                                         write_int<Offset>(atom.size()+jump_size, result);
144                                         result+=atom;
145                                 }
146                                 result+=ND_JUMP;
147                                 write_int<Offset>(-(atom.size()+jump_size), result);
148                         }
149                         else if(repeat_max>repeat_min)
150                         {
151                                 for(unsigned j=repeat_min; j<repeat_max; ++j)
152                                 {
153                                         result+=ND_JUMP;
154                                         write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
155                                         result+=atom;
156                                 }
157                         }
158                 }
159         }
160         else
161         {
162                 list<Code> branches;
163                 for(string::const_iterator i=iter;;)
164                 {
165                         branches.push_back(compile(expr, i, group, true));
166                         if(i==end)
167                                 break;
168                         ++i;
169                 }
170
171                 unsigned n_branches=branches.size();
172
173                 Offset offset=(n_branches-1)*jump_size+branches.front().size();
174                 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
175                 {
176                         result+=ND_JUMP;
177                         write_int<Offset>(offset, result);
178                         offset+=i->size();
179                 }
180
181                 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
182                 {
183                         result+=*i;
184                         offset-=i->size()+jump_size;
185                         ++i;
186                         if(i!=branches.end())
187                         {
188                                 result+=JUMP;
189                                 write_int<Offset>(offset, result);
190                         }
191                 }
192         }
193
194         if(!branch)
195         {
196                 result+=GROUP_END;
197                 write_int<Index>(this_group, result);
198         }
199
200         iter=end;
201
202         return result;
203 }
204
205 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
206 {
207         Code result;
208
209         if(i==expr.end())
210                 return result;
211
212         bool flag=false;
213         if(*i=='\\')
214         {
215                 if(++i==expr.end())
216                         throw InvalidParameterValue("Stray backslash");
217                 flag=true;
218         }
219
220         if(!flag)
221         {
222                 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
223                         throw InvalidParameterValue("Invalid atom");
224                 else if(*i=='[')
225                         return parse_brackets(expr, i);
226                 else if(*i=='.')
227                         result+=MATCH_ANY;
228                 else if(*i=='^')
229                         result+=MATCH_BEGIN;
230                 else if(*i=='$')
231                         result+=MATCH_END;
232                 else if(*i=='(')
233                 {
234                         ++group;
235                         result=compile(expr, ++i, group, false);
236                 }
237                 else
238                         flag=true;
239         }
240
241         if(flag)
242         {
243                 if(static_cast<unsigned char>(*i)<=LAST_INSTRUCTION_)
244                         result+=MATCH_CHAR;
245                 result+=*i;
246         }
247
248         ++i;
249
250         return result;
251 }
252
253 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
254 {
255         if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
256                 return false;
257
258         if(*i=='*' || *i=='+')
259                 rmax=numeric_limits<Count>::max();
260         if(*i=='*' || *i=='?')
261                 rmin=0;
262         if(*i=='{')
263         {
264                 rmin=0;
265                 for(++i; isdigit(*i); ++i)
266                         rmin=rmin*10+(*i-'0');
267
268                 if(*i==',')
269                 {
270                         ++i;
271                         if(*i!='}')
272                         {
273                                 rmax=0;
274                                 for(; isdigit(*i); ++i)
275                                         rmax=rmax*10+(*i-'0');
276                                 if(rmax<rmin)
277                                         throw InvalidParameterValue("Invalid bound");
278                         }
279                         else
280                                 rmax=numeric_limits<Count>::max();
281                 }
282                 else
283                         rmax=rmin;
284                 if(*i!='}')
285                         throw InvalidParameterValue("Invalid bound");
286         }
287
288         ++i;
289
290         return true;
291 }
292
293 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
294 {
295         Code result;
296
297         ++iter;
298         bool neg=false;
299         if(*iter=='^')
300         {
301                 neg=true;
302                 ++iter;
303         }
304
305         string::const_iterator end=iter;
306         for(; (end!=str.end() && *end!=']'); ++end);
307         if(end==str.end())
308                 throw InvalidParameterValue("Unmatched '['");
309
310         uint8_t mask[32]={0};
311         unsigned type=0;
312         bool range=false;
313         unsigned char first, last;
314         for(string::const_iterator i=iter; i!=end; ++i)
315         {
316                 unsigned char c=*i;
317                 if(range)
318                 {
319                         last=c;
320                         for(unsigned j=first; j<=c; ++j)
321                                 mask[j>>3]|=1<<(j&7);
322                         range=false;
323                         if(type<2)
324                                 type=2;
325                 }
326                 else if(c=='-' && i!=iter && end-i>1)
327                         range=true;
328                 else
329                 {
330                         first=c;
331                         mask[c>>3]|=1<<(c&7);
332                         if(type==0)
333                                 type=1;
334                         else
335                                 type=3;
336                 }
337         }
338
339         if(neg)
340                 result+=NEGATE;
341
342         if(type==1)
343         {
344                 result+=MATCH_CHAR;
345                 result+=first;
346         }
347         else if(type==2)
348         {
349                 result+=MATCH_RANGE;
350                 result+=first;
351                 result+=last;
352         }
353         else
354         {
355                 result+=MATCH_MASK;
356                 result.append(reinterpret_cast<char *>(mask), 32);
357         }
358
359         iter=end;
360         ++iter;
361
362         return result;
363 }
364
365 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
366 {
367         bool result=false;
368         list<RunContext> ctx;
369         ctx.push_back(RunContext());
370         ctx.front().citer=code.begin();
371         ctx.front().groups.resize(groups.size());
372
373         for(string::const_iterator i=begin;;)
374         {
375                 int c;
376                 if(i!=str.end())
377                         c=static_cast<unsigned char>(*i);
378                 else
379                         c=-1;
380
381                 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
382                 {
383                         bool terminate=false;
384                         bool negate_match=false;
385                         for(; j->citer!=code.end();)
386                         {
387                                 Instruction instr=static_cast<Instruction>(*j->citer);
388                                 if(instr>LAST_INSTRUCTION_)
389                                         instr=MATCH_CHAR;
390                                 else
391                                         ++j->citer;
392
393                                 if(instr==NEGATE)
394                                         negate_match=true;
395                                 else if(instr==JUMP)
396                                 {
397                                         Offset offset=read_int<Offset>(j->citer);
398                                         j->citer+=offset;
399                                 }
400                                 else if(instr==ND_JUMP)
401                                 {
402                                         Offset offset=read_int<Offset>(j->citer);
403                                         ctx.push_back(*j);
404                                         ctx.back().citer+=offset;
405                                 }
406                                 else if(instr==GROUP_BEGIN)
407                                 {
408                                         Index n=read_int<Index>(j->citer);
409                                         if(!j->groups[n].match)
410                                                 j->groups[n].begin=i-str.begin();
411                                 }
412                                 else if(instr==GROUP_END)
413                                 {
414                                         Index n=read_int<Index>(j->citer);
415                                         if(!j->groups[n].match)
416                                         {
417                                                 j->groups[n].match=true;
418                                                 j->groups[n].end=i-str.begin();
419                                                 j->groups[n].length=j->groups[n].end-j->groups[n].begin;
420                                         }
421
422                                         if(n==0)
423                                         {
424                                                 result=true;
425                                                 bool better=false;
426                                                 for(unsigned k=0; (k<groups.size() && !better); ++k)
427                                                 {
428                                                         better=group_compare(j->groups[k], groups[k]);
429                                                         if(group_compare(groups[k], j->groups[k]))
430                                                                 break;
431                                                 }
432                                                 if(better)
433                                                         groups=j->groups;
434                                         }
435                                 }
436                                 else
437                                 {
438                                         bool match_result=false;
439                                         bool input_consumed=false;
440                                         if(instr==MATCH_BEGIN)
441                                                 match_result=(i==str.begin());
442                                         else if(instr==MATCH_END)
443                                                 match_result=(i==str.end());
444                                         else if(instr==MATCH_CHAR)
445                                         {
446                                                 match_result=(c==*j->citer++);
447                                                 input_consumed=true;
448                                         }
449                                         else if(instr==MATCH_RANGE)
450                                         {
451                                                 unsigned char first=*j->citer++;
452                                                 unsigned char last=*j->citer++;
453                                                 match_result=(c>=first && c<=last);
454                                                 input_consumed=true;
455                                         }
456                                         else if(instr==MATCH_MASK)
457                                         {
458                                                 uint8_t mask[32];
459                                                 for(unsigned k=0; k<32; ++k)
460                                                         mask[k]=*j->citer++;
461                                                 match_result=mask[c>>3]&(1<<(c&7));
462                                                 input_consumed=true;
463                                         }
464                                         else if(instr==MATCH_ANY)
465                                         {
466                                                 match_result=true;
467                                                 input_consumed=true;
468                                         }
469                                         else
470                                                 throw Exception("Invalid instruction");
471
472                                         if(match_result==negate_match)
473                                                 terminate=true;
474                                         negate_match=false;
475
476                                         if(input_consumed || terminate)
477                                                 break;
478                                 }
479                         }
480
481                         if(terminate || j->citer==code.end())
482                                 j=ctx.erase(j);
483                         else
484                                 ++j;
485                 }
486
487                 if(i==str.end() || ctx.empty())
488                         break;
489                 ++i;
490         }
491
492         return result;
493 }
494
495 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
496 {
497         if(!g1.match)
498                 return false;
499         
500         // Any match is better than no match
501         if(!g2.match)
502                 return true;
503
504         // Earlier match is better
505         if(g1.begin<g2.begin)
506                 return true;
507         if(g2.begin>g2.begin)
508                 return false;
509
510         // Longer match at same position is better
511         return g1.end>g2.end;
512 }
513
514 string Regex::disassemble_instruction(Code::const_iterator &i) const
515 {
516         Instruction instr=static_cast<Instruction>(*i);
517         if(instr>=LAST_INSTRUCTION_)
518                 instr=MATCH_CHAR;
519         else
520                 ++i;
521
522         ostringstream result;
523         switch(instr)
524         {
525         case JUMP:
526                 {
527                         Offset offset=read_int<Offset>(i);
528                         result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
529                 }
530                 break;
531         case ND_JUMP:
532                 {
533                         Offset offset=read_int<Offset>(i);
534                         result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
535                 }
536                 break;
537         case GROUP_BEGIN:
538                 result<<"GROUP_BEGIN "<<read_int<Index>(i);
539                 break;
540         case GROUP_END:
541                 result<<"GROUP_END "<<read_int<Index>(i);
542                 break;
543         case NEGATE:
544                 result<<"NEGATE";
545                 break;
546         case MATCH_BEGIN:
547                 result<<"MATCH_BEGIN";
548                 break;
549         case MATCH_END:
550                 result<<"MATCH_END";
551                 break;
552         case MATCH_CHAR:
553                 {
554                         char c=*i++;
555                         result<<"MATCH_CHAR ";
556                         if(c>=0x20 && c<=0x7E)
557                                 result<<'\''<<c<<'\'';
558                         else
559                                 result<<(static_cast<int>(c)&0xFF);
560                 }
561                 break;
562         case MATCH_RANGE:
563                 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
564                 result<<'-'<<(static_cast<int>(*i++)&0xFF);
565                 break;
566         case MATCH_MASK:
567                 result<<"MATCH_MASK";
568                 for(unsigned j=0; j<32; ++j)
569                         result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
570                 break;
571         case MATCH_ANY:
572                 result<<"MATCH_ANY";
573                 break;
574         case FIRST_INSTRUCTION_:
575         case LAST_INSTRUCTION_:
576                 result<<"UNKNOWN "<<instr;
577         }
578
579         return result.str();
580 }
581
582 } // namespace Msp