]> git.tdb.fi Git - libs/core.git/blob - source/regex.cpp
9ba089d09008c7fd1c23a9d700051984eab11362
[libs/core.git] / source / regex.cpp
1 /* $Id$
2
3 This file is part of libmspstrings
4 Copyright © 2007 Mikko Rasa
5 Distributed under the LGPL
6 */
7
8 #include <stack>
9 #include <limits>
10 #include <msp/core/except.h>
11 #include "formatter.h"
12 #include "regex.h"
13
14 using namespace std;
15
16 namespace {
17
18 /**
19 Writes an integer to a Regex code string, in little-endian order.
20 */
21 template<typename T>
22 void write_int(T n, Msp::Regex::Code &code)
23 {
24         for(unsigned i=0; i<sizeof(T); ++i)
25                 code+=(n>>i*8)&0xFF;
26 }
27
28 /**
29 Reads an integer from a Regex code stream, in little-endian order.
30 */
31 template<typename T>
32 T read_int(Msp::Regex::Code::const_iterator &c)
33 {
34         T result=0;
35         for(unsigned i=0; i<sizeof(T); ++i)
36                 result+=static_cast<unsigned char>(*c++)<<i*8;
37         return result;
38 }
39
40 }
41
42 namespace Msp {
43
44 Regex::Regex(const string &expr)
45 {
46         n_groups=0;
47         string::const_iterator iter=expr.begin();
48         code=compile(expr, iter, n_groups, false);
49         ++n_groups;
50 }
51
52 RegMatch Regex::match(const string &str) const
53 {
54         RegMatch::GroupArray groups(n_groups);
55
56         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
57                 if(run(str, i, groups))
58                         return RegMatch(str, groups);
59
60         return RegMatch();
61 }
62
63 string Regex::disassemble() const
64 {
65         ostringstream ss;
66
67         for(Code::const_iterator i=code.begin(); i!=code.end();)
68         {
69                 Code::const_iterator j=i;
70                 Offset offset=i-code.begin();
71                 string decompiled=disassemble_instruction(i);
72                 string bytes;
73                 for(; j!=i; ++j)
74                         bytes+=format(" %02X", static_cast<int>(*j)&0xFF);
75                 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
76                 if(bytes.size()>9)
77                         ss<<"\n"<<Fmt("%15s");
78                 ss<<"  "<<decompiled<<'\n';
79         }
80
81         return ss.str();
82 }
83
84 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
85 {
86         bool has_branches=false;
87         unsigned level=0;
88         bool escape=false;
89         unsigned bracket=0;
90         string::const_iterator end;
91         for(end=iter; end!=expr.end(); ++end)
92         {
93                 if(escape)
94                         escape=false;
95                 else if(bracket)
96                 {
97                         if(bracket==3 && *end==']')
98                                 bracket=0;
99                         else if(bracket==1 && *end=='^')
100                                 bracket=2;
101                         else
102                                 bracket=3;
103                 }
104                 else if(*end=='\\')
105                         escape=true;
106                 else if(*end=='(')
107                         ++level;
108                 else if(*end==')')
109                 {
110                         if(level==0)
111                         {
112                                 if(group==0)
113                                         throw InvalidParameterValue("Unexpected )");
114                                 else
115                                         break;
116                         }
117                         --level;
118                 }
119                 else if(*end=='|' && level==0)
120                 {
121                         if(branch)
122                                 break;
123                         else
124                                 has_branches=true;
125                 }
126                 else if(*end=='[')
127                         bracket=1;
128         }
129
130         if(level>0)
131                 throw InvalidParameterValue("Unmatched (");
132
133         Code result;
134
135         unsigned this_group=group;
136         if(!branch)
137         {
138                 result+=GROUP_BEGIN;
139                 write_int<Index>(this_group, result);
140         }
141
142         const unsigned jump_size=1+sizeof(Offset);
143
144         if(!has_branches)
145         {
146                 for(string::const_iterator i=iter; i!=end;)
147                 {
148                         Code atom=parse_atom(expr, i, group);
149
150                         Count repeat_min=1;
151                         Count repeat_max=1;
152                         parse_repeat(i, repeat_min, repeat_max);
153
154                         for(unsigned j=0; j<repeat_min; ++j)
155                                 result+=atom;
156                         if(repeat_max==numeric_limits<Count>::max())
157                         {
158                                 if(repeat_min==0)
159                                 {
160                                         result+=ND_JUMP;
161                                         write_int<Offset>(atom.size()+jump_size, result);
162                                         result+=atom;
163                                 }
164                                 result+=ND_JUMP;
165                                 write_int<Offset>(-(atom.size()+jump_size), result);
166                         }
167                         else if(repeat_max>repeat_min)
168                         {
169                                 for(unsigned j=repeat_min; j<repeat_max; ++j)
170                                 {
171                                         result+=ND_JUMP;
172                                         write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
173                                         result+=atom;
174                                 }
175                         }
176                 }
177         }
178         else
179         {
180                 list<Code> branches;
181                 for(string::const_iterator i=iter;;)
182                 {
183                         branches.push_back(compile(expr, i, group, true));
184                         if(i==end)
185                                 break;
186                         ++i;
187                 }
188
189                 unsigned n_branches=branches.size();
190
191                 Offset offset=(n_branches-1)*jump_size+branches.front().size();
192                 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
193                 {
194                         result+=ND_JUMP;
195                         write_int<Offset>(offset, result);
196                         offset+=i->size();
197                 }
198
199                 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
200                 {
201                         result+=*i;
202                         offset-=i->size()+jump_size;
203                         ++i;
204                         if(i!=branches.end())
205                         {
206                                 result+=JUMP;
207                                 write_int<Offset>(offset, result);
208                         }
209                 }
210         }
211
212         if(!branch)
213         {
214                 result+=GROUP_END;
215                 write_int<Index>(this_group, result);
216         }
217
218         iter=end;
219
220         return result;
221 }
222
223 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
224 {
225         Code result;
226
227         if(i==expr.end())
228                 return result;
229
230         bool flag=false;
231         if(*i=='\\')
232         {
233                 if(++i==expr.end())
234                         throw InvalidParameterValue("Stray backslash");
235                 flag=true;
236         }
237
238         if(!flag)
239         {
240                 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
241                         throw InvalidParameterValue("Invalid atom");
242                 else if(*i=='[')
243                         return parse_brackets(expr, i);
244                 else if(*i=='.')
245                         result+=MATCH_ANY;
246                 else if(*i=='^')
247                         result+=MATCH_BEGIN;
248                 else if(*i=='$')
249                         result+=MATCH_END;
250                 else if(*i=='(')
251                 {
252                         ++group;
253                         result=compile(expr, ++i, group, false);
254                 }
255                 else
256                         flag=true;
257         }
258
259         if(flag)
260         {
261                 result+=MATCH_CHAR;
262                 result+=*i;
263         }
264
265         ++i;
266
267         return result;
268 }
269
270 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
271 {
272         if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
273                 return false;
274
275         if(*i=='*' || *i=='+')
276                 rmax=numeric_limits<Count>::max();
277         if(*i=='*' || *i=='?')
278                 rmin=0;
279         if(*i=='{')
280         {
281                 rmin=0;
282                 for(++i; isdigit(*i); ++i)
283                         rmin=rmin*10+(*i-'0');
284
285                 if(*i==',')
286                 {
287                         ++i;
288                         if(*i!='}')
289                         {
290                                 rmax=0;
291                                 for(; isdigit(*i); ++i)
292                                         rmax=rmax*10+(*i-'0');
293                                 if(rmax<rmin)
294                                         throw InvalidParameterValue("Invalid bound");
295                         }
296                         else
297                                 rmax=numeric_limits<Count>::max();
298                 }
299                 else
300                         rmax=rmin;
301                 if(*i!='}')
302                         throw InvalidParameterValue("Invalid bound");
303         }
304
305         ++i;
306
307         return true;
308 }
309
310 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
311 {
312         Code result;
313
314         ++iter;
315         bool neg=false;
316         if(*iter=='^')
317         {
318                 neg=true;
319                 ++iter;
320         }
321
322         string::const_iterator end=iter;
323         for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
324         if(end==str.end())
325                 throw InvalidParameterValue("Unmatched '['");
326
327         unsigned char mask[32]={0};
328         unsigned type=0;
329         bool range=false;
330         unsigned char first=0, last=0;
331         for(string::const_iterator i=iter; i!=end; ++i)
332         {
333                 unsigned char c=*i;
334                 if(range)
335                 {
336                         last=c;
337                         for(unsigned j=first; j<=c; ++j)
338                                 mask[j>>3]|=1<<(j&7);
339                         range=false;
340                         if(type<2)
341                                 type=2;
342                 }
343                 else if(c=='-' && i!=iter && end-i>1)
344                         range=true;
345                 else
346                 {
347                         first=c;
348                         mask[c>>3]|=1<<(c&7);
349                         if(type==0)
350                                 type=1;
351                         else
352                                 type=3;
353                 }
354         }
355
356         if(neg)
357                 result+=NEGATE;
358
359         if(type==1)
360         {
361                 result+=MATCH_CHAR;
362                 result+=first;
363         }
364         else if(type==2)
365         {
366                 result+=MATCH_RANGE;
367                 result+=first;
368                 result+=last;
369         }
370         else
371         {
372                 result+=MATCH_MASK;
373                 result.append(reinterpret_cast<char *>(mask), 32);
374         }
375
376         iter=end;
377         ++iter;
378
379         return result;
380 }
381
382 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
383 {
384         bool result=false;
385         list<RunContext> ctx;
386         ctx.push_back(RunContext());
387         ctx.front().citer=code.begin();
388         ctx.front().groups.resize(groups.size());
389
390         for(string::const_iterator i=begin;;)
391         {
392                 int c;
393                 if(i!=str.end())
394                         c=static_cast<unsigned char>(*i);
395                 else
396                         c=-1;
397
398                 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
399                 {
400                         bool terminate=false;
401                         bool negate_match=false;
402                         for(; j->citer!=code.end();)
403                         {
404                                 Instruction instr=static_cast<Instruction>(*j->citer++);
405
406                                 if(instr==NEGATE)
407                                         negate_match=true;
408                                 else if(instr==JUMP)
409                                 {
410                                         Offset offset=read_int<Offset>(j->citer);
411                                         j->citer+=offset;
412                                 }
413                                 else if(instr==ND_JUMP)
414                                 {
415                                         Offset offset=read_int<Offset>(j->citer);
416                                         ctx.push_back(*j);
417                                         ctx.back().citer+=offset;
418                                 }
419                                 else if(instr==GROUP_BEGIN)
420                                 {
421                                         Index n=read_int<Index>(j->citer);
422                                         if(!j->groups[n].match)
423                                                 j->groups[n].begin=i-str.begin();
424                                 }
425                                 else if(instr==GROUP_END)
426                                 {
427                                         Index n=read_int<Index>(j->citer);
428                                         if(!j->groups[n].match)
429                                         {
430                                                 j->groups[n].match=true;
431                                                 j->groups[n].end=i-str.begin();
432                                                 j->groups[n].length=j->groups[n].end-j->groups[n].begin;
433                                         }
434
435                                         if(n==0)
436                                         {
437                                                 result=true;
438                                                 bool better=false;
439                                                 for(unsigned k=0; (k<groups.size() && !better); ++k)
440                                                 {
441                                                         better=group_compare(j->groups[k], groups[k]);
442                                                         if(group_compare(groups[k], j->groups[k]))
443                                                                 break;
444                                                 }
445                                                 if(better)
446                                                         groups=j->groups;
447                                         }
448                                 }
449                                 else
450                                 {
451                                         bool match_result=false;
452                                         bool input_consumed=false;
453                                         if(instr==MATCH_BEGIN)
454                                                 match_result=(i==str.begin());
455                                         else if(instr==MATCH_END)
456                                                 match_result=(i==str.end());
457                                         else if(instr==MATCH_CHAR)
458                                         {
459                                                 match_result=(c==*j->citer++);
460                                                 input_consumed=true;
461                                         }
462                                         else if(instr==MATCH_RANGE)
463                                         {
464                                                 unsigned char first=*j->citer++;
465                                                 unsigned char last=*j->citer++;
466                                                 match_result=(c>=first && c<=last);
467                                                 input_consumed=true;
468                                         }
469                                         else if(instr==MATCH_MASK)
470                                         {
471                                                 if(c>=0 && c<=0xFF)
472                                                 {
473                                                         unsigned char m=*(j->citer+(c>>3));
474                                                         match_result=m&(1<<(c&7));
475                                                         input_consumed=true;
476                                                         j->citer+=32;
477                                                 }
478                                         }
479                                         else if(instr==MATCH_ANY)
480                                         {
481                                                 match_result=true;
482                                                 input_consumed=true;
483                                         }
484                                         else
485                                                 throw Exception("Invalid instruction");
486
487                                         if(match_result==negate_match)
488                                                 terminate=true;
489                                         negate_match=false;
490
491                                         if(input_consumed || terminate)
492                                                 break;
493                                 }
494                         }
495
496                         if(terminate || j->citer==code.end())
497                                 j=ctx.erase(j);
498                         else
499                                 ++j;
500                 }
501
502                 if(i==str.end() || ctx.empty())
503                         break;
504                 ++i;
505         }
506
507         return result;
508 }
509
510 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
511 {
512         if(!g1.match)
513                 return false;
514
515         // Any match is better than no match
516         if(!g2.match)
517                 return true;
518
519         // Earlier match is better
520         if(g1.begin<g2.begin)
521                 return true;
522         if(g2.begin>g2.begin)
523                 return false;
524
525         // Longer match at same position is better
526         return g1.end>g2.end;
527 }
528
529 string Regex::disassemble_instruction(Code::const_iterator &i) const
530 {
531         Instruction instr=static_cast<Instruction>(*i++);
532
533         ostringstream result;
534         switch(instr)
535         {
536         case JUMP:
537                 {
538                         Offset offset=read_int<Offset>(i);
539                         result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
540                 }
541                 break;
542         case ND_JUMP:
543                 {
544                         Offset offset=read_int<Offset>(i);
545                         result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
546                 }
547                 break;
548         case GROUP_BEGIN:
549                 result<<"GROUP_BEGIN "<<read_int<Index>(i);
550                 break;
551         case GROUP_END:
552                 result<<"GROUP_END "<<read_int<Index>(i);
553                 break;
554         case NEGATE:
555                 result<<"NEGATE";
556                 break;
557         case MATCH_BEGIN:
558                 result<<"MATCH_BEGIN";
559                 break;
560         case MATCH_END:
561                 result<<"MATCH_END";
562                 break;
563         case MATCH_CHAR:
564                 {
565                         char c=*i++;
566                         result<<"MATCH_CHAR ";
567                         if(c>=0x20 && c<=0x7E)
568                                 result<<'\''<<c<<'\'';
569                         else
570                                 result<<(static_cast<int>(c)&0xFF);
571                 }
572                 break;
573         case MATCH_RANGE:
574                 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
575                 result<<'-'<<(static_cast<int>(*i++)&0xFF);
576                 break;
577         case MATCH_MASK:
578                 result<<"MATCH_MASK";
579                 for(unsigned j=0; j<32; ++j)
580                         result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
581                 break;
582         case MATCH_ANY:
583                 result<<"MATCH_ANY";
584                 break;
585         default:
586                 result<<"UNKNOWN "<<instr;
587         }
588
589         return result.str();
590 }
591
592 } // namespace Msp