]> git.tdb.fi Git - libs/core.git/blob - source/regex.cpp
a1babf46d019b77a1db433a92ed447a0fbd2d6a2
[libs/core.git] / source / regex.cpp
1 /* $Id$
2
3 This file is part of libmspstrings
4 Copyright © 2007 Mikko Rasa
5 Distributed under the LGPL
6 */
7
8 #include <stack>
9 #include <limits>
10 #include <msp/core/except.h>
11 #include "formatter.h"
12 #include "regex.h"
13
14 using namespace std;
15
16 namespace {
17
18 /**
19 Writes an integer to a Regex code string, in little-endian order.
20 */
21 template<typename T>
22 void write_int(T n, Msp::Regex::Code &code)
23 {
24         for(unsigned i=0; i<sizeof(T); ++i)
25                 code+=(n>>i*8)&0xFF;
26 }
27
28 /**
29 Reads an integer from a Regex code stream, in little-endian order.
30 */
31 template<typename T>
32 T read_int(Msp::Regex::Code::const_iterator &c)
33 {
34         T result=0;
35         for(unsigned i=0; i<sizeof(T); ++i)
36                 result+=static_cast<unsigned char>(*c++)<<i*8;
37         return result;
38 }
39
40 }
41
42 namespace Msp {
43
44 Regex::Regex(const string &expr)
45 {
46         n_groups=0;
47         string::const_iterator iter=expr.begin();
48         code=compile(expr, iter, n_groups, false);
49         ++n_groups;
50 }
51
52 RegMatch Regex::match(const string &str) const
53 {
54         RegMatch::GroupArray groups(n_groups);
55
56         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
57                 if(run(str, i, groups))
58                         return RegMatch(str, groups);
59
60         return RegMatch();
61 }
62
63 string Regex::disassemble() const
64 {
65         ostringstream ss;
66
67         for(Code::const_iterator i=code.begin(); i!=code.end();)
68         {
69                 Code::const_iterator j=i;
70                 Offset offset=i-code.begin();
71                 string decompiled=disassemble_instruction(i);
72                 string bytes;
73                 for(; j!=i; ++j)
74                         bytes+=format(" %02X", static_cast<int>(*j)&0xFF);
75                 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
76                 if(bytes.size()>9)
77                         ss<<"\n"<<Fmt("%15s");
78                 ss<<"  "<<decompiled<<'\n';
79         }
80
81         return ss.str();
82 }
83
84 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
85 {
86         bool has_branches=false;
87         unsigned level=0;
88         bool escape=false;
89         string::const_iterator end;
90         for(end=iter; end!=expr.end(); ++end)
91         {
92                 if(escape)
93                         escape=false;
94                 else if(*end=='\\')
95                         escape=true;
96                 else if(*end=='(')
97                         ++level;
98                 else if(*end==')')
99                 {
100                         if(level==0)
101                         {
102                                 if(group==0)
103                                         throw InvalidParameterValue("Unexpected )");
104                                 else
105                                         break;
106                         }
107                         --level;
108                 }
109                 else if(*end=='|')
110                 {
111                         if(branch)
112                                 break;
113                         else if(level==0)
114                                 has_branches=true;
115                 }
116         }
117
118         if(level>0)
119                 throw InvalidParameterValue("Unmatched (");
120
121         Code result;
122
123         unsigned this_group=group;
124         if(!branch)
125         {
126                 result+=GROUP_BEGIN;
127                 write_int<Index>(this_group, result);
128         }
129
130         const unsigned jump_size=1+sizeof(Offset);
131
132         if(!has_branches)
133         {
134                 for(string::const_iterator i=iter; i!=end;)
135                 {
136                         Code atom=parse_atom(expr, i, group);
137
138                         Count repeat_min=1;
139                         Count repeat_max=1;
140                         parse_repeat(i, repeat_min, repeat_max);
141
142                         for(unsigned j=0; j<repeat_min; ++j)
143                                 result+=atom;
144                         if(repeat_max==numeric_limits<Count>::max())
145                         {
146                                 if(repeat_min==0)
147                                 {
148                                         result+=ND_JUMP;
149                                         write_int<Offset>(atom.size()+jump_size, result);
150                                         result+=atom;
151                                 }
152                                 result+=ND_JUMP;
153                                 write_int<Offset>(-(atom.size()+jump_size), result);
154                         }
155                         else if(repeat_max>repeat_min)
156                         {
157                                 for(unsigned j=repeat_min; j<repeat_max; ++j)
158                                 {
159                                         result+=ND_JUMP;
160                                         write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
161                                         result+=atom;
162                                 }
163                         }
164                 }
165         }
166         else
167         {
168                 list<Code> branches;
169                 for(string::const_iterator i=iter;;)
170                 {
171                         branches.push_back(compile(expr, i, group, true));
172                         if(i==end)
173                                 break;
174                         ++i;
175                 }
176
177                 unsigned n_branches=branches.size();
178
179                 Offset offset=(n_branches-1)*jump_size+branches.front().size();
180                 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
181                 {
182                         result+=ND_JUMP;
183                         write_int<Offset>(offset, result);
184                         offset+=i->size();
185                 }
186
187                 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
188                 {
189                         result+=*i;
190                         offset-=i->size()+jump_size;
191                         ++i;
192                         if(i!=branches.end())
193                         {
194                                 result+=JUMP;
195                                 write_int<Offset>(offset, result);
196                         }
197                 }
198         }
199
200         if(!branch)
201         {
202                 result+=GROUP_END;
203                 write_int<Index>(this_group, result);
204         }
205
206         iter=end;
207
208         return result;
209 }
210
211 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
212 {
213         Code result;
214
215         if(i==expr.end())
216                 return result;
217
218         bool flag=false;
219         if(*i=='\\')
220         {
221                 if(++i==expr.end())
222                         throw InvalidParameterValue("Stray backslash");
223                 flag=true;
224         }
225
226         if(!flag)
227         {
228                 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
229                         throw InvalidParameterValue("Invalid atom");
230                 else if(*i=='[')
231                         return parse_brackets(expr, i);
232                 else if(*i=='.')
233                         result+=MATCH_ANY;
234                 else if(*i=='^')
235                         result+=MATCH_BEGIN;
236                 else if(*i=='$')
237                         result+=MATCH_END;
238                 else if(*i=='(')
239                 {
240                         ++group;
241                         result=compile(expr, ++i, group, false);
242                 }
243                 else
244                         flag=true;
245         }
246
247         if(flag)
248         {
249                 result+=MATCH_CHAR;
250                 result+=*i;
251         }
252
253         ++i;
254
255         return result;
256 }
257
258 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
259 {
260         if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
261                 return false;
262
263         if(*i=='*' || *i=='+')
264                 rmax=numeric_limits<Count>::max();
265         if(*i=='*' || *i=='?')
266                 rmin=0;
267         if(*i=='{')
268         {
269                 rmin=0;
270                 for(++i; isdigit(*i); ++i)
271                         rmin=rmin*10+(*i-'0');
272
273                 if(*i==',')
274                 {
275                         ++i;
276                         if(*i!='}')
277                         {
278                                 rmax=0;
279                                 for(; isdigit(*i); ++i)
280                                         rmax=rmax*10+(*i-'0');
281                                 if(rmax<rmin)
282                                         throw InvalidParameterValue("Invalid bound");
283                         }
284                         else
285                                 rmax=numeric_limits<Count>::max();
286                 }
287                 else
288                         rmax=rmin;
289                 if(*i!='}')
290                         throw InvalidParameterValue("Invalid bound");
291         }
292
293         ++i;
294
295         return true;
296 }
297
298 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
299 {
300         Code result;
301
302         ++iter;
303         bool neg=false;
304         if(*iter=='^')
305         {
306                 neg=true;
307                 ++iter;
308         }
309
310         string::const_iterator end=iter;
311         for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
312         if(end==str.end())
313                 throw InvalidParameterValue("Unmatched '['");
314
315         uint8_t mask[32]={0};
316         unsigned type=0;
317         bool range=false;
318         unsigned char first=0, last=0;
319         for(string::const_iterator i=iter; i!=end; ++i)
320         {
321                 unsigned char c=*i;
322                 if(range)
323                 {
324                         last=c;
325                         for(unsigned j=first; j<=c; ++j)
326                                 mask[j>>3]|=1<<(j&7);
327                         range=false;
328                         if(type<2)
329                                 type=2;
330                 }
331                 else if(c=='-' && i!=iter && end-i>1)
332                         range=true;
333                 else
334                 {
335                         first=c;
336                         mask[c>>3]|=1<<(c&7);
337                         if(type==0)
338                                 type=1;
339                         else
340                                 type=3;
341                 }
342         }
343
344         if(neg)
345                 result+=NEGATE;
346
347         if(type==1)
348         {
349                 result+=MATCH_CHAR;
350                 result+=first;
351         }
352         else if(type==2)
353         {
354                 result+=MATCH_RANGE;
355                 result+=first;
356                 result+=last;
357         }
358         else
359         {
360                 result+=MATCH_MASK;
361                 result.append(reinterpret_cast<char *>(mask), 32);
362         }
363
364         iter=end;
365         ++iter;
366
367         return result;
368 }
369
370 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
371 {
372         bool result=false;
373         list<RunContext> ctx;
374         ctx.push_back(RunContext());
375         ctx.front().citer=code.begin();
376         ctx.front().groups.resize(groups.size());
377
378         for(string::const_iterator i=begin;;)
379         {
380                 int c;
381                 if(i!=str.end())
382                         c=static_cast<unsigned char>(*i);
383                 else
384                         c=-1;
385
386                 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
387                 {
388                         bool terminate=false;
389                         bool negate_match=false;
390                         for(; j->citer!=code.end();)
391                         {
392                                 Instruction instr=static_cast<Instruction>(*j->citer++);
393
394                                 if(instr==NEGATE)
395                                         negate_match=true;
396                                 else if(instr==JUMP)
397                                 {
398                                         Offset offset=read_int<Offset>(j->citer);
399                                         j->citer+=offset;
400                                 }
401                                 else if(instr==ND_JUMP)
402                                 {
403                                         Offset offset=read_int<Offset>(j->citer);
404                                         ctx.push_back(*j);
405                                         ctx.back().citer+=offset;
406                                 }
407                                 else if(instr==GROUP_BEGIN)
408                                 {
409                                         Index n=read_int<Index>(j->citer);
410                                         if(!j->groups[n].match)
411                                                 j->groups[n].begin=i-str.begin();
412                                 }
413                                 else if(instr==GROUP_END)
414                                 {
415                                         Index n=read_int<Index>(j->citer);
416                                         if(!j->groups[n].match)
417                                         {
418                                                 j->groups[n].match=true;
419                                                 j->groups[n].end=i-str.begin();
420                                                 j->groups[n].length=j->groups[n].end-j->groups[n].begin;
421                                         }
422
423                                         if(n==0)
424                                         {
425                                                 result=true;
426                                                 bool better=false;
427                                                 for(unsigned k=0; (k<groups.size() && !better); ++k)
428                                                 {
429                                                         better=group_compare(j->groups[k], groups[k]);
430                                                         if(group_compare(groups[k], j->groups[k]))
431                                                                 break;
432                                                 }
433                                                 if(better)
434                                                         groups=j->groups;
435                                         }
436                                 }
437                                 else
438                                 {
439                                         bool match_result=false;
440                                         bool input_consumed=false;
441                                         if(instr==MATCH_BEGIN)
442                                                 match_result=(i==str.begin());
443                                         else if(instr==MATCH_END)
444                                                 match_result=(i==str.end());
445                                         else if(instr==MATCH_CHAR)
446                                         {
447                                                 match_result=(c==*j->citer++);
448                                                 input_consumed=true;
449                                         }
450                                         else if(instr==MATCH_RANGE)
451                                         {
452                                                 unsigned char first=*j->citer++;
453                                                 unsigned char last=*j->citer++;
454                                                 match_result=(c>=first && c<=last);
455                                                 input_consumed=true;
456                                         }
457                                         else if(instr==MATCH_MASK)
458                                         {
459                                                 uint8_t mask[32];
460                                                 for(unsigned k=0; k<32; ++k)
461                                                         mask[k]=*j->citer++;
462                                                 match_result=mask[c>>3]&(1<<(c&7));
463                                                 input_consumed=true;
464                                         }
465                                         else if(instr==MATCH_ANY)
466                                         {
467                                                 match_result=true;
468                                                 input_consumed=true;
469                                         }
470                                         else
471                                                 throw Exception("Invalid instruction");
472
473                                         if(match_result==negate_match)
474                                                 terminate=true;
475                                         negate_match=false;
476
477                                         if(input_consumed || terminate)
478                                                 break;
479                                 }
480                         }
481
482                         if(terminate || j->citer==code.end())
483                                 j=ctx.erase(j);
484                         else
485                                 ++j;
486                 }
487
488                 if(i==str.end() || ctx.empty())
489                         break;
490                 ++i;
491         }
492
493         return result;
494 }
495
496 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
497 {
498         if(!g1.match)
499                 return false;
500
501         // Any match is better than no match
502         if(!g2.match)
503                 return true;
504
505         // Earlier match is better
506         if(g1.begin<g2.begin)
507                 return true;
508         if(g2.begin>g2.begin)
509                 return false;
510
511         // Longer match at same position is better
512         return g1.end>g2.end;
513 }
514
515 string Regex::disassemble_instruction(Code::const_iterator &i) const
516 {
517         Instruction instr=static_cast<Instruction>(*i++);
518
519         ostringstream result;
520         switch(instr)
521         {
522         case JUMP:
523                 {
524                         Offset offset=read_int<Offset>(i);
525                         result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
526                 }
527                 break;
528         case ND_JUMP:
529                 {
530                         Offset offset=read_int<Offset>(i);
531                         result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
532                 }
533                 break;
534         case GROUP_BEGIN:
535                 result<<"GROUP_BEGIN "<<read_int<Index>(i);
536                 break;
537         case GROUP_END:
538                 result<<"GROUP_END "<<read_int<Index>(i);
539                 break;
540         case NEGATE:
541                 result<<"NEGATE";
542                 break;
543         case MATCH_BEGIN:
544                 result<<"MATCH_BEGIN";
545                 break;
546         case MATCH_END:
547                 result<<"MATCH_END";
548                 break;
549         case MATCH_CHAR:
550                 {
551                         char c=*i++;
552                         result<<"MATCH_CHAR ";
553                         if(c>=0x20 && c<=0x7E)
554                                 result<<'\''<<c<<'\'';
555                         else
556                                 result<<(static_cast<int>(c)&0xFF);
557                 }
558                 break;
559         case MATCH_RANGE:
560                 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
561                 result<<'-'<<(static_cast<int>(*i++)&0xFF);
562                 break;
563         case MATCH_MASK:
564                 result<<"MATCH_MASK";
565                 for(unsigned j=0; j<32; ++j)
566                         result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
567                 break;
568         case MATCH_ANY:
569                 result<<"MATCH_ANY";
570                 break;
571         default:
572                 result<<"UNKNOWN "<<instr;
573         }
574
575         return result.str();
576 }
577
578 } // namespace Msp