]> git.tdb.fi Git - libs/core.git/blob - source/strings/regex.cpp
Merge branch 'strings-master'
[libs/core.git] / source / strings / regex.cpp
1 /* $Id$
2
3 This file is part of libmspstrings
4 Copyright © 2007 Mikko Rasa
5 Distributed under the LGPL
6 */
7
8 #include <stack>
9 #include <limits>
10 #include <msp/core/except.h>
11 #include "formatter.h"
12 #include "regex.h"
13
14 using namespace std;
15
16 namespace {
17
18 /** Writes an integer to a Regex code string, in little-endian order. */
19 template<typename T>
20 void write_int(T n, Msp::Regex::Code &code)
21 {
22         for(unsigned i=0; i<sizeof(T); ++i)
23                 code += (n>>i*8)&0xFF;
24 }
25
26 /** Reads an integer from a Regex code stream, in little-endian order. */
27 template<typename T>
28 T read_int(Msp::Regex::Code::const_iterator &c)
29 {
30         T result = 0;
31         for(unsigned i=0; i<sizeof(T); ++i)
32                 result += static_cast<unsigned char>(*c++)<<i*8;
33         return result;
34 }
35
36 }
37
38
39 namespace Msp {
40
41 Regex::Regex(const string &expr)
42 {
43         n_groups = 0;
44         string::const_iterator iter = expr.begin();
45         code = compile(expr, iter, n_groups, false);
46         ++n_groups;
47 }
48
49 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
50 {
51         bool has_branches = false;
52         unsigned level = 0;
53         bool escape = false;
54         unsigned bracket = 0;
55         string::const_iterator end;
56         for(end=iter; end!=expr.end(); ++end)
57         {
58                 if(escape)
59                         escape = false;
60                 else if(bracket)
61                 {
62                         if(bracket==3 && *end==']')
63                                 bracket = 0;
64                         else if(bracket==1 && *end=='^')
65                                 bracket = 2;
66                         else
67                                 bracket = 3;
68                 }
69                 else if(*end=='\\')
70                         escape = true;
71                 else if(*end=='(')
72                         ++level;
73                 else if(*end==')')
74                 {
75                         if(level==0)
76                         {
77                                 if(group==0)
78                                         throw InvalidParameterValue("Unexpected )");
79                                 else
80                                         break;
81                         }
82                         --level;
83                 }
84                 else if(*end=='|' && level==0)
85                 {
86                         if(branch)
87                                 break;
88                         else
89                                 has_branches = true;
90                 }
91                 else if(*end=='[')
92                         bracket = 1;
93         }
94
95         if(level>0)
96                 throw InvalidParameterValue("Unmatched (");
97
98         Code result;
99
100         unsigned this_group = group;
101         if(!branch)
102         {
103                 result += GROUP_BEGIN;
104                 write_int<Index>(this_group, result);
105         }
106
107         const unsigned jump_size = 1+sizeof(Offset);
108
109         if(!has_branches)
110         {
111                 for(string::const_iterator i=iter; i!=end;)
112                 {
113                         Code atom = parse_atom(expr, i, group);
114
115                         Count repeat_min = 1;
116                         Count repeat_max = 1;
117                         parse_repeat(i, repeat_min, repeat_max);
118
119                         for(unsigned j=0; j<repeat_min; ++j)
120                                 result += atom;
121                         if(repeat_max==numeric_limits<Count>::max())
122                         {
123                                 if(repeat_min==0)
124                                 {
125                                         result += ND_JUMP;
126                                         write_int<Offset>(atom.size()+jump_size, result);
127                                         result += atom;
128                                 }
129                                 result += ND_JUMP;
130                                 write_int<Offset>(-(atom.size()+jump_size), result);
131                         }
132                         else if(repeat_max>repeat_min)
133                         {
134                                 for(unsigned j=repeat_min; j<repeat_max; ++j)
135                                 {
136                                         result += ND_JUMP;
137                                         write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
138                                         result += atom;
139                                 }
140                         }
141                 }
142         }
143         else
144         {
145                 list<Code> branches;
146                 for(string::const_iterator i=iter;;)
147                 {
148                         branches.push_back(compile(expr, i, group, true));
149                         if(i==end)
150                                 break;
151                         ++i;
152                 }
153
154                 unsigned n_branches = branches.size();
155
156                 Offset offset = (n_branches-1)*jump_size+branches.front().size();
157                 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
158                 {
159                         result += ND_JUMP;
160                         write_int<Offset>(offset, result);
161                         offset += i->size();
162                 }
163
164                 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
165                 {
166                         result += *i;
167                         offset -= i->size()+jump_size;
168                         ++i;
169                         if(i!=branches.end())
170                         {
171                                 result += JUMP;
172                                 write_int<Offset>(offset, result);
173                         }
174                 }
175         }
176
177         if(!branch)
178         {
179                 result += GROUP_END;
180                 write_int<Index>(this_group, result);
181         }
182
183         iter = end;
184
185         return result;
186 }
187
188 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
189 {
190         Code result;
191
192         if(i==expr.end())
193                 return result;
194
195         bool flag = false;
196         if(*i=='\\')
197         {
198                 if(++i==expr.end())
199                         throw InvalidParameterValue("Stray backslash");
200                 flag = true;
201         }
202
203         if(!flag)
204         {
205                 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
206                         throw InvalidParameterValue("Invalid atom");
207                 else if(*i=='[')
208                         return parse_brackets(expr, i);
209                 else if(*i=='.')
210                         result += MATCH_ANY;
211                 else if(*i=='^')
212                         result += MATCH_BEGIN;
213                 else if(*i=='$')
214                         result += MATCH_END;
215                 else if(*i=='(')
216                 {
217                         ++group;
218                         result = compile(expr, ++i, group, false);
219                 }
220                 else
221                         flag = true;
222         }
223
224         if(flag)
225         {
226                 result += MATCH_CHAR;
227                 result += *i;
228         }
229
230         ++i;
231
232         return result;
233 }
234
235 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
236 {
237         if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
238                 return false;
239
240         if(*i=='*' || *i=='+')
241                 rmax = numeric_limits<Count>::max();
242         if(*i=='*' || *i=='?')
243                 rmin = 0;
244         if(*i=='{')
245         {
246                 rmin = 0;
247                 for(++i; isdigit(*i); ++i)
248                         rmin = rmin*10+(*i-'0');
249
250                 if(*i==',')
251                 {
252                         ++i;
253                         if(*i!='}')
254                         {
255                                 rmax = 0;
256                                 for(; isdigit(*i); ++i)
257                                         rmax = rmax*10+(*i-'0');
258                                 if(rmax<rmin)
259                                         throw InvalidParameterValue("Invalid bound");
260                         }
261                         else
262                                 rmax = numeric_limits<Count>::max();
263                 }
264                 else
265                         rmax = rmin;
266                 if(*i!='}')
267                         throw InvalidParameterValue("Invalid bound");
268         }
269
270         ++i;
271
272         return true;
273 }
274
275 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
276 {
277         Code result;
278
279         ++iter;
280         bool neg = false;
281         if(*iter=='^')
282         {
283                 neg = true;
284                 ++iter;
285         }
286
287         string::const_iterator end = iter;
288         for(; (end!=str.end() && (end==iter || *end!=']')); ++end) ;
289         if(end==str.end())
290                 throw InvalidParameterValue("Unmatched '['");
291
292         unsigned char mask[32] = {0};
293         unsigned type = 0;
294         bool range = false;
295         unsigned char first=0, last = 0;
296         for(string::const_iterator i=iter; i!=end; ++i)
297         {
298                 unsigned char c = *i;
299                 if(range)
300                 {
301                         last = c;
302                         for(unsigned j=first; j<=c; ++j)
303                                 mask[j>>3] |= 1<<(j&7);
304                         range = false;
305                         if(type<2)
306                                 type = 2;
307                 }
308                 else if(c=='-' && i!=iter && end-i>1)
309                         range = true;
310                 else
311                 {
312                         first = c;
313                         mask[c>>3] |= 1<<(c&7);
314                         if(type==0)
315                                 type = 1;
316                         else
317                                 type = 3;
318                 }
319         }
320
321         if(neg)
322                 result += NEGATE;
323
324         if(type==1)
325         {
326                 result += MATCH_CHAR;
327                 result += first;
328         }
329         else if(type==2)
330         {
331                 result += MATCH_RANGE;
332                 result += first;
333                 result += last;
334         }
335         else
336         {
337                 result += MATCH_MASK;
338                 result.append(reinterpret_cast<char *>(mask), 32);
339         }
340
341         iter = end;
342         ++iter;
343
344         return result;
345 }
346
347 RegMatch Regex::match(const string &str) const
348 {
349         RegMatch::GroupArray groups(n_groups);
350
351         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
352                 if(run(str, i, groups))
353                         return RegMatch(str, groups);
354
355         return RegMatch();
356 }
357
358 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
359 {
360         bool result = false;
361         list<RunContext> ctx;
362         ctx.push_back(RunContext());
363         ctx.front().citer = code.begin();
364         ctx.front().groups.resize(groups.size());
365
366         for(string::const_iterator i=begin;;)
367         {
368                 int c;
369                 if(i!=str.end())
370                         c = static_cast<unsigned char>(*i);
371                 else
372                         c = -1;
373
374                 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
375                 {
376                         bool terminate = false;
377                         bool negate_match = false;
378                         for(; j->citer!=code.end();)
379                         {
380                                 Instruction instr = static_cast<Instruction>(*j->citer++);
381
382                                 if(instr==NEGATE)
383                                         negate_match = true;
384                                 else if(instr==JUMP)
385                                 {
386                                         Offset offset = read_int<Offset>(j->citer);
387                                         j->citer += offset;
388                                 }
389                                 else if(instr==ND_JUMP)
390                                 {
391                                         Offset offset = read_int<Offset>(j->citer);
392                                         ctx.push_back(*j);
393                                         ctx.back().citer += offset;
394                                 }
395                                 else if(instr==GROUP_BEGIN)
396                                 {
397                                         Index n = read_int<Index>(j->citer);
398                                         if(!j->groups[n].match)
399                                                 j->groups[n].begin = i-str.begin();
400                                 }
401                                 else if(instr==GROUP_END)
402                                 {
403                                         Index n = read_int<Index>(j->citer);
404                                         if(!j->groups[n].match)
405                                         {
406                                                 j->groups[n].match = true;
407                                                 j->groups[n].end = i-str.begin();
408                                                 j->groups[n].length = j->groups[n].end-j->groups[n].begin;
409                                         }
410
411                                         if(n==0)
412                                         {
413                                                 result = true;
414                                                 bool better = false;
415                                                 for(unsigned k=0; (k<groups.size() && !better); ++k)
416                                                 {
417                                                         better = group_compare(j->groups[k], groups[k]);
418                                                         if(group_compare(groups[k], j->groups[k]))
419                                                                 break;
420                                                 }
421                                                 if(better)
422                                                         groups = j->groups;
423                                         }
424                                 }
425                                 else
426                                 {
427                                         bool match_result = false;
428                                         bool input_consumed = false;
429                                         if(instr==MATCH_BEGIN)
430                                                 match_result = (i==str.begin());
431                                         else if(instr==MATCH_END)
432                                                 match_result = (i==str.end());
433                                         else if(instr==MATCH_CHAR)
434                                         {
435                                                 match_result = (c==*j->citer++);
436                                                 input_consumed = true;
437                                         }
438                                         else if(instr==MATCH_RANGE)
439                                         {
440                                                 unsigned char first = *j->citer++;
441                                                 unsigned char last = *j->citer++;
442                                                 match_result = (c>=first && c<=last);
443                                                 input_consumed = true;
444                                         }
445                                         else if(instr==MATCH_MASK)
446                                         {
447                                                 if(c>=0 && c<=0xFF)
448                                                 {
449                                                         unsigned char m = *(j->citer+(c>>3));
450                                                         match_result = m&(1<<(c&7));
451                                                 }
452                                                 input_consumed = true;
453                                                 j->citer += 32;
454                                         }
455                                         else if(instr==MATCH_ANY)
456                                         {
457                                                 match_result = true;
458                                                 input_consumed = true;
459                                         }
460                                         else
461                                                 throw Exception("Invalid instruction");
462
463                                         if(match_result==negate_match)
464                                                 terminate = true;
465                                         negate_match = false;
466
467                                         if(input_consumed || terminate)
468                                                 break;
469                                 }
470                         }
471
472                         if(terminate || j->citer==code.end())
473                                 j = ctx.erase(j);
474                         else
475                                 ++j;
476                 }
477
478                 if(i==str.end() || ctx.empty())
479                         break;
480                 ++i;
481         }
482
483         return result;
484 }
485
486 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
487 {
488         if(!g1.match)
489                 return false;
490
491         // Any match is better than no match
492         if(!g2.match)
493                 return true;
494
495         // Earlier match is better
496         if(g1.begin<g2.begin)
497                 return true;
498         if(g2.begin>g2.begin)
499                 return false;
500
501         // Longer match at same position is better
502         return g1.end>g2.end;
503 }
504
505 string Regex::disassemble() const
506 {
507         ostringstream ss;
508
509         for(Code::const_iterator i=code.begin(); i!=code.end();)
510         {
511                 Code::const_iterator j = i;
512                 Offset offset = i-code.begin();
513                 string decompiled = disassemble_instruction(i);
514                 string bytes;
515                 for(; j!=i; ++j)
516                         bytes += format(" %02X", static_cast<int>(*j)&0xFF);
517                 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
518                 if(bytes.size()>9)
519                         ss<<"\n"<<Fmt("%15s");
520                 ss<<"  "<<decompiled<<'\n';
521         }
522
523         return ss.str();
524 }
525
526 string Regex::disassemble_instruction(Code::const_iterator &i) const
527 {
528         Instruction instr = static_cast<Instruction>(*i++);
529
530         ostringstream result;
531         switch(instr)
532         {
533         case JUMP:
534                 {
535                         Offset offset = read_int<Offset>(i);
536                         result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
537                 }
538                 break;
539         case ND_JUMP:
540                 {
541                         Offset offset = read_int<Offset>(i);
542                         result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
543                 }
544                 break;
545         case GROUP_BEGIN:
546                 result<<"GROUP_BEGIN "<<read_int<Index>(i);
547                 break;
548         case GROUP_END:
549                 result<<"GROUP_END "<<read_int<Index>(i);
550                 break;
551         case NEGATE:
552                 result<<"NEGATE";
553                 break;
554         case MATCH_BEGIN:
555                 result<<"MATCH_BEGIN";
556                 break;
557         case MATCH_END:
558                 result<<"MATCH_END";
559                 break;
560         case MATCH_CHAR:
561                 {
562                         char c = *i++;
563                         result<<"MATCH_CHAR ";
564                         if(c>=0x20 && c<=0x7E)
565                                 result<<'\''<<c<<'\'';
566                         else
567                                 result<<(static_cast<int>(c)&0xFF);
568                 }
569                 break;
570         case MATCH_RANGE:
571                 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
572                 result<<'-'<<(static_cast<int>(*i++)&0xFF);
573                 break;
574         case MATCH_MASK:
575                 result<<"MATCH_MASK";
576                 for(unsigned j=0; j<32; ++j)
577                         result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
578                 break;
579         case MATCH_ANY:
580                 result<<"MATCH_ANY";
581                 break;
582         default:
583                 result<<"UNKNOWN "<<instr;
584         }
585
586         return result.str();
587 }
588
589 } // namespace Msp