]> git.tdb.fi Git - libs/core.git/blob - source/regex.cpp
320705e8e33dc1cecbca03a26adab78e9d498805
[libs/core.git] / source / regex.cpp
1 /* $Id$
2
3 This file is part of libmspstrings
4 Copyright © 2007 Mikko Rasa
5 Distributed under the LGPL
6 */
7 #include <stack>
8 #include <msp/core/error.h>
9 #include "formatter.h"
10 #include "regex.h"
11
12 using namespace std;
13
14 namespace {
15
16 /**
17 Writes an integer to a Regex code string, in little-endian order.
18 */
19 template<typename T>
20 void write_int(T n, Msp::Regex::Code &code)
21 {
22         for(unsigned i=0; i<sizeof(T); ++i)
23                 code+=(n>>i*8)&0xFF;
24 }
25
26 /**
27 Reads an integer from a Regex code stream, in little-endian order.
28 */
29 template<typename T>
30 T read_int(Msp::Regex::Code::const_iterator &c)
31 {
32         T result=0;
33         for(unsigned i=0; i<sizeof(T); ++i)
34                 result+=static_cast<unsigned char>(*c++)<<i*8;
35         return result;
36 }
37
38 }
39
40 namespace Msp {
41
42 Regex::Regex(const string &expr)
43 {
44         n_groups=0;
45         string::const_iterator iter=expr.begin();
46         code=compile(expr, iter, n_groups, false);
47         ++n_groups;
48 }
49
50 RegMatch Regex::match(const string &str) const
51 {
52         RegMatch::GroupArray groups(n_groups);
53
54         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
55                 if(run(str, i, groups))
56                         return RegMatch(str, groups);
57
58         return RegMatch();
59 }
60
61 string Regex::disassemble() const
62 {
63         ostringstream ss;
64
65         for(Code::const_iterator i=code.begin(); i!=code.end();)
66         {
67                 Code::const_iterator j=i;
68                 Offset offset=i-code.begin();
69                 string decompiled=disassemble_instruction(i);
70                 string bytes;
71                 for(; j!=i; ++j)
72                         bytes+=format(" %02X", static_cast<int>(*j)&0xFF);
73                 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
74                 if(bytes.size()>9)
75                         ss<<"\n"<<Fmt("%15s");
76                 ss<<"  "<<decompiled<<'\n';
77         }
78
79         return ss.str();
80 }
81
82 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
83 {
84         bool has_branches=false;
85         unsigned level=0;
86         bool escape=false;
87         string::const_iterator end;
88         for(end=iter; end!=expr.end(); ++end)
89         {
90                 if(escape)
91                         escape=false;
92                 else if(*end=='\\')
93                         escape=true;
94                 else if(*end=='(')
95                         ++level;
96                 else if(*end==')')
97                 {
98                         if(level==0)
99                         {
100                                 if(group==0)
101                                         throw InvalidParameterValue("Unexpected )");
102                                 else
103                                         break;
104                         }
105                         --level;
106                 }
107                 else if(*end=='|')
108                 {
109                         if(branch)
110                                 break;
111                         else if(level==0)
112                                 has_branches=true;
113                 }
114         }
115
116         if(level>0)
117                 throw InvalidParameterValue("Unmatched (");
118
119         Code result;
120
121         unsigned this_group=group;
122         if(!branch)
123         {
124                 result+=GROUP_BEGIN;
125                 write_int<Index>(this_group, result);
126         }
127
128         const unsigned jump_size=1+sizeof(Offset);
129
130         if(!has_branches)
131         {
132                 for(string::const_iterator i=iter; i!=end;)
133                 {
134                         Code atom=parse_atom(expr, i, group);
135
136                         Count repeat_min=1;
137                         Count repeat_max=1;
138                         parse_repeat(i, repeat_min, repeat_max);
139
140                         for(unsigned j=0; j<repeat_min; ++j)
141                                 result+=atom;
142                         if(repeat_max==numeric_limits<Count>::max())
143                         {
144                                 if(repeat_min==0)
145                                 {
146                                         result+=ND_JUMP;
147                                         write_int<Offset>(atom.size()+jump_size, result);
148                                         result+=atom;
149                                 }
150                                 result+=ND_JUMP;
151                                 write_int<Offset>(-(atom.size()+jump_size), result);
152                         }
153                         else if(repeat_max>repeat_min)
154                         {
155                                 for(unsigned j=repeat_min; j<repeat_max; ++j)
156                                 {
157                                         result+=ND_JUMP;
158                                         write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
159                                         result+=atom;
160                                 }
161                         }
162                 }
163         }
164         else
165         {
166                 list<Code> branches;
167                 for(string::const_iterator i=iter;;)
168                 {
169                         branches.push_back(compile(expr, i, group, true));
170                         if(i==end)
171                                 break;
172                         ++i;
173                 }
174
175                 unsigned n_branches=branches.size();
176
177                 Offset offset=(n_branches-1)*jump_size+branches.front().size();
178                 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
179                 {
180                         result+=ND_JUMP;
181                         write_int<Offset>(offset, result);
182                         offset+=i->size();
183                 }
184
185                 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
186                 {
187                         result+=*i;
188                         offset-=i->size()+jump_size;
189                         ++i;
190                         if(i!=branches.end())
191                         {
192                                 result+=JUMP;
193                                 write_int<Offset>(offset, result);
194                         }
195                 }
196         }
197
198         if(!branch)
199         {
200                 result+=GROUP_END;
201                 write_int<Index>(this_group, result);
202         }
203
204         iter=end;
205
206         return result;
207 }
208
209 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
210 {
211         Code result;
212
213         if(i==expr.end())
214                 return result;
215
216         bool flag=false;
217         if(*i=='\\')
218         {
219                 if(++i==expr.end())
220                         throw InvalidParameterValue("Stray backslash");
221                 flag=true;
222         }
223
224         if(!flag)
225         {
226                 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
227                         throw InvalidParameterValue("Invalid atom");
228                 else if(*i=='[')
229                         return parse_brackets(expr, i);
230                 else if(*i=='.')
231                         result+=MATCH_ANY;
232                 else if(*i=='^')
233                         result+=MATCH_BEGIN;
234                 else if(*i=='$')
235                         result+=MATCH_END;
236                 else if(*i=='(')
237                 {
238                         ++group;
239                         result=compile(expr, ++i, group, false);
240                 }
241                 else
242                         flag=true;
243         }
244
245         if(flag)
246         {
247                 if(static_cast<unsigned char>(*i)<=LAST_INSTRUCTION_)
248                         result+=MATCH_CHAR;
249                 result+=*i;
250         }
251
252         ++i;
253
254         return result;
255 }
256
257 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
258 {
259         if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
260                 return false;
261
262         if(*i=='*' || *i=='+')
263                 rmax=numeric_limits<Count>::max();
264         if(*i=='*' || *i=='?')
265                 rmin=0;
266         if(*i=='{')
267         {
268                 rmin=0;
269                 for(++i; isdigit(*i); ++i)
270                         rmin=rmin*10+(*i-'0');
271
272                 if(*i==',')
273                 {
274                         ++i;
275                         if(*i!='}')
276                         {
277                                 rmax=0;
278                                 for(; isdigit(*i); ++i)
279                                         rmax=rmax*10+(*i-'0');
280                                 if(rmax<rmin)
281                                         throw InvalidParameterValue("Invalid bound");
282                         }
283                         else
284                                 rmax=numeric_limits<Count>::max();
285                 }
286                 else
287                         rmax=rmin;
288                 if(*i!='}')
289                         throw InvalidParameterValue("Invalid bound");
290         }
291
292         ++i;
293
294         return true;
295 }
296
297 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
298 {
299         Code result;
300
301         ++iter;
302         bool neg=false;
303         if(*iter=='^')
304         {
305                 neg=true;
306                 ++iter;
307         }
308
309         string::const_iterator end=iter;
310         for(; (end!=str.end() && (end==iter || *end!=']')); ++end);
311         if(end==str.end())
312                 throw InvalidParameterValue("Unmatched '['");
313
314         uint8_t mask[32]={0};
315         unsigned type=0;
316         bool range=false;
317         unsigned char first=0, last=0;
318         for(string::const_iterator i=iter; i!=end; ++i)
319         {
320                 unsigned char c=*i;
321                 if(range)
322                 {
323                         last=c;
324                         for(unsigned j=first; j<=c; ++j)
325                                 mask[j>>3]|=1<<(j&7);
326                         range=false;
327                         if(type<2)
328                                 type=2;
329                 }
330                 else if(c=='-' && i!=iter && end-i>1)
331                         range=true;
332                 else
333                 {
334                         first=c;
335                         mask[c>>3]|=1<<(c&7);
336                         if(type==0)
337                                 type=1;
338                         else
339                                 type=3;
340                 }
341         }
342
343         if(neg)
344                 result+=NEGATE;
345
346         if(type==1)
347         {
348                 result+=MATCH_CHAR;
349                 result+=first;
350         }
351         else if(type==2)
352         {
353                 result+=MATCH_RANGE;
354                 result+=first;
355                 result+=last;
356         }
357         else
358         {
359                 result+=MATCH_MASK;
360                 result.append(reinterpret_cast<char *>(mask), 32);
361         }
362
363         iter=end;
364         ++iter;
365
366         return result;
367 }
368
369 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
370 {
371         bool result=false;
372         list<RunContext> ctx;
373         ctx.push_back(RunContext());
374         ctx.front().citer=code.begin();
375         ctx.front().groups.resize(groups.size());
376
377         for(string::const_iterator i=begin;;)
378         {
379                 int c;
380                 if(i!=str.end())
381                         c=static_cast<unsigned char>(*i);
382                 else
383                         c=-1;
384
385                 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
386                 {
387                         bool terminate=false;
388                         bool negate_match=false;
389                         for(; j->citer!=code.end();)
390                         {
391                                 Instruction instr=static_cast<Instruction>(*j->citer);
392                                 if(instr>LAST_INSTRUCTION_)
393                                         instr=MATCH_CHAR;
394                                 else
395                                         ++j->citer;
396
397                                 if(instr==NEGATE)
398                                         negate_match=true;
399                                 else if(instr==JUMP)
400                                 {
401                                         Offset offset=read_int<Offset>(j->citer);
402                                         j->citer+=offset;
403                                 }
404                                 else if(instr==ND_JUMP)
405                                 {
406                                         Offset offset=read_int<Offset>(j->citer);
407                                         ctx.push_back(*j);
408                                         ctx.back().citer+=offset;
409                                 }
410                                 else if(instr==GROUP_BEGIN)
411                                 {
412                                         Index n=read_int<Index>(j->citer);
413                                         if(!j->groups[n].match)
414                                                 j->groups[n].begin=i-str.begin();
415                                 }
416                                 else if(instr==GROUP_END)
417                                 {
418                                         Index n=read_int<Index>(j->citer);
419                                         if(!j->groups[n].match)
420                                         {
421                                                 j->groups[n].match=true;
422                                                 j->groups[n].end=i-str.begin();
423                                                 j->groups[n].length=j->groups[n].end-j->groups[n].begin;
424                                         }
425
426                                         if(n==0)
427                                         {
428                                                 result=true;
429                                                 bool better=false;
430                                                 for(unsigned k=0; (k<groups.size() && !better); ++k)
431                                                 {
432                                                         better=group_compare(j->groups[k], groups[k]);
433                                                         if(group_compare(groups[k], j->groups[k]))
434                                                                 break;
435                                                 }
436                                                 if(better)
437                                                         groups=j->groups;
438                                         }
439                                 }
440                                 else
441                                 {
442                                         bool match_result=false;
443                                         bool input_consumed=false;
444                                         if(instr==MATCH_BEGIN)
445                                                 match_result=(i==str.begin());
446                                         else if(instr==MATCH_END)
447                                                 match_result=(i==str.end());
448                                         else if(instr==MATCH_CHAR)
449                                         {
450                                                 match_result=(c==*j->citer++);
451                                                 input_consumed=true;
452                                         }
453                                         else if(instr==MATCH_RANGE)
454                                         {
455                                                 unsigned char first=*j->citer++;
456                                                 unsigned char last=*j->citer++;
457                                                 match_result=(c>=first && c<=last);
458                                                 input_consumed=true;
459                                         }
460                                         else if(instr==MATCH_MASK)
461                                         {
462                                                 uint8_t mask[32];
463                                                 for(unsigned k=0; k<32; ++k)
464                                                         mask[k]=*j->citer++;
465                                                 match_result=mask[c>>3]&(1<<(c&7));
466                                                 input_consumed=true;
467                                         }
468                                         else if(instr==MATCH_ANY)
469                                         {
470                                                 match_result=true;
471                                                 input_consumed=true;
472                                         }
473                                         else
474                                                 throw Exception("Invalid instruction");
475
476                                         if(match_result==negate_match)
477                                                 terminate=true;
478                                         negate_match=false;
479
480                                         if(input_consumed || terminate)
481                                                 break;
482                                 }
483                         }
484
485                         if(terminate || j->citer==code.end())
486                                 j=ctx.erase(j);
487                         else
488                                 ++j;
489                 }
490
491                 if(i==str.end() || ctx.empty())
492                         break;
493                 ++i;
494         }
495
496         return result;
497 }
498
499 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
500 {
501         if(!g1.match)
502                 return false;
503
504         // Any match is better than no match
505         if(!g2.match)
506                 return true;
507
508         // Earlier match is better
509         if(g1.begin<g2.begin)
510                 return true;
511         if(g2.begin>g2.begin)
512                 return false;
513
514         // Longer match at same position is better
515         return g1.end>g2.end;
516 }
517
518 string Regex::disassemble_instruction(Code::const_iterator &i) const
519 {
520         Instruction instr=static_cast<Instruction>(*i);
521         if(instr>=LAST_INSTRUCTION_)
522                 instr=MATCH_CHAR;
523         else
524                 ++i;
525
526         ostringstream result;
527         switch(instr)
528         {
529         case JUMP:
530                 {
531                         Offset offset=read_int<Offset>(i);
532                         result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
533                 }
534                 break;
535         case ND_JUMP:
536                 {
537                         Offset offset=read_int<Offset>(i);
538                         result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
539                 }
540                 break;
541         case GROUP_BEGIN:
542                 result<<"GROUP_BEGIN "<<read_int<Index>(i);
543                 break;
544         case GROUP_END:
545                 result<<"GROUP_END "<<read_int<Index>(i);
546                 break;
547         case NEGATE:
548                 result<<"NEGATE";
549                 break;
550         case MATCH_BEGIN:
551                 result<<"MATCH_BEGIN";
552                 break;
553         case MATCH_END:
554                 result<<"MATCH_END";
555                 break;
556         case MATCH_CHAR:
557                 {
558                         char c=*i++;
559                         result<<"MATCH_CHAR ";
560                         if(c>=0x20 && c<=0x7E)
561                                 result<<'\''<<c<<'\'';
562                         else
563                                 result<<(static_cast<int>(c)&0xFF);
564                 }
565                 break;
566         case MATCH_RANGE:
567                 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
568                 result<<'-'<<(static_cast<int>(*i++)&0xFF);
569                 break;
570         case MATCH_MASK:
571                 result<<"MATCH_MASK";
572                 for(unsigned j=0; j<32; ++j)
573                         result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
574                 break;
575         case MATCH_ANY:
576                 result<<"MATCH_ANY";
577                 break;
578         case FIRST_INSTRUCTION_:
579         case LAST_INSTRUCTION_:
580                 result<<"UNKNOWN "<<instr;
581         }
582
583         return result.str();
584 }
585
586 } // namespace Msp