]> git.tdb.fi Git - libs/core.git/blob - source/regex.cpp
Add Id tags and copyright notices to a few files that were missing them
[libs/core.git] / source / regex.cpp
1 /* $Id$
2
3 This file is part of libmspstrings
4 Copyright © 2007 Mikko Rasa
5 Distributed under the LGPL
6 */
7 #include <stack>
8 #include <msp/core/error.h>
9 #include "formatter.h"
10 #include "regex.h"
11
12 using namespace std;
13
14 #include <iostream>
15
16 namespace {
17
18 /**
19 Writes an integer to a Regex code string, in little-endian order.
20 */
21 template<typename T>
22 void write_int(T n, Msp::Regex::Code &code)
23 {
24         for(unsigned i=0; i<sizeof(T); ++i)
25                 code+=(n>>i*8)&0xFF;
26 }
27
28 /**
29 Reads an integer from a Regex code stream, in little-endian order.
30 */
31 template<typename T>
32 T read_int(Msp::Regex::Code::const_iterator &c)
33 {
34         T result=0;
35         for(unsigned i=0; i<sizeof(T); ++i)
36                 result+=static_cast<unsigned char>(*c++)<<i*8;
37         return result;
38 }
39
40 }
41
42 namespace Msp {
43
44 Regex::Regex(const string &expr)
45 {
46         n_groups=0;
47         string::const_iterator iter=expr.begin();
48         code=compile(expr, iter, n_groups, false);
49         ++n_groups;
50 }
51
52 RegMatch Regex::match(const string &str) const
53 {
54         RegMatch::GroupArray groups(n_groups);
55
56         for(string::const_iterator i=str.begin(); i!=str.end(); ++i)
57                 if(run(str, i, groups))
58                         return RegMatch(str, groups);
59
60         return RegMatch();
61 }
62
63 string Regex::disassemble() const
64 {
65         ostringstream ss;
66
67         for(Code::const_iterator i=code.begin(); i!=code.end();)
68         {
69                 Code::const_iterator j=i;
70                 Offset offset=i-code.begin();
71                 string decompiled=disassemble_instruction(i);
72                 string bytes;
73                 for(; j!=i; ++j)
74                         bytes+=format(" %02X", static_cast<int>(*j)&0xFF);
75                 ss<<Fmt("%3d")<<offset<<':'<<Fmt("%-9s")<<bytes;
76                 if(bytes.size()>9)
77                         ss<<"\n"<<Fmt("%15s");
78                 ss<<"  "<<decompiled<<'\n';
79         }
80
81         return ss.str();
82 }
83
84 Regex::Code Regex::compile(const string &expr, string::const_iterator &iter, unsigned &group, bool branch)
85 {
86         bool has_branches=false;
87         unsigned level=0;
88         bool escape=false;
89         string::const_iterator end;
90         for(end=iter; end!=expr.end(); ++end)
91         {
92                 if(escape)
93                         escape=false;
94                 else if(*end=='\\')
95                         escape=true;
96                 else if(*end=='(')
97                         ++level;
98                 else if(*end==')')
99                 {
100                         if(level==0)
101                         {
102                                 if(group==0)
103                                         throw InvalidParameterValue("Unexpected )");
104                                 else
105                                         break;
106                         }
107                         --level;
108                 }
109                 else if(*end=='|')
110                 {
111                         if(branch)
112                                 break;
113                         else if(level==0)
114                                 has_branches=true;
115                 }
116         }
117
118         if(level>0)
119                 throw InvalidParameterValue("Unmatched (");
120
121         Code result;
122
123         unsigned this_group=group;
124         if(!branch)
125         {
126                 result+=GROUP_BEGIN;
127                 write_int<Index>(this_group, result);
128         }
129
130         const unsigned jump_size=1+sizeof(Offset);
131
132         if(!has_branches)
133         {
134                 for(string::const_iterator i=iter; i!=end;)
135                 {
136                         Code atom=parse_atom(expr, i, group);
137
138                         Count repeat_min=1;
139                         Count repeat_max=1;
140                         parse_repeat(i, repeat_min, repeat_max);
141
142                         for(unsigned j=0; j<repeat_min; ++j)
143                                 result+=atom;
144                         if(repeat_max==numeric_limits<Count>::max())
145                         {
146                                 if(repeat_min==0)
147                                 {
148                                         result+=ND_JUMP;
149                                         write_int<Offset>(atom.size()+jump_size, result);
150                                         result+=atom;
151                                 }
152                                 result+=ND_JUMP;
153                                 write_int<Offset>(-(atom.size()+jump_size), result);
154                         }
155                         else if(repeat_max>repeat_min)
156                         {
157                                 for(unsigned j=repeat_min; j<repeat_max; ++j)
158                                 {
159                                         result+=ND_JUMP;
160                                         write_int<Offset>((repeat_max-j)*(atom.size()+jump_size)-jump_size, result);
161                                         result+=atom;
162                                 }
163                         }
164                 }
165         }
166         else
167         {
168                 list<Code> branches;
169                 for(string::const_iterator i=iter;;)
170                 {
171                         branches.push_back(compile(expr, i, group, true));
172                         if(i==end)
173                                 break;
174                         ++i;
175                 }
176
177                 unsigned n_branches=branches.size();
178
179                 Offset offset=(n_branches-1)*jump_size+branches.front().size();
180                 for(list<Code>::iterator i=++branches.begin(); i!=branches.end(); ++i)
181                 {
182                         result+=ND_JUMP;
183                         write_int<Offset>(offset, result);
184                         offset+=i->size();
185                 }
186
187                 for(list<Code>::iterator i=branches.begin(); i!=branches.end();)
188                 {
189                         result+=*i;
190                         offset-=i->size()+jump_size;
191                         ++i;
192                         if(i!=branches.end())
193                         {
194                                 result+=JUMP;
195                                 write_int<Offset>(offset, result);
196                         }
197                 }
198         }
199
200         if(!branch)
201         {
202                 result+=GROUP_END;
203                 write_int<Index>(this_group, result);
204         }
205
206         iter=end;
207
208         return result;
209 }
210
211 Regex::Code Regex::parse_atom(const string &expr, string::const_iterator &i, unsigned &group)
212 {
213         Code result;
214
215         if(i==expr.end())
216                 return result;
217
218         bool flag=false;
219         if(*i=='\\')
220         {
221                 if(++i==expr.end())
222                         throw InvalidParameterValue("Stray backslash");
223                 flag=true;
224         }
225
226         if(!flag)
227         {
228                 if(*i=='*' || *i=='{' || *i=='}' || *i=='+' || *i=='?' || *i=='|' || *i==')')
229                         throw InvalidParameterValue("Invalid atom");
230                 else if(*i=='[')
231                         return parse_brackets(expr, i);
232                 else if(*i=='.')
233                         result+=MATCH_ANY;
234                 else if(*i=='^')
235                         result+=MATCH_BEGIN;
236                 else if(*i=='$')
237                         result+=MATCH_END;
238                 else if(*i=='(')
239                 {
240                         ++group;
241                         result=compile(expr, ++i, group, false);
242                 }
243                 else
244                         flag=true;
245         }
246
247         if(flag)
248         {
249                 if(static_cast<unsigned char>(*i)<=LAST_INSTRUCTION_)
250                         result+=MATCH_CHAR;
251                 result+=*i;
252         }
253
254         ++i;
255
256         return result;
257 }
258
259 bool Regex::parse_repeat(string::const_iterator &i, Count &rmin, Count &rmax)
260 {
261         if(*i!='*' && *i!='+' && *i!='?' && *i!='{')
262                 return false;
263
264         if(*i=='*' || *i=='+')
265                 rmax=numeric_limits<Count>::max();
266         if(*i=='*' || *i=='?')
267                 rmin=0;
268         if(*i=='{')
269         {
270                 rmin=0;
271                 for(++i; isdigit(*i); ++i)
272                         rmin=rmin*10+(*i-'0');
273
274                 if(*i==',')
275                 {
276                         ++i;
277                         if(*i!='}')
278                         {
279                                 rmax=0;
280                                 for(; isdigit(*i); ++i)
281                                         rmax=rmax*10+(*i-'0');
282                                 if(rmax<rmin)
283                                         throw InvalidParameterValue("Invalid bound");
284                         }
285                         else
286                                 rmax=numeric_limits<Count>::max();
287                 }
288                 else
289                         rmax=rmin;
290                 if(*i!='}')
291                         throw InvalidParameterValue("Invalid bound");
292         }
293
294         ++i;
295
296         return true;
297 }
298
299 Regex::Code Regex::parse_brackets(const string &str, string::const_iterator &iter)
300 {
301         Code result;
302
303         ++iter;
304         bool neg=false;
305         if(*iter=='^')
306         {
307                 neg=true;
308                 ++iter;
309         }
310
311         string::const_iterator end=iter;
312         for(; (end!=str.end() && *end!=']'); ++end);
313         if(end==str.end())
314                 throw InvalidParameterValue("Unmatched '['");
315
316         uint8_t mask[32]={0};
317         unsigned type=0;
318         bool range=false;
319         unsigned char first, last;
320         for(string::const_iterator i=iter; i!=end; ++i)
321         {
322                 unsigned char c=*i;
323                 if(range)
324                 {
325                         last=c;
326                         for(unsigned j=first; j<=c; ++j)
327                                 mask[j>>3]|=1<<(j&7);
328                         range=false;
329                         if(type<2)
330                                 type=2;
331                 }
332                 else if(c=='-' && i!=iter && end-i>1)
333                         range=true;
334                 else
335                 {
336                         first=c;
337                         mask[c>>3]|=1<<(c&7);
338                         if(type==0)
339                                 type=1;
340                         else
341                                 type=3;
342                 }
343         }
344
345         if(neg)
346                 result+=NEGATE;
347
348         if(type==1)
349         {
350                 result+=MATCH_CHAR;
351                 result+=first;
352         }
353         else if(type==2)
354         {
355                 result+=MATCH_RANGE;
356                 result+=first;
357                 result+=last;
358         }
359         else
360         {
361                 result+=MATCH_MASK;
362                 result.append(reinterpret_cast<char *>(mask), 32);
363         }
364
365         iter=end;
366         ++iter;
367
368         return result;
369 }
370
371 bool Regex::run(const string &str, const string::const_iterator &begin, RegMatch::GroupArray &groups) const
372 {
373         bool result=false;
374         list<RunContext> ctx;
375         ctx.push_back(RunContext());
376         ctx.front().citer=code.begin();
377         ctx.front().groups.resize(groups.size());
378
379         for(string::const_iterator i=begin;;)
380         {
381                 int c;
382                 if(i!=str.end())
383                         c=static_cast<unsigned char>(*i);
384                 else
385                         c=-1;
386
387                 for(list<RunContext>::iterator j=ctx.begin(); j!=ctx.end();)
388                 {
389                         bool terminate=false;
390                         bool negate_match=false;
391                         for(; j->citer!=code.end();)
392                         {
393                                 Instruction instr=static_cast<Instruction>(*j->citer);
394                                 if(instr>LAST_INSTRUCTION_)
395                                         instr=MATCH_CHAR;
396                                 else
397                                         ++j->citer;
398
399                                 if(instr==NEGATE)
400                                         negate_match=true;
401                                 else if(instr==JUMP)
402                                 {
403                                         Offset offset=read_int<Offset>(j->citer);
404                                         j->citer+=offset;
405                                 }
406                                 else if(instr==ND_JUMP)
407                                 {
408                                         Offset offset=read_int<Offset>(j->citer);
409                                         ctx.push_back(*j);
410                                         ctx.back().citer+=offset;
411                                 }
412                                 else if(instr==GROUP_BEGIN)
413                                 {
414                                         Index n=read_int<Index>(j->citer);
415                                         if(!j->groups[n].match)
416                                                 j->groups[n].begin=i-str.begin();
417                                 }
418                                 else if(instr==GROUP_END)
419                                 {
420                                         Index n=read_int<Index>(j->citer);
421                                         if(!j->groups[n].match)
422                                         {
423                                                 j->groups[n].match=true;
424                                                 j->groups[n].end=i-str.begin();
425                                                 j->groups[n].length=j->groups[n].end-j->groups[n].begin;
426                                         }
427
428                                         if(n==0)
429                                         {
430                                                 result=true;
431                                                 bool better=false;
432                                                 for(unsigned k=0; (k<groups.size() && !better); ++k)
433                                                 {
434                                                         better=group_compare(j->groups[k], groups[k]);
435                                                         if(group_compare(groups[k], j->groups[k]))
436                                                                 break;
437                                                 }
438                                                 if(better)
439                                                         groups=j->groups;
440                                         }
441                                 }
442                                 else
443                                 {
444                                         bool match_result=false;
445                                         bool input_consumed=false;
446                                         if(instr==MATCH_BEGIN)
447                                                 match_result=(i==str.begin());
448                                         else if(instr==MATCH_END)
449                                                 match_result=(i==str.end());
450                                         else if(instr==MATCH_CHAR)
451                                         {
452                                                 match_result=(c==*j->citer++);
453                                                 input_consumed=true;
454                                         }
455                                         else if(instr==MATCH_RANGE)
456                                         {
457                                                 unsigned char first=*j->citer++;
458                                                 unsigned char last=*j->citer++;
459                                                 match_result=(c>=first && c<=last);
460                                                 input_consumed=true;
461                                         }
462                                         else if(instr==MATCH_MASK)
463                                         {
464                                                 uint8_t mask[32];
465                                                 for(unsigned k=0; k<32; ++k)
466                                                         mask[k]=*j->citer++;
467                                                 match_result=mask[c>>3]&(1<<(c&7));
468                                                 input_consumed=true;
469                                         }
470                                         else if(instr==MATCH_ANY)
471                                         {
472                                                 match_result=true;
473                                                 input_consumed=true;
474                                         }
475                                         else
476                                                 throw Exception("Invalid instruction");
477
478                                         if(match_result==negate_match)
479                                                 terminate=true;
480                                         negate_match=false;
481
482                                         if(input_consumed || terminate)
483                                                 break;
484                                 }
485                         }
486
487                         if(terminate || j->citer==code.end())
488                                 j=ctx.erase(j);
489                         else
490                                 ++j;
491                 }
492
493                 if(i==str.end() || ctx.empty())
494                         break;
495                 ++i;
496         }
497
498         return result;
499 }
500
501 bool Regex::group_compare(const RegMatch::Group &g1, const RegMatch::Group &g2) const
502 {
503         if(!g1.match)
504                 return false;
505         
506         // Any match is better than no match
507         if(!g2.match)
508                 return true;
509
510         // Earlier match is better
511         if(g1.begin<g2.begin)
512                 return true;
513         if(g2.begin>g2.begin)
514                 return false;
515
516         // Longer match at same position is better
517         return g1.end>g2.end;
518 }
519
520 string Regex::disassemble_instruction(Code::const_iterator &i) const
521 {
522         Instruction instr=static_cast<Instruction>(*i);
523         if(instr>=LAST_INSTRUCTION_)
524                 instr=MATCH_CHAR;
525         else
526                 ++i;
527
528         ostringstream result;
529         switch(instr)
530         {
531         case JUMP:
532                 {
533                         Offset offset=read_int<Offset>(i);
534                         result<<"JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
535                 }
536                 break;
537         case ND_JUMP:
538                 {
539                         Offset offset=read_int<Offset>(i);
540                         result<<"ND_JUMP "<<Fmt("%+d")<<offset<<" ("<<Fmt("%d")<<i-code.begin()+offset<<')';
541                 }
542                 break;
543         case GROUP_BEGIN:
544                 result<<"GROUP_BEGIN "<<read_int<Index>(i);
545                 break;
546         case GROUP_END:
547                 result<<"GROUP_END "<<read_int<Index>(i);
548                 break;
549         case NEGATE:
550                 result<<"NEGATE";
551                 break;
552         case MATCH_BEGIN:
553                 result<<"MATCH_BEGIN";
554                 break;
555         case MATCH_END:
556                 result<<"MATCH_END";
557                 break;
558         case MATCH_CHAR:
559                 {
560                         char c=*i++;
561                         result<<"MATCH_CHAR ";
562                         if(c>=0x20 && c<=0x7E)
563                                 result<<'\''<<c<<'\'';
564                         else
565                                 result<<(static_cast<int>(c)&0xFF);
566                 }
567                 break;
568         case MATCH_RANGE:
569                 result<<"MATCH_RANGE "<<(static_cast<int>(*i++)&0xFF);
570                 result<<'-'<<(static_cast<int>(*i++)&0xFF);
571                 break;
572         case MATCH_MASK:
573                 result<<"MATCH_MASK";
574                 for(unsigned j=0; j<32; ++j)
575                         result<<' '<<Fmt("%02X")<<(static_cast<int>(*i++)&0xFF);
576                 break;
577         case MATCH_ANY:
578                 result<<"MATCH_ANY";
579                 break;
580         case FIRST_INSTRUCTION_:
581         case LAST_INSTRUCTION_:
582                 result<<"UNKNOWN "<<instr;
583         }
584
585         return result.str();
586 }
587
588 } // namespace Msp