]> git.tdb.fi Git - libs/datafile.git/blob - source/textparser.cpp
Add binary data format
[libs/datafile.git] / source / textparser.cpp
1 /* $Id$
2
3 This file is part of libmspdatafile
4 Copyright © 2007  Mikko Rasa, Mikkosoft Productions
5 Distributed under the LGPL
6 */
7
8 #include <msp/strings/formatter.h>
9 #include "input.h"
10 #include "textparser.h"
11 #include "token.h"
12
13 using namespace std;
14
15 namespace Msp {
16 namespace DataFile {
17
18 TextParser::TextParser(Input &i, const string &s):
19         ParserMode(i, s)
20 { }
21
22 Statement TextParser::parse()
23 {
24         return parse_statement(0);
25 }
26
27 Statement TextParser::parse_statement(const Token *t)
28 {
29         Statement result;
30         bool      sub=false;
31         bool      finish=false;
32
33         while(in)
34         {
35                 Token token;
36                 if(t)
37                 {
38                         token=*t;
39                         t=0;
40                 }
41                 else
42                         token=parse_token();
43
44                 if(result.keyword.empty())
45                 {
46                         if(token.str.empty())
47                                 break;
48                         else if(token.type!=Token::IDENTIFIER)
49                                 throw ParseError(format("%s: Syntax error at token '%s' (expected an identifier)", get_location(), token.str), src, in.get_line_number());
50                         result.keyword=token.str;
51                         result.valid=true;
52                         result.source=src;
53                         result.line=in.get_line_number();
54                 }
55                 else if(sub)
56                 {
57                         if(token.str=="}")
58                         {
59                                 sub=false;
60                                 finish=true;
61                         }
62                         else
63                         {
64                                 Statement ss=parse_statement(&token);
65                                 result.sub.push_back(ss);
66                         }
67                 }
68                 else if(finish)
69                 {
70                         if(token.str!=";")
71                                 throw ParseError(format("%s: Syntax error at token '%s' (Expected a ';')", get_location(), token.str), src, in.get_line_number());
72                         break;
73                 }
74                 else if(token.str=="{")
75                         sub=true;
76                 else if(token.str==";")
77                         break;
78                 else if(token.type==Token::INTEGER)
79                         result.args.push_back(Value(INTEGER, token.str));
80                 else if(token.type==Token::FLOAT)
81                         result.args.push_back(Value(FLOAT, token.str));
82                 else if(token.type==Token::STRING)
83                         result.args.push_back(Value(STRING, token.str));
84                 else if(token.type==Token::IDENTIFIER)
85                 {
86                         if(token.str=="true")
87                                 result.args.push_back(Value(BOOLEAN, "1"));
88                         else if(token.str=="false")
89                                 result.args.push_back(Value(BOOLEAN, "0"));
90                         else
91                                 result.args.push_back(Value(ENUM, token.str));
92                         //result.args.push_back(resolve_identifiertoken.str);
93                 }
94                 else if(token.str=="")
95                         throw ParseError(src+": Unexcepted end of input", src, in.get_line_number());
96                 else
97                         throw ParseError(get_location()+": Syntax error", src, in.get_line_number());
98         }
99
100         return result;
101 }
102
103 Token TextParser::parse_token()
104 {
105         int c=0;
106         unsigned comment=0;
107
108         // Skip over comments and whitespace
109         while(in)
110         {
111                 c=in.get();
112                 int next=in.peek();
113
114                 if(c=='/' && next=='/')
115                         comment=1;
116                 else if(c=='/' && next=='*')
117                         comment=2;
118                 else if(c=='\n' && comment==1)
119                         comment=0;
120                 else if(c=='*' && next=='/' && comment==2)
121                         comment=3;
122                 else if(comment==3)   // Skip the second character of block comment end
123                         comment=0;
124                 else if(!isspace(c) && !comment)
125                         break;
126         }
127
128         if(comment)  // Didn't hit any non-whitespace
129                 throw ParseError(src+": Unfinished comment at end of input", src, in.get_line_number());
130
131         enum ParseState
132         {
133                 INIT,
134                 SIGN,
135                 FLOATEXPINIT,
136                 FLOATEXPSIGN,
137                 STRING,
138                 ACCEPT,
139                 ZERO,
140                 DECIMAL,
141                 HEXADECIMAL,
142                 OCTAL,
143                 FLOAT,
144                 FLOATEXP,
145                 IDENTIFIER
146         };
147
148         static Token::Type token_type[]=
149         {
150                 Token::SPECIAL,
151                 Token::SPECIAL,
152                 Token::SPECIAL,
153                 Token::SPECIAL,
154                 Token::STRING,
155                 Token::SPECIAL,
156                 Token::INTEGER,
157                 Token::INTEGER,
158                 Token::INTEGER,
159                 Token::INTEGER,
160                 Token::FLOAT,
161                 Token::FLOAT,
162                 Token::IDENTIFIER
163         };
164
165         ParseState state=INIT;
166         string     buf;
167         bool       escape=false;
168
169         while(in)
170         {
171                 if(state!=INIT)
172                         c=in.get();
173                 int next=in.peek();
174
175                 buf+=c;
176
177                 switch(state)
178                 {
179                 case INIT:
180                         if(c=='0')
181                                 state=ZERO;
182                         else if(c=='-' || c=='+')
183                                 state=SIGN;
184                         else if(c=='.')
185                                 state=FLOAT;
186                         else if(c=='"')
187                                 state=STRING;
188                         else if(c=='{' || c=='}' || c==';')
189                                 return Token(Token::SPECIAL, string(1, c));
190                         else if(isdigit(c))
191                                 state=DECIMAL;
192                         else if(isalpha(c) || c=='_')
193                                 state=IDENTIFIER;
194                         else
195                                 parse_error(c, state);
196                         break;
197
198                 case SIGN:
199                         if(c=='0')
200                                 state=ZERO;
201                         else if(isdigit(c))
202                                 state=DECIMAL;
203                         else if(c=='.')
204                                 state=FLOAT;
205                         else
206                                 parse_error(c, state);
207                         break;
208
209                 case ZERO:
210                         if(c=='x')
211                                 state=HEXADECIMAL;
212                         else if(isdigit(c))
213                                 state=OCTAL;
214                         else if(c=='.')
215                                 state=FLOAT;
216                         else
217                                 parse_error(c, state);
218                         break;
219
220                 case DECIMAL:
221                         if(c=='.')
222                                 state=FLOAT;
223                         else if(!isdigit(c))
224                                 parse_error(c, state);
225                         break;
226
227                 case HEXADECIMAL:
228                         if(!isxdigit(c))
229                                 parse_error(c, state);
230                         break;
231
232                 case OCTAL:
233                         if(!isodigit(c))
234                                 parse_error(c, state);
235                         break;
236
237                 case FLOAT:
238                         if(c=='e' || c=='E')
239                                 state=FLOATEXPINIT;
240                         else if(!isdigit(c))
241                                 parse_error(c, state);
242                         break;
243
244                 case FLOATEXPINIT:
245                         if(c=='+' || c=='-')
246                                 state=FLOATEXPSIGN;
247                         else if(isdigit(c))
248                                 state=FLOATEXP;
249                         else
250                                 parse_error(c, state);
251                         break;
252
253                 case FLOATEXPSIGN:
254                         if(isdigit(c))
255                                 state=FLOATEXP;
256                         else
257                                 parse_error(c, state);
258                         break;
259
260                 case FLOATEXP:
261                         if(!isdigit(c))
262                                 parse_error(c, state);
263                         break;
264
265                 case STRING:
266                         if(c=='\\')
267                                 escape=!escape;
268                         else if(c=='"' && !escape)
269                                 return Token(Token::STRING, unescape_string(buf));
270                         else
271                                 escape=false;
272                         break;
273
274                 case IDENTIFIER:
275                         if(!isalpha(c) && !isdigit(c) && c!='_')
276                                 parse_error(c, state);
277                         break;
278
279                 default:
280                         throw Exception(get_location()+": Internal error (bad state)");
281                 }
282
283                 if(is_delimiter(next) && state>=ACCEPT)
284                         return Token(token_type[state], buf);
285         }
286
287         return Token(Token::SPECIAL, "");
288 }
289
290 bool TextParser::is_delimiter(int c)
291 {
292         return (isspace(c) || c=='{' || c=='}' || c==';' || c=='/');
293 }
294
295 bool TextParser::isodigit(int c)
296 {
297         return (c>='0' && c<='7');
298 }
299
300 string TextParser::unescape_string(const string &str)
301 {
302         string   result;
303         bool     escape=false;
304         unsigned hexcape=0;
305         for(string::const_iterator i=str.begin()+1; i!=str.end()-1; ++i)
306         {
307                 if(escape)
308                 {
309                         if(*i=='n')
310                                 result+='\n';
311                         else if(*i=='t')
312                                 result+='\t';
313                         else if(*i=='\\')
314                                 result+='\\';
315                         else if(*i=='"')
316                                 result+='"';
317                         else if(*i=='x')
318                                 hexcape=0x100;
319                         else
320                                 throw ParseError(format("%s: Invalid escape sequence '\\%c'", get_location(), *i), src, in.get_line_number());
321                         escape=false;
322                 }
323                 else if(hexcape)
324                 {
325                         unsigned digit=0;
326                         if(*i>='0' && *i<='9')
327                                 digit=*i-'0';
328                         else if(*i>='a' && *i<='f')
329                                 digit=*i-'a'+10;
330                         else if(*i>='A' && *i<='F')
331                                 digit=*i-'A'+10;
332                         else
333                                 throw ParseError(get_location()+": Invalid hex digit", src, in.get_line_number());
334
335                         hexcape=(hexcape<<4)|digit;
336                         if(hexcape&0x10000)
337                         {
338                                 result+=hexcape&0xFF;
339                                 hexcape=0;
340                         }
341                 }
342                 else if(*i=='\\')
343                         escape=true;
344                 else
345                         result+=*i;
346         }
347
348         return result;
349 }
350
351 string TextParser::get_location()
352 {
353         ostringstream ss;
354         ss<<src<<':'<<in.get_line_number();
355         return ss.str();
356 }
357
358 void TextParser::parse_error(int c, int state)
359 {
360         throw ParseError(format("%s: Parse error at '%c' (state %d)", get_location(), static_cast<char>(c), state), src, in.get_line_number());
361 }
362
363 } // namespace DataFile
364 } // namespace Msp