]> git.tdb.fi Git - libs/datafile.git/blob - source/textparser.cpp
Don't throw on empty files
[libs/datafile.git] / source / textparser.cpp
1 /* $Id$
2
3 This file is part of libmspdatafile
4 Copyright © 2007-2008, 2010  Mikko Rasa, Mikkosoft Productions
5 Distributed under the LGPL
6 */
7
8 #include <msp/strings/formatter.h>
9 #include <msp/strings/utils.h>
10 #include "input.h"
11 #include "textparser.h"
12 #include "token.h"
13
14 using namespace std;
15
16 namespace Msp {
17 namespace DataFile {
18
19 TextParser::TextParser(Input &i, const string &s):
20         ParserMode(i, s)
21 { }
22
23 Statement TextParser::parse()
24 {
25         return parse_statement(0);
26 }
27
28 Statement TextParser::parse_statement(const Token *t)
29 {
30         Statement result;
31         bool      sub = false;
32         bool      finish = false;
33
34         while(in)
35         {
36                 Token token;
37                 if(t)
38                 {
39                         token = *t;
40                         t = 0;
41                 }
42                 else
43                         token = parse_token();
44
45                 if(result.keyword.empty())
46                 {
47                         if(token.str.empty())
48                                 break;
49                         else if(token.type!=Token::IDENTIFIER)
50                                 throw_at(ParseError(format("Syntax error at token '%s' (expected an identifier)", token.str)), get_location());
51                         result.keyword = token.str;
52                         result.valid = true;
53                         result.source = src;
54                         result.line = in.get_line_number();
55                 }
56                 else if(sub)
57                 {
58                         if(token.str=="}")
59                         {
60                                 sub = false;
61                                 finish = true;
62                         }
63                         else
64                         {
65                                 Statement ss = parse_statement(&token);
66                                 result.sub.push_back(ss);
67                         }
68                 }
69                 else if(finish)
70                 {
71                         if(token.str!=";")
72                                 throw_at(ParseError(format("Syntax error at token '%s' (Expected a ';')", token.str)), get_location());
73                         break;
74                 }
75                 else if(token.str=="{")
76                         sub = true;
77                 else if(token.str==";")
78                         break;
79                 else if(token.type==Token::INTEGER)
80                         result.append(lexical_cast<IntType::Store>(token.str));
81                 else if(token.type==Token::FLOAT)
82                         result.append(lexical_cast<FloatType::Store>(token.str));
83                 else if(token.type==Token::STRING)
84                         result.append(token.str);
85                 else if(token.type==Token::IDENTIFIER)
86                 {
87                         if(token.str=="true")
88                                 result.append(true);
89                         else if(token.str=="false")
90                                 result.append(false);
91                         else
92                                 result.append(Symbol(token.str));
93                 }
94                 else if(token.str=="")
95                         throw_at(ParseError("Unexcepted end of input"), get_location());
96                 else
97                         throw_at(ParseError("Syntax error"), get_location());
98         }
99
100         return result;
101 }
102
103 Token TextParser::parse_token()
104 {
105         int c = 0;
106         int comment = 0;
107
108         // Skip over comments and whitespace
109         while(in && comment>=0)
110         {
111                 c = in.get();
112                 int next = in.peek();
113
114                 if(c=='/' && next=='/')
115                         comment = 1;
116                 else if(c=='/' && next=='*')
117                         comment = 2;
118                 else if(c=='\n' && comment==1)
119                         comment = 0;
120                 else if(c=='*' && next=='/' && comment==2)
121                         comment = 3;
122                 else if(comment==3)   // Skip the second character of block comment end
123                         comment = 0;
124                 else if(c!=-1 && !isspace(c) && !comment)
125                         comment = -1;
126         }
127
128         if(comment>0)  // EOF while in comment
129                 throw_at(ParseError("Unfinished comment at end of input"), get_location());
130         else if(comment==0)  // Didn't hit any non-whitespace
131                 return Token(Token::SPECIAL, "");
132
133         enum ParseState
134         {
135                 INIT,
136                 SIGN,
137                 FLOATEXPINIT,
138                 FLOATEXPSIGN,
139                 STRING,
140                 ACCEPT,
141                 ZERO,
142                 DECIMAL,
143                 HEXADECIMAL,
144                 OCTAL,
145                 FLOAT,
146                 FLOATEXP,
147                 IDENTIFIER
148         };
149
150         static Token::Type token_type[]=
151         {
152                 Token::SPECIAL,
153                 Token::SPECIAL,
154                 Token::SPECIAL,
155                 Token::SPECIAL,
156                 Token::STRING,
157                 Token::SPECIAL,
158                 Token::INTEGER,
159                 Token::INTEGER,
160                 Token::INTEGER,
161                 Token::INTEGER,
162                 Token::FLOAT,
163                 Token::FLOAT,
164                 Token::IDENTIFIER
165         };
166
167         ParseState state = INIT;
168         string     buf;
169         bool       escape = false;
170
171         while(in || state==INIT)
172         {
173                 if(state!=INIT)
174                         c = in.get();
175                 int next = in.peek();
176
177                 buf += c;
178
179                 switch(state)
180                 {
181                 case INIT:
182                         if(c=='0')
183                                 state = ZERO;
184                         else if(c=='-' || c=='+')
185                                 state = SIGN;
186                         else if(c=='.')
187                                 state = FLOAT;
188                         else if(c=='"')
189                                 state = STRING;
190                         else if(c=='{' || c=='}' || c==';')
191                                 return Token(Token::SPECIAL, string(1, c));
192                         else if(isdigit(c))
193                                 state = DECIMAL;
194                         else if(isalpha(c) || c=='_')
195                                 state = IDENTIFIER;
196                         else
197                                 parse_error(c, "0-9A-Za-z_.\"{};+-");
198                         break;
199
200                 case SIGN:
201                         if(c=='0')
202                                 state = ZERO;
203                         else if(isdigit(c))
204                                 state = DECIMAL;
205                         else if(c=='.')
206                                 state = FLOAT;
207                         else
208                                 parse_error(c, "0-9.");
209                         break;
210
211                 case ZERO:
212                         if(c=='x')
213                                 state = HEXADECIMAL;
214                         else if(isdigit(c))
215                                 state = OCTAL;
216                         else if(c=='.')
217                                 state = FLOAT;
218                         else
219                                 parse_error(c, "0-9A-Fa-f.");
220                         break;
221
222                 case DECIMAL:
223                         if(c=='.')
224                                 state = FLOAT;
225                         else if(!isdigit(c))
226                                 parse_error(c, "0-9.");
227                         break;
228
229                 case HEXADECIMAL:
230                         if(!isxdigit(c))
231                                 parse_error(c, "0-9A-Fa-f");
232                         break;
233
234                 case OCTAL:
235                         if(!isodigit(c))
236                                 parse_error(c, "0-7");
237                         break;
238
239                 case FLOAT:
240                         if(c=='e' || c=='E')
241                                 state = FLOATEXPINIT;
242                         else if(!isdigit(c))
243                                 parse_error(c, "0-9Ee");
244                         break;
245
246                 case FLOATEXPINIT:
247                         if(c=='+' || c=='-')
248                                 state = FLOATEXPSIGN;
249                         else if(isdigit(c))
250                                 state = FLOATEXP;
251                         else
252                                 parse_error(c, "0-9+-");
253                         break;
254
255                 case FLOATEXPSIGN:
256                         if(isdigit(c))
257                                 state = FLOATEXP;
258                         else
259                                 parse_error(c, "0-9");
260                         break;
261
262                 case FLOATEXP:
263                         if(!isdigit(c))
264                                 parse_error(c, "0-9");
265                         break;
266
267                 case STRING:
268                         if(c=='\\')
269                                 escape = !escape;
270                         else if(c=='"' && !escape)
271                         {
272                                 try
273                                 {
274                                         return Token(Token::STRING, c_unescape(buf.substr(1, buf.size()-2)));
275                                 }
276                                 catch(Exception &e)
277                                 {
278                                         e.at(get_location());
279                                         throw;
280                                 }
281                         }
282                         else
283                                 escape = false;
284                         break;
285
286                 case IDENTIFIER:
287                         if(!isalpha(c) && !isdigit(c) && c!='_')
288                                 parse_error(c, "0-9A-Za-z_");
289                         break;
290
291                 default:
292                         throw_at(InvalidState("Internal error (bad state)"), get_location());
293                 }
294
295                 if(is_delimiter(next) && state>=ACCEPT)
296                         return Token(token_type[state], buf);
297         }
298
299         return Token(Token::SPECIAL, "");
300 }
301
302 bool TextParser::is_delimiter(int c)
303 {
304         return (isspace(c) || c=='{' || c=='}' || c==';' || c=='/');
305 }
306
307 bool TextParser::isodigit(int c)
308 {
309         return (c>='0' && c<='7');
310 }
311
312 string TextParser::get_location()
313 {
314         ostringstream ss;
315         ss<<src<<':'<<in.get_line_number();
316         return ss.str();
317 }
318
319 void TextParser::parse_error(int c, const char *e)
320 {
321         throw_at(ParseError(format("Parse error at '%c', expected one of \"%s\"", static_cast<char>(c), e)), get_location());
322 }
323
324 } // namespace DataFile
325 } // namespace Msp