git.tdb.fi Git - libs/datafile.git/blob - source/textparser.cpp
Fix EOF handling
[libs/datafile.git] / source / textparser.cpp
1 /* $Id$
2
3 This file is part of libmspdatafile
4 Copyright © 2007  Mikko Rasa, Mikkosoft Productions
5 Distributed under the LGPL
6 */
7
8 #include <msp/strings/formatter.h>
9 #include <msp/strings/utils.h>
10 #include "input.h"
11 #include "textparser.h"
12 #include "token.h"
13
14 using namespace std;
15
16 namespace Msp {
17 namespace DataFile {
18
19 TextParser::TextParser(Input &i, const string &s):
20         ParserMode(i, s)
21 { }
22
23 Statement TextParser::parse()
24 {
25         return parse_statement(0);
26 }
27
28 Statement TextParser::parse_statement(const Token *t)
29 {
30         Statement result;
31         bool      sub=false;
32         bool      finish=false;
33
34         while(in)
35         {
36                 Token token;
37                 if(t)
38                 {
39                         token=*t;
40                         t=0;
41                 }
42                 else
43                         token=parse_token();
44
45                 if(result.keyword.empty())
46                 {
47                         if(token.str.empty())
48                                 break;
49                         else if(token.type!=Token::IDENTIFIER)
50                                 throw ParseError(format("%s: Syntax error at token '%s' (expected an identifier)", get_location(), token.str), src, in.get_line_number());
51                         result.keyword=token.str;
52                         result.valid=true;
53                         result.source=src;
54                         result.line=in.get_line_number();
55                 }
56                 else if(sub)
57                 {
58                         if(token.str=="}")
59                         {
60                                 sub=false;
61                                 finish=true;
62                         }
63                         else
64                         {
65                                 Statement ss=parse_statement(&token);
66                                 result.sub.push_back(ss);
67                         }
68                 }
69                 else if(finish)
70                 {
71                         if(token.str!=";")
72                                 throw ParseError(format("%s: Syntax error at token '%s' (Expected a ';')", get_location(), token.str), src, in.get_line_number());
73                         break;
74                 }
75                 else if(token.str=="{")
76                         sub=true;
77                 else if(token.str==";")
78                         break;
79                 else if(token.type==Token::INTEGER)
80                         result.args.push_back(Value(INTEGER, token.str));
81                 else if(token.type==Token::FLOAT)
82                         result.args.push_back(Value(FLOAT, token.str));
83                 else if(token.type==Token::STRING)
84                         result.args.push_back(Value(STRING, token.str));
85                 else if(token.type==Token::IDENTIFIER)
86                 {
87                         if(token.str=="true")
88                                 result.args.push_back(Value(BOOLEAN, "1"));
89                         else if(token.str=="false")
90                                 result.args.push_back(Value(BOOLEAN, "0"));
91                         else
92                                 result.args.push_back(Value(ENUM, token.str));
93                         //result.args.push_back(resolve_identifiertoken.str);
94                 }
95                 else if(token.str=="")
96                         throw ParseError(src+": Unexcepted end of input", src, in.get_line_number());
97                 else
98                         throw ParseError(get_location()+": Syntax error", src, in.get_line_number());
99         }
100
101         return result;
102 }
103
104 Token TextParser::parse_token()
105 {
106         int c=0;
107         int comment=0;
108
109         // Skip over comments and whitespace
110         while(in && comment>=0)
111         {
112                 c=in.get();
113                 int next=in.peek();
114
115                 if(c=='/' && next=='/')
116                         comment=1;
117                 else if(c=='/' && next=='*')
118                         comment=2;
119                 else if(c=='\n' && comment==1)
120                         comment=0;
121                 else if(c=='*' && next=='/' && comment==2)
122                         comment=3;
123                 else if(comment==3)   // Skip the second character of block comment end
124                         comment=0;
125                 else if(!isspace(c) && !comment)
126                         comment=-1;
127         }
128
129         if(comment>0)  // EOF while in comment
130                 throw ParseError(src+": Unfinished comment at end of input", src, in.get_line_number());
131         else if(comment==0)  // Didn't hit any non-whitespace
132                 return Token(Token::SPECIAL, "");
133
134         enum ParseState
135         {
136                 INIT,
137                 SIGN,
138                 FLOATEXPINIT,
139                 FLOATEXPSIGN,
140                 STRING,
141                 ACCEPT,
142                 ZERO,
143                 DECIMAL,
144                 HEXADECIMAL,
145                 OCTAL,
146                 FLOAT,
147                 FLOATEXP,
148                 IDENTIFIER
149         };
150
151         static Token::Type token_type[]=
152         {
153                 Token::SPECIAL,
154                 Token::SPECIAL,
155                 Token::SPECIAL,
156                 Token::SPECIAL,
157                 Token::STRING,
158                 Token::SPECIAL,
159                 Token::INTEGER,
160                 Token::INTEGER,
161                 Token::INTEGER,
162                 Token::INTEGER,
163                 Token::FLOAT,
164                 Token::FLOAT,
165                 Token::IDENTIFIER
166         };
167
168         ParseState state=INIT;
169         string     buf;
170         bool       escape=false;
171
172         while(in || state==INIT)
173         {
174                 if(state!=INIT)
175                         c=in.get();
176                 int next=in.peek();
177
178                 buf+=c;
179
180                 switch(state)
181                 {
182                 case INIT:
183                         if(c=='0')
184                                 state=ZERO;
185                         else if(c=='-' || c=='+')
186                                 state=SIGN;
187                         else if(c=='.')
188                                 state=FLOAT;
189                         else if(c=='"')
190                                 state=STRING;
191                         else if(c=='{' || c=='}' || c==';')
192                                 return Token(Token::SPECIAL, string(1, c));
193                         else if(isdigit(c))
194                                 state=DECIMAL;
195                         else if(isalpha(c) || c=='_')
196                                 state=IDENTIFIER;
197                         else
198                                 parse_error(c, state);
199                         break;
200
201                 case SIGN:
202                         if(c=='0')
203                                 state=ZERO;
204                         else if(isdigit(c))
205                                 state=DECIMAL;
206                         else if(c=='.')
207                                 state=FLOAT;
208                         else
209                                 parse_error(c, state);
210                         break;
211
212                 case ZERO:
213                         if(c=='x')
214                                 state=HEXADECIMAL;
215                         else if(isdigit(c))
216                                 state=OCTAL;
217                         else if(c=='.')
218                                 state=FLOAT;
219                         else
220                                 parse_error(c, state);
221                         break;
222
223                 case DECIMAL:
224                         if(c=='.')
225                                 state=FLOAT;
226                         else if(!isdigit(c))
227                                 parse_error(c, state);
228                         break;
229
230                 case HEXADECIMAL:
231                         if(!isxdigit(c))
232                                 parse_error(c, state);
233                         break;
234
235                 case OCTAL:
236                         if(!isodigit(c))
237                                 parse_error(c, state);
238                         break;
239
240                 case FLOAT:
241                         if(c=='e' || c=='E')
242                                 state=FLOATEXPINIT;
243                         else if(!isdigit(c))
244                                 parse_error(c, state);
245                         break;
246
247                 case FLOATEXPINIT:
248                         if(c=='+' || c=='-')
249                                 state=FLOATEXPSIGN;
250                         else if(isdigit(c))
251                                 state=FLOATEXP;
252                         else
253                                 parse_error(c, state);
254                         break;
255
256                 case FLOATEXPSIGN:
257                         if(isdigit(c))
258                                 state=FLOATEXP;
259                         else
260                                 parse_error(c, state);
261                         break;
262
263                 case FLOATEXP:
264                         if(!isdigit(c))
265                                 parse_error(c, state);
266                         break;
267
268                 case STRING:
269                         if(c=='\\')
270                                 escape=!escape;
271                         else if(c=='"' && !escape)
272                         {
273                                 try
274                                 {
275                                         return Token(Token::STRING, c_unescape(buf.substr(1, buf.size()-2)));
276                                 }
277                                 catch(const Exception &e)
278                                 {
279                                         throw ParseError(format("%s: %s", get_location(), e.what()), src, in.get_line_number());
280                                 }
281                         }
282                         else
283                                 escape=false;
284                         break;
285
286                 case IDENTIFIER:
287                         if(!isalpha(c) && !isdigit(c) && c!='_')
288                                 parse_error(c, state);
289                         break;
290
291                 default:
292                         throw Exception(get_location()+": Internal error (bad state)");
293                 }
294
295                 if(is_delimiter(next) && state>=ACCEPT)
296                         return Token(token_type[state], buf);
297         }
298
299         return Token(Token::SPECIAL, "");
300 }
301
302 bool TextParser::is_delimiter(int c)
303 {
304         return (isspace(c) || c=='{' || c=='}' || c==';' || c=='/');
305 }
306
307 bool TextParser::isodigit(int c)
308 {
309         return (c>='0' && c<='7');
310 }
311
312 string TextParser::get_location()
313 {
314         ostringstream ss;
315         ss<<src<<':'<<in.get_line_number();
316         return ss.str();
317 }
318
319 void TextParser::parse_error(int c, int state)
320 {
321         throw ParseError(format("%s: Parse error at '%c' (state %d)", get_location(), static_cast<char>(c), state), src, in.get_line_number());
322 }
323
324 } // namespace DataFile
325 } // namespace Msp