Some refactoring of TextParser logic
[libs/datafile.git] / source / textparser.cpp
1 #include <msp/strings/format.h>
2 #include <msp/strings/utils.h>
3 #include "except.h"
4 #include "input.h"
5 #include "textparser.h"
6 #include "token.h"
7
8 using namespace std;
9
10 namespace Msp {
11 namespace DataFile {
12
13 TextParser::TextParser(Input &i, const string &s):
14         ParserMode(i, s)
15 { }
16
17 Statement TextParser::parse()
18 {
19         return parse_statement(0);
20 }
21
22 Statement TextParser::parse_statement(const Token *t)
23 {
24         Statement result;
25         unsigned sub = 0;
26
27         while(in)
28         {
29                 Token token;
30                 if(t)
31                 {
32                         token = *t;
33                         t = 0;
34                 }
35                 else
36                         token = parse_token();
37
38                 if(result.keyword.empty())
39                 {
40                         if(token.str.empty())
41                                 break;
42                         else if(token.type!=Token::IDENTIFIER)
43                                 throw syntax_error(token.str);
44                         result.keyword = token.str;
45                         result.valid = true;
46                         result.source = src;
47                         result.line = in.get_line_number();
48                 }
49                 else if(sub==1)
50                 {
51                         if(token.str=="}")
52                                 sub = 2;
53                         else
54                         {
55                                 Statement ss = parse_statement(&token);
56                                 result.sub.push_back(ss);
57                         }
58                 }
59                 else if(sub==2)
60                 {
61                         if(token.str!=";")
62                                 throw syntax_error(token.str);
63                         break;
64                 }
65                 else if(token.str=="{")
66                         sub = 1;
67                 else if(token.str==";")
68                         break;
69                 else if(token.type==Token::INTEGER)
70                         result.append(lexical_cast<IntType::Store>(token.str));
71                 else if(token.type==Token::FLOAT)
72                         result.append(lexical_cast<FloatType::Store>(token.str));
73                 else if(token.type==Token::STRING)
74                         result.append(token.str);
75                 else if(token.type==Token::IDENTIFIER)
76                 {
77                         if(token.str=="true")
78                                 result.append(true);
79                         else if(token.str=="false")
80                                 result.append(false);
81                         else
82                                 result.append(Symbol(token.str));
83                 }
84                 else
85                         throw syntax_error(token.str);
86         }
87
88         return result;
89 }
90
91 Token TextParser::parse_token()
92 {
93         int c = 0;
94         int comment = 0;
95
96         // Skip over comments and whitespace
97         while(in && comment>=0)
98         {
99                 c = in.get();
100                 int next = in.peek();
101
102                 if(c=='/' && next=='/' && !comment)
103                         comment = 1;
104                 else if(c=='/' && next=='*' && !comment)
105                         comment = 2;
106                 else if(c=='\n' && comment==1)
107                         comment = 0;
108                 else if(c=='*' && next=='/' && comment==2)
109                         comment = 3;
110                 else if(comment==3)   // Skip the second character of block comment end
111                         comment = 0;
112                 else if(c!=-1 && !isspace(c) && !comment)
113                         comment = -1;
114         }
115
116         if(comment>0)  // EOF while in comment
117                 throw parse_error(string());
118         else if(comment==0)  // Didn't hit any non-whitespace
119                 return Token(Token::SPECIAL, "");
120
121         enum ParseState
122         {
123                 INIT,
124                 SIGN,
125                 FLOATEXPINIT,
126                 FLOATEXPSIGN,
127                 STRING,
128                 STRING_ESCAPE,
129                 ACCEPT,
130                 ZERO,
131                 DECIMAL,
132                 HEXADECIMAL,
133                 OCTAL,
134                 FLOAT,
135                 FLOATEXP,
136                 STRING_END,
137                 IDENTIFIER
138         };
139
140         static Token::Type token_type[]=
141         {
142                 Token::SPECIAL,
143                 Token::SPECIAL,
144                 Token::SPECIAL,
145                 Token::SPECIAL,
146                 Token::SPECIAL,
147                 Token::SPECIAL,
148                 Token::SPECIAL,
149                 Token::INTEGER,
150                 Token::INTEGER,
151                 Token::INTEGER,
152                 Token::INTEGER,
153                 Token::FLOAT,
154                 Token::FLOAT,
155                 Token::STRING,
156                 Token::IDENTIFIER
157         };
158
159         ParseState state = INIT;
160         string buf;
161
162         while(in || state==INIT)
163         {
164                 if(state!=INIT)
165                         c = in.get();
166                 int next = in.peek();
167
168                 buf += c;
169
170                 switch(state)
171                 {
172                 case INIT:
173                         if(c=='0')
174                                 state = ZERO;
175                         else if(c=='-' || c=='+')
176                                 state = SIGN;
177                         else if(c=='.')
178                                 state = FLOAT;
179                         else if(c=='"')
180                                 state = STRING;
181                         else if(c=='{' || c=='}' || c==';')
182                                 return Token(Token::SPECIAL, string(1, c));
183                         else if(isdigit(c))
184                                 state = DECIMAL;
185                         else if(isalpha(c) || c=='_' || c=='\\')
186                                 state = IDENTIFIER;
187                         else
188                                 throw parse_error(buf);
189                         break;
190
191                 case SIGN:
192                         if(c=='0')
193                                 state = ZERO;
194                         else if(isdigit(c))
195                                 state = DECIMAL;
196                         else if(c=='.')
197                                 state = FLOAT;
198                         else
199                                 throw parse_error(buf);
200                         break;
201
202                 case ZERO:
203                         if(c=='x')
204                                 state = HEXADECIMAL;
205                         else if(isdigit(c))
206                                 state = OCTAL;
207                         else if(c=='.')
208                                 state = FLOAT;
209                         else
210                                 throw parse_error(buf);
211                         break;
212
213                 case DECIMAL:
214                         if(c=='.')
215                                 state = FLOAT;
216                         else if(c=='e' || c=='E')
217                                 state = FLOATEXPINIT;
218                         else if(!isdigit(c))
219                                 throw parse_error(buf);
220                         break;
221
222                 case HEXADECIMAL:
223                         if(!isxdigit(c))
224                                 throw parse_error(buf);
225                         break;
226
227                 case OCTAL:
228                         if(!isodigit(c))
229                                 throw parse_error(buf);
230                         break;
231
232                 case FLOAT:
233                         if(c=='e' || c=='E')
234                                 state = FLOATEXPINIT;
235                         else if(!isdigit(c))
236                                 throw parse_error(buf);
237                         break;
238
239                 case FLOATEXPINIT:
240                         if(c=='+' || c=='-')
241                                 state = FLOATEXPSIGN;
242                         else if(isdigit(c))
243                                 state = FLOATEXP;
244                         else
245                                 throw parse_error(buf);
246                         break;
247
248                 case FLOATEXPSIGN:
249                         if(isdigit(c))
250                                 state = FLOATEXP;
251                         else
252                                 throw parse_error(buf);
253                         break;
254
255                 case FLOATEXP:
256                         if(!isdigit(c))
257                                 throw parse_error(buf);
258                         break;
259
260                 case STRING:
261                         if(c=='\\')
262                                 state = STRING_ESCAPE;
263                         else if(c=='"')
264                                 state = STRING_END;
265                         break;
266
267                 case STRING_ESCAPE:
268                         state = STRING;
269                         break;
270
271                 case IDENTIFIER:
272                         if(!isalpha(c) && !isdigit(c) && c!='_' && c!='-' && c!='/')
273                                 throw parse_error(buf);
274                         break;
275
276                 case STRING_END:
277                         throw parse_error(buf);
278
279                 default:
280                         throw logic_error("bad parser state");
281                 }
282
283                 if(is_delimiter(next) && state>=ACCEPT)
284                 {
285                         if(state==IDENTIFIER && buf[0]=='\\')
286                                 return Token(Token::IDENTIFIER, buf.substr(1));
287                         else if(state==STRING_END)
288                                 return Token(Token::STRING, c_unescape(buf.substr(1, buf.size()-2)));
289                         else
290                                 return Token(token_type[state], buf);
291                 }
292         }
293
294         return Token(Token::SPECIAL, "");
295 }
296
297 bool TextParser::is_delimiter(int c)
298 {
299         return (isspace(c) || c=='{' || c=='}' || c==';' || c=='/');
300 }
301
302 bool TextParser::isodigit(int c)
303 {
304         return (c>='0' && c<='7');
305 }
306
307 } // namespace DataFile
308 } // namespace Msp