git.tdb.fi Git - libs/datafile.git/blob - source/textparser.cpp
Cosmetic changes
[libs/datafile.git] / source / textparser.cpp
1 #include <msp/strings/format.h>
2 #include <msp/strings/utils.h>
3 #include "except.h"
4 #include "input.h"
5 #include "textparser.h"
6 #include "token.h"
7
8 using namespace std;
9
10 namespace Msp {
11 namespace DataFile {
12
13 TextParser::TextParser(Input &i, const string &s):
14         ParserMode(i, s)
15 { }
16
17 Statement TextParser::parse()
18 {
19         return parse_statement(nullptr);
20 }
21
22 Statement TextParser::parse_statement(const Token *t)
23 {
24         Statement result;
25         unsigned sub = 0;
26
27         while(in)
28         {
29                 Token token;
30                 if(t)
31                 {
32                         token = *t;
33                         t = nullptr;
34                 }
35                 else
36                         token = parse_token();
37
38                 if(result.keyword.empty())
39                 {
40                         if(token.str.empty())
41                                 break;
42                         else if(token.type!=Token::IDENTIFIER)
43                                 throw syntax_error(token.str);
44                         result.keyword = token.str;
45                         result.valid = true;
46                         result.source = src;
47                         result.line = in.get_line_number();
48                 }
49                 else if(sub==1)
50                 {
51                         if(token.str=="}")
52                                 sub = 2;
53                         else
54                         {
55                                 Statement ss = parse_statement(&token);
56                                 result.sub.push_back(ss);
57                         }
58                 }
59                 else if(sub==2)
60                 {
61                         if(token.str!=";")
62                                 throw syntax_error(token.str);
63                         break;
64                 }
65                 else if(token.str=="{")
66                         sub = 1;
67                 else if(token.str==";")
68                         break;
69                 else if(token.type!=Token::SPECIAL)
70                         result.append_from_token(token);
71                 else
72                         throw syntax_error(token.str);
73         }
74
75         return result;
76 }
77
78 Token TextParser::parse_token()
79 {
80         int c = 0;
81         int comment = 0;
82
83         // Skip over comments and whitespace
84         while(in && comment>=0)
85         {
86                 c = in.get();
87                 int next = in.peek();
88
89                 if(c=='/' && next=='/' && !comment)
90                         comment = 1;
91                 else if(c=='/' && next=='*' && !comment)
92                         comment = 2;
93                 else if(c=='\n' && comment==1)
94                         comment = 0;
95                 else if(c=='*' && next=='/' && comment==2)
96                         comment = 3;
97                 else if(comment==3)   // Skip the second character of block comment end
98                         comment = 0;
99                 else if(c!=-1 && !isspace(c) && !comment)
100                         comment = -1;
101         }
102
103         if(comment>0)  // EOF while in comment
104                 throw parse_error(string());
105         else if(comment==0)  // Didn't hit any non-whitespace
106                 return Token(Token::SPECIAL, "");
107
108         enum ParseState
109         {
110                 INIT,
111                 SIGN,
112                 FLOATEXPINIT,
113                 FLOATEXPSIGN,
114                 STRING,
115                 STRING_ESCAPE,
116                 STRING_BASE64,
117                 ACCEPT,
118                 ZERO,
119                 DECIMAL,
120                 HEXADECIMAL,
121                 OCTAL,
122                 FLOAT,
123                 FLOATEXP,
124                 STRING_END,
125                 STRING_BASE64_PAD,
126                 IDENTIFIER
127         };
128
129         static Token::Type token_type[]=
130         {
131                 Token::SPECIAL,
132                 Token::SPECIAL,
133                 Token::SPECIAL,
134                 Token::SPECIAL,
135                 Token::SPECIAL,
136                 Token::SPECIAL,
137                 Token::SPECIAL,
138                 Token::SPECIAL,
139                 Token::INTEGER,
140                 Token::INTEGER,
141                 Token::INTEGER,
142                 Token::INTEGER,
143                 Token::FLOAT,
144                 Token::FLOAT,
145                 Token::STRING,
146                 Token::STRING,
147                 Token::IDENTIFIER
148         };
149
150         ParseState state = INIT;
151         string buf;
152
153         while(in || state==INIT)
154         {
155                 if(state!=INIT)
156                         c = in.get();
157                 int next = in.peek();
158
159                 buf += c;
160
161                 switch(state)
162                 {
163                 case INIT:
164                         if(c=='0')
165                                 state = ZERO;
166                         else if(c=='-' || c=='+')
167                                 state = SIGN;
168                         else if(c=='.')
169                                 state = FLOAT;
170                         else if(c=='"')
171                                 state = STRING;
172                         else if(c=='=')
173                                 state = STRING_BASE64;
174                         else if(c=='{' || c=='}' || c==';')
175                                 return Token(Token::SPECIAL, string(1, c));
176                         else if(isdigit(c))
177                                 state = DECIMAL;
178                         else if(isalpha(c) || c=='_' || c=='\\')
179                                 state = IDENTIFIER;
180                         else
181                                 throw parse_error(buf);
182                         break;
183
184                 case SIGN:
185                         if(c=='0')
186                                 state = ZERO;
187                         else if(isdigit(c))
188                                 state = DECIMAL;
189                         else if(c=='.')
190                                 state = FLOAT;
191                         else
192                                 throw parse_error(buf);
193                         break;
194
195                 case ZERO:
196                         if(c=='x')
197                                 state = HEXADECIMAL;
198                         else if(isdigit(c))
199                                 state = OCTAL;
200                         else if(c=='.')
201                                 state = FLOAT;
202                         else
203                                 throw parse_error(buf);
204                         break;
205
206                 case DECIMAL:
207                         if(c=='.')
208                                 state = FLOAT;
209                         else if(c=='e' || c=='E')
210                                 state = FLOATEXPINIT;
211                         else if(!isdigit(c))
212                                 throw parse_error(buf);
213                         break;
214
215                 case HEXADECIMAL:
216                         if(!isxdigit(c))
217                                 throw parse_error(buf);
218                         break;
219
220                 case OCTAL:
221                         if(!isodigit(c))
222                                 throw parse_error(buf);
223                         break;
224
225                 case FLOAT:
226                         if(c=='e' || c=='E')
227                                 state = FLOATEXPINIT;
228                         else if(!isdigit(c))
229                                 throw parse_error(buf);
230                         break;
231
232                 case FLOATEXPINIT:
233                         if(c=='+' || c=='-')
234                                 state = FLOATEXPSIGN;
235                         else if(isdigit(c))
236                                 state = FLOATEXP;
237                         else
238                                 throw parse_error(buf);
239                         break;
240
241                 case FLOATEXPSIGN:
242                         if(isdigit(c))
243                                 state = FLOATEXP;
244                         else
245                                 throw parse_error(buf);
246                         break;
247
248                 case FLOATEXP:
249                         if(!isdigit(c))
250                                 throw parse_error(buf);
251                         break;
252
253                 case STRING:
254                         if(c=='\\')
255                                 state = STRING_ESCAPE;
256                         else if(c=='"')
257                                 state = STRING_END;
258                         break;
259
260                 case STRING_ESCAPE:
261                         state = STRING;
262                         break;
263
264                 case STRING_BASE64:
265                         if(c=='=')
266                                 state = STRING_BASE64_PAD;
267                         else if(!isalnum(c) && c!='+' && c!='/')
268                                 throw parse_error(buf);
269                         break;
270
271                 case STRING_BASE64_PAD:
272                         if(c!='=')
273                                 throw parse_error(buf);
274                         break;
275
276                 case IDENTIFIER:
277                         if(!isalpha(c) && !isdigit(c) && c!='_' && c!='-' && c!='/')
278                                 throw parse_error(buf);
279                         break;
280
281                 case STRING_END:
282                         throw parse_error(buf);
283
284                 default:
285                         throw logic_error("bad parser state");
286                 }
287
288                 if(is_delimiter(next) && state>=ACCEPT)
289                 {
290                         if(state==IDENTIFIER && buf[0]=='\\')
291                                 return Token(Token::IDENTIFIER, buf.substr(1));
292                         else if(state==STRING_END)
293                                 return Token(Token::STRING, c_unescape(buf.substr(1, buf.size()-2)));
294                         else if(state==STRING_BASE64 || state==STRING_BASE64_PAD)
295                                 return Token(Token::STRING, base64_decode(buf));
296                         else
297                                 return Token(token_type[state], buf);
298                 }
299         }
300
301         return Token(Token::SPECIAL, "");
302 }
303
304 bool TextParser::is_delimiter(int c)
305 {
306         return (isspace(c) || c=='{' || c=='}' || c==';' || c=='/');
307 }
308
309 bool TextParser::isodigit(int c)
310 {
311         return (c>='0' && c<='7');
312 }
313
314 string TextParser::base64_decode(const string &data)
315 {
316         string bin;
317         bin.reserve(data.size()*3/4);
318         unsigned accum = 0;
319         unsigned a_bits = 0;
320         for(char c: data)
321         {
322                 unsigned d;
323                 if(c>='A' && c<='Z')
324                         d = c-'A';
325                 else if(c>='a' && c<='z')
326                         d = 26+(c-'a');
327                 else if(c>='0' && c<='9')
328                         d = 52+(c-'0');
329                 else if(c=='+')
330                         d = 62;
331                 else if(c=='/')
332                         d = 63;
333                 else if(c=='=')
334                         continue;
335                 else
336                         throw invalid_argument("TextParser::base64_decode");
337
338                 accum = (accum<<6)|d;
339                 a_bits += 6;
340
341                 if(a_bits>=8)
342                 {
343                         bin += (accum>>(a_bits-8))&0xFF;
344                         a_bits -= 8;
345                 }
346         }
347
348         return bin;
349 }
350
351 } // namespace DataFile
352 } // namespace Msp