]> git.tdb.fi Git - libs/datafile.git/blob - source/jsonparser.cpp
Implement a parser mode for JSON files
[libs/datafile.git] / source / jsonparser.cpp
1 #include <msp/stringcodec/utf8.h>
2 #include "except.h"
3 #include "input.h"
4 #include "jsonparser.h"
5
6 using namespace std;
7
8 namespace Msp {
9 namespace DataFile {
10
11 JsonParser::JsonParser(Input &i, const string &s):
12         ParserMode(i, s),
13         toplevel_state(STATE_INIT)
14 { }
15
16 Statement JsonParser::parse()
17 {
18         if(toplevel_state==STATE_END)
19                 return Statement();
20
21         bool was_init = (toplevel_state==STATE_INIT);
22         Token token = parse_token();
23         if(toplevel_state==STATE_INIT)
24         {
25                 if(token.str=="[")
26                         toplevel_state = STATE_ARRAY;
27                 else if(token.str=="{")
28                         toplevel_state = STATE_OBJECT;
29                 else
30                 {
31                         // TODO Standalone simple values; does anyone use them?
32                         toplevel_state = STATE_END;
33                         throw syntax_error(token.str);
34                 }
35
36                 token = parse_token();
37         }
38
39         if((toplevel_state==STATE_ARRAY && token.str=="]") || (toplevel_state==STATE_OBJECT && token.str=="}"))
40         {
41                 toplevel_state = STATE_END;
42                 return Statement();
43         }
44         else if(!was_init)
45         {
46                 if(token.str!=",")
47                         throw syntax_error(token.str);
48
49                 token = parse_token();
50         }
51
52         return parse_statement(&token, toplevel_state, string());
53 }
54
55 Statement JsonParser::parse_statement(const Token *t, State outer_state, const string &outer_kw)
56 {
57         enum ParseState
58         {
59                 INIT,
60                 NAME,
61                 VALUE,
62                 ARRAY_INIT,
63                 ARRAY,
64                 ARRAY_ELEMENT,
65                 OBJECT_INIT,
66                 OBJECT,
67                 OBJECT_MEMBER
68         };
69
70         Statement result;
71         ParseState state = INIT;
72
73         if(outer_state==STATE_ARRAY)
74         {
75                 result.keyword = outer_kw+"[]";
76                 state = VALUE;
77         }
78
79         while(in)
80         {
81                 Token token;
82                 if(t)
83                 {
84                         token = *t;
85                         t = 0;
86                 }
87                 else
88                         token = parse_token();
89
90                 if(!result.valid)
91                 {
92                         result.valid = true;
93                         result.source = src;
94                         result.line = in.get_line_number();
95                 }
96
97                 if(state==INIT)
98                 {
99                         if(token.type!=Token::STRING)
100                                 throw syntax_error(token.str);
101
102                         result.keyword = token.str;
103                         state = NAME;
104                 }
105                 else if((state==ARRAY_INIT || state==ARRAY_ELEMENT) && token.str=="]")
106                         break;
107                 else if((state==ARRAY_INIT || state==ARRAY))
108                 {
109                         Statement ss = parse_statement(&token, STATE_ARRAY, result.keyword);
110                         result.sub.push_back(ss);
111                         state = ARRAY_ELEMENT;
112                 }
113                 else if(state==ARRAY_ELEMENT && token.str==",")
114                         state = ARRAY;
115                 else if((state==OBJECT_INIT || state==OBJECT_MEMBER) && token.str=="}")
116                         break;
117                 else if((state==OBJECT_INIT || state==OBJECT))
118                 {
119                         Statement ss = parse_statement(&token, STATE_OBJECT, result.keyword);
120                         result.sub.push_back(ss);
121                         state = OBJECT_MEMBER;
122                 }
123                 else if(state==OBJECT_MEMBER && token.str==",")
124                         state = OBJECT;
125                 else if(state==NAME && token.str==":")
126                         state = VALUE;
127                 else if(state==VALUE)
128                 {
129                         if(token.str=="[")
130                                 state = ARRAY_INIT;
131                         else if(token.str=="{")
132                                 state = OBJECT_INIT;
133                         else if(token.type!=Token::SPECIAL)
134                         {
135                                 result.append_from_token(token);
136                                 break;
137                         }
138                         else
139                                 throw syntax_error(token.str);
140                 }
141                 else
142                         throw syntax_error(token.str);
143         }
144
145         return result;
146 }
147
148 Token JsonParser::parse_token()
149 {
150         int c = 0;
151
152         while(in)
153         {
154                 c = in.get();
155                 if(!isspace(c))
156                         break;
157         }
158
159         if(!in)
160                 return Token(Token::SPECIAL, "");
161
162         enum ParseState
163         {
164                 INIT,
165                 SIGN,
166                 FLOATEXPINIT,
167                 FLOATEXPSIGN,
168                 STRING,
169                 STRING_ESCAPE,
170                 ACCEPT,
171                 DECIMAL,
172                 FLOAT,
173                 FLOATEXP,
174                 STRING_END,
175                 IDENTIFIER
176         };
177
178         static Token::Type token_type[]=
179         {
180                 Token::SPECIAL,
181                 Token::SPECIAL,
182                 Token::SPECIAL,
183                 Token::SPECIAL,
184                 Token::SPECIAL,
185                 Token::SPECIAL,
186                 Token::SPECIAL,
187                 Token::INTEGER,
188                 Token::FLOAT,
189                 Token::FLOAT,
190                 Token::STRING,
191                 Token::IDENTIFIER
192         };
193
194         ParseState state = INIT;
195         string buf;
196
197         while(1)
198         {
199                 if(state!=INIT)
200                         c = in.get();
201                 int next = in.peek();
202
203                 buf += c;
204
205                 switch(state)
206                 {
207                 case INIT:
208                         if(c=='-' || c=='+')
209                                 state = SIGN;
210                         else if(c=='.')
211                                 state = FLOAT;
212                         else if(c=='"')
213                                 state = STRING;
214                         else if(c=='{' || c=='}' || c=='[' || c==']' || c==':' || c==',')
215                                 return Token(Token::SPECIAL, string(1, c));
216                         else if(isdigit(c))
217                                 state = DECIMAL;
218                         else if(isalpha(c))
219                                 state = IDENTIFIER;
220                         else
221                                 throw parse_error(buf);
222                         break;
223
224                 case SIGN:
225                         if(isdigit(c))
226                                 state = DECIMAL;
227                         else if(c=='.')
228                                 state = FLOAT;
229                         else
230                                 throw parse_error(buf);
231                         break;
232
233                 case DECIMAL:
234                         if(c=='.')
235                                 state = FLOAT;
236                         else if(c=='e' || c=='E')
237                                 state = FLOATEXPINIT;
238                         else if(!isdigit(c))
239                                 throw parse_error(buf);
240                         break;
241
242                 case FLOAT:
243                         if(c=='e' || c=='E')
244                                 state = FLOATEXPINIT;
245                         else if(!isdigit(c))
246                                 throw parse_error(buf);
247                         break;
248
249                 case FLOATEXPINIT:
250                         if(c=='+' || c=='-')
251                                 state = FLOATEXPSIGN;
252                         else if(isdigit(c))
253                                 state = FLOATEXP;
254                         else
255                                 throw parse_error(buf);
256                         break;
257
258                 case FLOATEXPSIGN:
259                         if(isdigit(c))
260                                 state = FLOATEXP;
261                         else
262                                 throw parse_error(buf);
263                         break;
264
265                 case FLOATEXP:
266                         if(!isdigit(c))
267                                 throw parse_error(buf);
268                         break;
269
270                 case STRING:
271                         if(c=='\\')
272                                 state = STRING_ESCAPE;
273                         else if(c=='"')
274                                 state = STRING_END;
275                         break;
276
277                 case STRING_ESCAPE:
278                         state = STRING;
279                         break;
280
281                 case IDENTIFIER:
282                         if(!isalpha(c))
283                                 throw parse_error(buf);
284                         break;
285
286                 case STRING_END:
287                         throw parse_error(buf);
288
289                 default:
290                         throw logic_error("bad parser state");
291                 }
292
293                 if(is_delimiter(next) && state>=ACCEPT)
294                 {
295                         if(state==STRING_END)
296                                 return Token(Token::STRING, unescape(buf.substr(1, buf.size()-2)));
297                         else
298                                 return Token(token_type[state], buf);
299                 }
300         }
301 }
302
303 bool JsonParser::is_delimiter(int c)
304 {
305         return (isspace(c) || c=='{' || c=='}' || c=='[' || c==']' || c==':' || c==',');
306 }
307
308 string JsonParser::unescape(const string &str)
309 {
310         string result;
311         StringCodec::Utf8::Decoder dec;
312         StringCodec::Utf8::Encoder enc;
313         bool escape = false;
314
315         for(string::const_iterator i=str.begin(); i!=str.end(); )
316         {
317                 StringCodec::unichar c = dec.decode_char(str, i);
318
319                 if(escape)
320                 {
321                         if(c=='\"')
322                                 enc.encode_char('\"', result);
323                         else if(c=='\\')
324                                 enc.encode_char('\\', result);
325                         else if(c=='/')
326                                 enc.encode_char('/', result);
327                         else if(c=='b')
328                                 enc.encode_char('\b', result);
329                         else if(c=='f')
330                                 enc.encode_char('\f', result);
331                         else if(c=='n')
332                                 enc.encode_char('\n', result);
333                         else if(c=='r')
334                                 enc.encode_char('\r', result);
335                         else if(c=='t')
336                                 enc.encode_char('\t', result);
337                         else if(c=='u')
338                         {
339                                 unsigned code = 0;
340                                 for(unsigned n=0; n<4; ++n)
341                                 {
342                                         if(i==str.end())
343                                                 throw invalid_argument("JsonParser::unescape");
344
345                                         c = dec.decode_char(str, i);
346
347                                         unsigned digit = 0;
348                                         if(c>='0' && c<='9')
349                                                 digit = c-'0';
350                                         else if(c>='a' && c<='f')
351                                                 digit = c-'a'+10;
352                                         else if(c>='A' && c<='F')
353                                                 digit = c-'A'+10;
354                                         else
355                                                 throw invalid_argument("JsonParser::unescape");
356
357                                         code = (code<<4)+digit;
358                                 }
359
360                                 enc.encode_char(code, result);
361                         }
362                         else
363                                 throw invalid_argument("JsonParser::unescape");
364
365                         escape = false;
366                 }
367                 else if(c=='\\')
368                         escape = true;
369                 else
370                         enc.encode_char(c, result);
371         }
372
373         return result;
374 }
375
376 } // namespace DataFile
377 } // namespace Msp