Implement a parser mode for JSON files
author Mikko Rasa <tdb@tdb.fi>
Fri, 2 Sep 2016 12:34:48 +0000 (15:34 +0300)
committer Mikko Rasa <tdb@tdb.fi>
Fri, 2 Sep 2016 12:34:48 +0000 (15:34 +0300)
source/jsonparser.cpp [new file with mode: 0644]
source/jsonparser.h [new file with mode: 0644]
source/parser.cpp

diff --git a/source/jsonparser.cpp b/source/jsonparser.cpp
new file mode 100644 (file)
index 0000000..4b93703
--- /dev/null
@@ -0,0 +1,377 @@
+#include <msp/stringcodec/utf8.h>
+#include "except.h"
+#include "input.h"
+#include "jsonparser.h"
+
+using namespace std;
+
+namespace Msp {
+namespace DataFile {
+
+/* Sets up a JSON parsing mode.  Both arguments are forwarded unchanged to the
+ParserMode base class, and the top-level state starts out as STATE_INIT, i.e.
+nothing of the document has been read yet. */
+JsonParser::JsonParser(Input &i, const string &s):
+       ParserMode(i, s),
+       toplevel_state(STATE_INIT)
+{
+}
+
+Statement JsonParser::parse()
+{
+       if(toplevel_state==STATE_END)
+               return Statement();
+
+       bool was_init = (toplevel_state==STATE_INIT);
+       Token token = parse_token();
+       if(toplevel_state==STATE_INIT)
+       {
+               if(token.str=="[")
+                       toplevel_state = STATE_ARRAY;
+               else if(token.str=="{")
+                       toplevel_state = STATE_OBJECT;
+               else
+               {
+                       // TODO Standalone simple values; does anyone use them?
+                       toplevel_state = STATE_END;
+                       throw syntax_error(token.str);
+               }
+
+               token = parse_token();
+       }
+
+       if((toplevel_state==STATE_ARRAY && token.str=="]") || (toplevel_state==STATE_OBJECT && token.str=="}"))
+       {
+               toplevel_state = STATE_END;
+               return Statement();
+       }
+       else if(!was_init)
+       {
+               if(token.str!=",")
+                       throw syntax_error(token.str);
+
+               token = parse_token();
+       }
+
+       return parse_statement(&token, toplevel_state, string());
+}
+
+Statement JsonParser::parse_statement(const Token *t, State outer_state, const string &outer_kw)
+{
+       enum ParseState
+       {
+               INIT,
+               NAME,
+               VALUE,
+               ARRAY_INIT,
+               ARRAY,
+               ARRAY_ELEMENT,
+               OBJECT_INIT,
+               OBJECT,
+               OBJECT_MEMBER
+       };
+
+       Statement result;
+       ParseState state = INIT;
+
+       if(outer_state==STATE_ARRAY)
+       {
+               result.keyword = outer_kw+"[]";
+               state = VALUE;
+       }
+
+       while(in)
+       {
+               Token token;
+               if(t)
+               {
+                       token = *t;
+                       t = 0;
+               }
+               else
+                       token = parse_token();
+
+               if(!result.valid)
+               {
+                       result.valid = true;
+                       result.source = src;
+                       result.line = in.get_line_number();
+               }
+
+               if(state==INIT)
+               {
+                       if(token.type!=Token::STRING)
+                               throw syntax_error(token.str);
+
+                       result.keyword = token.str;
+                       state = NAME;
+               }
+               else if((state==ARRAY_INIT || state==ARRAY_ELEMENT) && token.str=="]")
+                       break;
+               else if((state==ARRAY_INIT || state==ARRAY))
+               {
+                       Statement ss = parse_statement(&token, STATE_ARRAY, result.keyword);
+                       result.sub.push_back(ss);
+                       state = ARRAY_ELEMENT;
+               }
+               else if(state==ARRAY_ELEMENT && token.str==",")
+                       state = ARRAY;
+               else if((state==OBJECT_INIT || state==OBJECT_MEMBER) && token.str=="}")
+                       break;
+               else if((state==OBJECT_INIT || state==OBJECT))
+               {
+                       Statement ss = parse_statement(&token, STATE_OBJECT, result.keyword);
+                       result.sub.push_back(ss);
+                       state = OBJECT_MEMBER;
+               }
+               else if(state==OBJECT_MEMBER && token.str==",")
+                       state = OBJECT;
+               else if(state==NAME && token.str==":")
+                       state = VALUE;
+               else if(state==VALUE)
+               {
+                       if(token.str=="[")
+                               state = ARRAY_INIT;
+                       else if(token.str=="{")
+                               state = OBJECT_INIT;
+                       else if(token.type!=Token::SPECIAL)
+                       {
+                               result.append_from_token(token);
+                               break;
+                       }
+                       else
+                               throw syntax_error(token.str);
+               }
+               else
+                       throw syntax_error(token.str);
+       }
+
+       return result;
+}
+
+Token JsonParser::parse_token()
+{
+       int c = 0;
+
+       while(in)
+       {
+               c = in.get();
+               if(!isspace(c))
+                       break;
+       }
+
+       if(!in)
+               return Token(Token::SPECIAL, "");
+
+       enum ParseState
+       {
+               INIT,
+               SIGN,
+               FLOATEXPINIT,
+               FLOATEXPSIGN,
+               STRING,
+               STRING_ESCAPE,
+               ACCEPT,
+               DECIMAL,
+               FLOAT,
+               FLOATEXP,
+               STRING_END,
+               IDENTIFIER
+       };
+
+       static Token::Type token_type[]=
+       {
+               Token::SPECIAL,
+               Token::SPECIAL,
+               Token::SPECIAL,
+               Token::SPECIAL,
+               Token::SPECIAL,
+               Token::SPECIAL,
+               Token::SPECIAL,
+               Token::INTEGER,
+               Token::FLOAT,
+               Token::FLOAT,
+               Token::STRING,
+               Token::IDENTIFIER
+       };
+
+       ParseState state = INIT;
+       string buf;
+
+       while(1)
+       {
+               if(state!=INIT)
+                       c = in.get();
+               int next = in.peek();
+
+               buf += c;
+
+               switch(state)
+               {
+               case INIT:
+                       if(c=='-' || c=='+')
+                               state = SIGN;
+                       else if(c=='.')
+                               state = FLOAT;
+                       else if(c=='"')
+                               state = STRING;
+                       else if(c=='{' || c=='}' || c=='[' || c==']' || c==':' || c==',')
+                               return Token(Token::SPECIAL, string(1, c));
+                       else if(isdigit(c))
+                               state = DECIMAL;
+                       else if(isalpha(c))
+                               state = IDENTIFIER;
+                       else
+                               throw parse_error(buf);
+                       break;
+
+               case SIGN:
+                       if(isdigit(c))
+                               state = DECIMAL;
+                       else if(c=='.')
+                               state = FLOAT;
+                       else
+                               throw parse_error(buf);
+                       break;
+
+               case DECIMAL:
+                       if(c=='.')
+                               state = FLOAT;
+                       else if(c=='e' || c=='E')
+                               state = FLOATEXPINIT;
+                       else if(!isdigit(c))
+                               throw parse_error(buf);
+                       break;
+
+               case FLOAT:
+                       if(c=='e' || c=='E')
+                               state = FLOATEXPINIT;
+                       else if(!isdigit(c))
+                               throw parse_error(buf);
+                       break;
+
+               case FLOATEXPINIT:
+                       if(c=='+' || c=='-')
+                               state = FLOATEXPSIGN;
+                       else if(isdigit(c))
+                               state = FLOATEXP;
+                       else
+                               throw parse_error(buf);
+                       break;
+
+               case FLOATEXPSIGN:
+                       if(isdigit(c))
+                               state = FLOATEXP;
+                       else
+                               throw parse_error(buf);
+                       break;
+
+               case FLOATEXP:
+                       if(!isdigit(c))
+                               throw parse_error(buf);
+                       break;
+
+               case STRING:
+                       if(c=='\\')
+                               state = STRING_ESCAPE;
+                       else if(c=='"')
+                               state = STRING_END;
+                       break;
+
+               case STRING_ESCAPE:
+                       state = STRING;
+                       break;
+
+               case IDENTIFIER:
+                       if(!isalpha(c))
+                               throw parse_error(buf);
+                       break;
+
+               case STRING_END:
+                       throw parse_error(buf);
+
+               default:
+                       throw logic_error("bad parser state");
+               }
+
+               if(is_delimiter(next) && state>=ACCEPT)
+               {
+                       if(state==STRING_END)
+                               return Token(Token::STRING, unescape(buf.substr(1, buf.size()-2)));
+                       else
+                               return Token(token_type[state], buf);
+               }
+       }
+}
+
+/* Tells whether a character can follow a complete token: whitespace or one
+of the JSON structural characters. */
+bool JsonParser::is_delimiter(int c)
+{
+       switch(c)
+       {
+       case '{':
+       case '}':
+       case '[':
+       case ']':
+       case ':':
+       case ',':
+               return true;
+       default:
+               return isspace(c);
+       }
+}
+
+string JsonParser::unescape(const string &str)
+{
+       string result;
+       StringCodec::Utf8::Decoder dec;
+       StringCodec::Utf8::Encoder enc;
+       bool escape = false;
+
+       for(string::const_iterator i=str.begin(); i!=str.end(); )
+       {
+               StringCodec::unichar c = dec.decode_char(str, i);
+
+               if(escape)
+               {
+                       if(c=='\"')
+                               enc.encode_char('\"', result);
+                       else if(c=='\\')
+                               enc.encode_char('\\', result);
+                       else if(c=='/')
+                               enc.encode_char('/', result);
+                       else if(c=='b')
+                               enc.encode_char('\b', result);
+                       else if(c=='f')
+                               enc.encode_char('\f', result);
+                       else if(c=='n')
+                               enc.encode_char('\n', result);
+                       else if(c=='r')
+                               enc.encode_char('\r', result);
+                       else if(c=='t')
+                               enc.encode_char('\t', result);
+                       else if(c=='u')
+                       {
+                               unsigned code = 0;
+                               for(unsigned n=0; n<4; ++n)
+                               {
+                                       if(i==str.end())
+                                               throw invalid_argument("JsonParser::unescape");
+
+                                       c = dec.decode_char(str, i);
+
+                                       unsigned digit = 0;
+                                       if(c>='0' && c<='9')
+                                               digit = c-'0';
+                                       else if(c>='a' && c<='f')
+                                               digit = c-'a'+10;
+                                       else if(c>='A' && c<='F')
+                                               digit = c-'A'+10;
+                                       else
+                                               throw invalid_argument("JsonParser::unescape");
+
+                                       code = (code<<4)+digit;
+                               }
+
+                               enc.encode_char(code, result);
+                       }
+                       else
+                               throw invalid_argument("JsonParser::unescape");
+
+                       escape = false;
+               }
+               else if(c=='\\')
+                       escape = true;
+               else
+                       enc.encode_char(c, result);
+       }
+
+       return result;
+}
+
+} // namespace DataFile
+} // namespace Msp
diff --git a/source/jsonparser.h b/source/jsonparser.h
new file mode 100644 (file)
index 0000000..611eea0
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef MSP_DATAFILE_JSONPARSER_H_
+#define MSP_DATAFILE_JSONPARSER_H_
+
+#include "parsermode.h"
+#include "token.h"
+
+namespace Msp {
+namespace DataFile {
+
+class JsonParser: public ParserMode
+{
+private:
+       enum State
+       {
+               STATE_INIT,
+               STATE_ARRAY,
+               STATE_OBJECT,
+               STATE_END
+       };
+
+       State toplevel_state;
+
+public:
+       JsonParser(Input &, const std::string &);
+
+       virtual Statement parse();
+private:
+       Statement parse_statement(const Token *, State, const std::string &);
+       Token parse_token();
+       bool is_delimiter(int);
+       std::string unescape(const std::string &);
+};
+
+} // namespace DataFile
+} // namespace Msp
+
+#endif
diff --git a/source/parser.cpp b/source/parser.cpp
index 5972e2d..7e4d3dd 100644 (file)
--- a/source/parser.cpp
+++ b/source/parser.cpp
@@ -1,6 +1,7 @@
 #include <msp/strings/format.h>
 #include "binaryparser.h"
 #include "except.h"
+#include "jsonparser.h"
 #include "parser.h"
 #include "statement.h"
 #include "textparser.h"
@@ -14,9 +15,14 @@ Parser::Parser(IO::Base &i, const string &s):
        in(i),
        main_src(s),
        src(s),
-       good(true),
-       mode(new TextParser(in, src))
-{ }
+       good(true)
+{
+       char c = in.peek();
+       if(c=='{' || c=='[')
+               mode = new JsonParser(in, src);
+       else
+               mode = new TextParser(in, src);
+}
 
 Parser::~Parser()
 {