From 705aca1ca0f63e7314a25f528e6e7c76765c04b8 Mon Sep 17 00:00:00 2001
From: Mikko Rasa
Date: Fri, 2 Sep 2016 15:34:48 +0300
Subject: [PATCH] Implement a parser mode for JSON files

---
 source/jsonparser.cpp | 422 ++++++++++++++++++++++++++++++++++++++++++
 source/jsonparser.h   |  41 +++++
 source/parser.cpp     |  12 +-
 3 files changed, 472 insertions(+), 3 deletions(-)
 create mode 100644 source/jsonparser.cpp
 create mode 100644 source/jsonparser.h

diff --git a/source/jsonparser.cpp b/source/jsonparser.cpp
new file mode 100644
index 0000000..4b93703
--- /dev/null
+++ b/source/jsonparser.cpp
@@ -0,0 +1,422 @@
+#include <msp/stringcodec/utf8.h>
+#include "except.h"
+#include "input.h"
+#include "jsonparser.h"
+
+using namespace std;
+
+namespace Msp {
+namespace DataFile {
+
+/* Tells whether a token is the given punctuation mark.  Checking the type as
+well as the text keeps a string value such as "]" or "," from being mistaken
+for a structural token. */
+static bool is_special(const Token &token, const char *mark)
+{
+	return (token.type==Token::SPECIAL && token.str==mark);
+}
+
+// Starts in STATE_INIT; the kind of the top-level container is determined by
+// the first call to parse().
+JsonParser::JsonParser(Input &i, const string &s):
+	ParserMode(i, s),
+	toplevel_state(STATE_INIT)
+{ }
+
+/* Returns the next element of the top-level array or object as a Statement.
+The opening bracket is consumed on the first call and the separating comma on
+each later call.  Once the closing bracket has been read, this and every
+subsequent call return a default-constructed Statement.  Throws syntax_error
+if the input does not have the expected structure. */
+Statement JsonParser::parse()
+{
+	if(toplevel_state==STATE_END)
+		return Statement();
+
+	bool was_init = (toplevel_state==STATE_INIT);
+	Token token = parse_token();
+	if(toplevel_state==STATE_INIT)
+	{
+		if(is_special(token, "["))
+			toplevel_state = STATE_ARRAY;
+		else if(is_special(token, "{"))
+			toplevel_state = STATE_OBJECT;
+		else
+		{
+			// TODO Standalone simple values; does anyone use them?
+			toplevel_state = STATE_END;
+			throw syntax_error(token.str);
+		}
+
+		token = parse_token();
+	}
+
+	if((toplevel_state==STATE_ARRAY && is_special(token, "]")) || (toplevel_state==STATE_OBJECT && is_special(token, "}")))
+	{
+		toplevel_state = STATE_END;
+		return Statement();
+	}
+	else if(!was_init)
+	{
+		// Every element except the first must be preceded by a comma
+		if(!is_special(token, ","))
+			throw syntax_error(token.str);
+
+		token = parse_token();
+	}
+
+	return parse_statement(&token, toplevel_state, string());
+}
+
+/* Parses one array element or object member.  The first token is taken from
+t if it is non-null; all further tokens are read from the input.  In an array
+context the result's keyword is outer_kw with "[]" appended; otherwise it is
+the member's name.  A simple value is added with append_from_token, while the
+contents of a nested array or object become sub-statements.  Throws
+syntax_error on an unexpected token. */
+Statement JsonParser::parse_statement(const Token *t, State outer_state, const string &outer_kw)
+{
+	enum ParseState
+	{
+		INIT,
+		NAME,
+		VALUE,
+		ARRAY_INIT,
+		ARRAY,
+		ARRAY_ELEMENT,
+		OBJECT_INIT,
+		OBJECT,
+		OBJECT_MEMBER
+	};
+
+	Statement result;
+	ParseState state = INIT;
+
+	if(outer_state==STATE_ARRAY)
+	{
+		result.keyword = outer_kw+"[]";
+		state = VALUE;
+	}
+
+	while(in)
+	{
+		Token token;
+		if(t)
+		{
+			token = *t;
+			t = 0;
+		}
+		else
+			token = parse_token();
+
+		// Stamp the statement with its origin when the first token is seen
+		if(!result.valid)
+		{
+			result.valid = true;
+			result.source = src;
+			result.line = in.get_line_number();
+		}
+
+		if(state==INIT)
+		{
+			if(token.type!=Token::STRING)
+				throw syntax_error(token.str);
+
+			result.keyword = token.str;
+			state = NAME;
+		}
+		else if((state==ARRAY_INIT || state==ARRAY_ELEMENT) && is_special(token, "]"))
+			break;
+		else if((state==ARRAY_INIT || state==ARRAY))
+		{
+			Statement ss = parse_statement(&token, STATE_ARRAY, result.keyword);
+			result.sub.push_back(ss);
+			state = ARRAY_ELEMENT;
+		}
+		else if(state==ARRAY_ELEMENT && is_special(token, ","))
+			state = ARRAY;
+		else if((state==OBJECT_INIT || state==OBJECT_MEMBER) && is_special(token, "}"))
+			break;
+		else if((state==OBJECT_INIT || state==OBJECT))
+		{
+			Statement ss = parse_statement(&token, STATE_OBJECT, result.keyword);
+			result.sub.push_back(ss);
+			state = OBJECT_MEMBER;
+		}
+		else if(state==OBJECT_MEMBER && is_special(token, ","))
+			state = OBJECT;
+		else if(state==NAME && is_special(token, ":"))
+			state = VALUE;
+		else if(state==VALUE)
+		{
+			if(is_special(token, "["))
+				state = ARRAY_INIT;
+			else if(is_special(token, "{"))
+				state = OBJECT_INIT;
+			else if(token.type!=Token::SPECIAL)
+			{
+				result.append_from_token(token);
+				break;
+			}
+			else
+				throw syntax_error(token.str);
+		}
+		else
+			throw syntax_error(token.str);
+	}
+
+	return result;
+}
+
+/* Reads one token from the input, skipping leading whitespace.  Punctuation
+marks are returned as SPECIAL tokens and string literals as STRING tokens with
+quotes removed and escapes resolved.  At end of input a SPECIAL token with
+empty text is returned.  Throws parse_error on a malformed token or if the
+input ends in the middle of one. */
+Token JsonParser::parse_token()
+{
+	int c = 0;
+
+	// Skip whitespace preceding the token
+	while(in)
+	{
+		c = in.get();
+		if(!isspace(c))
+			break;
+	}
+
+	if(!in)
+		return Token(Token::SPECIAL, "");
+
+	// A token can end only in a state at or after ACCEPT, which itself is
+	// just a marker and is never entered
+	enum ParseState
+	{
+		INIT,
+		SIGN,
+		FLOATEXPINIT,
+		FLOATEXPSIGN,
+		STRING,
+		STRING_ESCAPE,
+		ACCEPT,
+		DECIMAL,
+		FLOAT,
+		FLOATEXP,
+		STRING_END,
+		IDENTIFIER
+	};
+
+	// Token type produced by each state, indexed by ParseState
+	static const Token::Type token_type[]=
+	{
+		Token::SPECIAL,
+		Token::SPECIAL,
+		Token::SPECIAL,
+		Token::SPECIAL,
+		Token::SPECIAL,
+		Token::SPECIAL,
+		Token::SPECIAL,
+		Token::INTEGER,
+		Token::FLOAT,
+		Token::FLOAT,
+		Token::STRING,
+		Token::IDENTIFIER
+	};
+
+	ParseState state = INIT;
+	string buf;
+
+	while(1)
+	{
+		if(state!=INIT)
+		{
+			c = in.get();
+			// Without this check an unterminated string literal would keep
+			// appending to buf forever once the input has run out
+			if(!in)
+				throw parse_error(buf);
+		}
+		int next = in.peek();
+
+		buf += c;
+
+		switch(state)
+		{
+		case INIT:
+			if(c=='-' || c=='+')
+				state = SIGN;
+			else if(c=='.')
+				state = FLOAT;
+			else if(c=='"')
+				state = STRING;
+			else if(c=='{' || c=='}' || c=='[' || c==']' || c==':' || c==',')
+				return Token(Token::SPECIAL, string(1, c));
+			else if(isdigit(c))
+				state = DECIMAL;
+			else if(isalpha(c))
+				state = IDENTIFIER;
+			else
+				throw parse_error(buf);
+			break;
+
+		case SIGN:
+			if(isdigit(c))
+				state = DECIMAL;
+			else if(c=='.')
+				state = FLOAT;
+			else
+				throw parse_error(buf);
+			break;
+
+		case DECIMAL:
+			if(c=='.')
+				state = FLOAT;
+			else if(c=='e' || c=='E')
+				state = FLOATEXPINIT;
+			else if(!isdigit(c))
+				throw parse_error(buf);
+			break;
+
+		case FLOAT:
+			if(c=='e' || c=='E')
+				state = FLOATEXPINIT;
+			else if(!isdigit(c))
+				throw parse_error(buf);
+			break;
+
+		case FLOATEXPINIT:
+			if(c=='+' || c=='-')
+				state = FLOATEXPSIGN;
+			else if(isdigit(c))
+				state = FLOATEXP;
+			else
+				throw parse_error(buf);
+			break;
+
+		case FLOATEXPSIGN:
+			if(isdigit(c))
+				state = FLOATEXP;
+			else
+				throw parse_error(buf);
+			break;
+
+		case FLOATEXP:
+			if(!isdigit(c))
+				throw parse_error(buf);
+			break;
+
+		case STRING:
+			if(c=='\\')
+				state = STRING_ESCAPE;
+			else if(c=='"')
+				state = STRING_END;
+			break;
+
+		case STRING_ESCAPE:
+			state = STRING;
+			break;
+
+		case IDENTIFIER:
+			if(!isalpha(c))
+				throw parse_error(buf);
+			break;
+
+		case STRING_END:
+			// Reached only if the closing quote was not followed by a delimiter
+			throw parse_error(buf);
+
+		default:
+			throw logic_error("bad parser state");
+		}
+
+		if(is_delimiter(next) && state>=ACCEPT)
+		{
+			if(state==STRING_END)
+				return Token(Token::STRING, unescape(buf.substr(1, buf.size()-2)));
+			else
+				return Token(token_type[state], buf);
+		}
+	}
+}
+
+// Tells whether a character terminates the token that precedes it
+bool JsonParser::is_delimiter(int c)
+{
+	return (isspace(c) || c=='{' || c=='}' || c=='[' || c==']' || c==':' || c==',');
+}
+
+/* Resolves backslash escapes in the contents of a string literal and returns
+the result.  The u escape is converted to the character with the given
+four-digit hexadecimal code; surrogate pairs are not combined.  Throws
+invalid_argument on an unknown escape or a malformed u escape. */
+string JsonParser::unescape(const string &str)
+{
+	string result;
+	StringCodec::Utf8::Decoder dec;
+	StringCodec::Utf8::Encoder enc;
+	bool escape = false;
+
+	for(string::const_iterator i=str.begin(); i!=str.end(); )
+	{
+		StringCodec::unichar c = dec.decode_char(str, i);
+
+		if(escape)
+		{
+			if(c=='\"')
+				enc.encode_char('\"', result);
+			else if(c=='\\')
+				enc.encode_char('\\', result);
+			else if(c=='/')
+				enc.encode_char('/', result);
+			else if(c=='b')
+				enc.encode_char('\b', result);
+			else if(c=='f')
+				enc.encode_char('\f', result);
+			else if(c=='n')
+				enc.encode_char('\n', result);
+			else if(c=='r')
+				enc.encode_char('\r', result);
+			else if(c=='t')
+				enc.encode_char('\t', result);
+			else if(c=='u')
+			{
+				unsigned code = 0;
+				// Exactly four hexadecimal digits must follow
+				for(unsigned n=0; n<4; ++n)
+				{
+					if(i==str.end())
+						throw invalid_argument("JsonParser::unescape");
+
+					c = dec.decode_char(str, i);
+
+					unsigned digit = 0;
+					if(c>='0' && c<='9')
+						digit = c-'0';
+					else if(c>='a' && c<='f')
+						digit = c-'a'+10;
+					else if(c>='A' && c<='F')
+						digit = c-'A'+10;
+					else
+						throw invalid_argument("JsonParser::unescape");
+
+					code = (code<<4)+digit;
+				}
+
+				enc.encode_char(code, result);
+			}
+			else
+				throw invalid_argument("JsonParser::unescape");
+
+			escape = false;
+		}
+		else if(c=='\\')
+			escape = true;
+		else
+			enc.encode_char(c, result);
+	}
+
+	return result;
+}
+
+} // namespace DataFile
+} // namespace Msp
diff --git a/source/jsonparser.h b/source/jsonparser.h
new file mode 100644
index 0000000..611eea0
--- /dev/null
+++ b/source/jsonparser.h
@@ -0,0 +1,41 @@
+#ifndef MSP_DATAFILE_JSONPARSER_H_
+#define MSP_DATAFILE_JSONPARSER_H_
+
+#include "parsermode.h"
+#include "token.h"
+
+namespace Msp {
+namespace DataFile {
+
+/**
+Parser mode for JSON files.  Each element of the top-level array or object is
+returned as one Statement.
+*/
+class JsonParser: public ParserMode
+{
+private:
+	enum State
+	{
+		STATE_INIT,
+		STATE_ARRAY,
+		STATE_OBJECT,
+		STATE_END
+	};
+
+	State toplevel_state;
+
+public:
+	JsonParser(Input &, const std::string &);
+
+	virtual Statement parse();
+private:
+	Statement parse_statement(const Token *, State, const std::string &);
+	Token parse_token();
+	bool is_delimiter(int);
+	std::string unescape(const std::string &);
+};
+
+} // namespace DataFile
+} // namespace Msp
+
+#endif
diff --git a/source/parser.cpp b/source/parser.cpp
index 5972e2d..7e4d3dd 100644
--- a/source/parser.cpp
+++ b/source/parser.cpp
@@ -1,6 +1,7 @@
 #include <msp/strings/format.h>
 #include "binaryparser.h"
 #include "except.h"
+#include "jsonparser.h"
 #include "parser.h"
 #include "statement.h"
 #include "textparser.h"
@@ -14,9 +15,14 @@ Parser::Parser(IO::Base &i, const string &s):
 	in(i),
 	main_src(s),
 	src(s),
-	good(true),
-	mode(new TextParser(in, src))
-{ }
+	good(true)
+{
+	char c = in.peek();
+	if(c=='{' || c=='[')
+		mode = new JsonParser(in, src);
+	else
+		mode = new TextParser(in, src);
+}
 
 Parser::~Parser()
 {
-- 
2.45.2