source/textparser.cpp

   1 /* $Id$
   2
   3 This file is part of libmspdatafile
   4 Copyright © 2007-2008  Mikko Rasa, Mikkosoft Productions
   5 Distributed under the LGPL
   6 */
   7
   8 #include <msp/strings/formatter.h>
   9 #include <msp/strings/utils.h>
  10 #include "input.h"
  11 #include "textparser.h"
  12 #include "token.h"
  13
  14 using namespace std;
  15
  16 namespace Msp {
  17 namespace DataFile {
  18
  19 TextParser::TextParser(Input &i, const string &s):
  20         ParserMode(i, s)
  21 { }
  22
  23 Statement TextParser::parse()
  24 {
  25         return parse_statement(0);
  26 }
  27
  28 Statement TextParser::parse_statement(const Token *t)
  29 {
  30         Statement result;
  31         bool      sub = false;
  32         bool      finish = false;
  33
  34         while(in)
  35         {
  36                 Token token;
  37                 if(t)
  38                 {
  39                         token = *t;
  40                         t = 0;
  41                 }
  42                 else
  43                         token = parse_token();
  44
  45                 if(result.keyword.empty())
  46                 {
  47                         if(token.str.empty())
  48                                 break;
  49                         else if(token.type!=Token::IDENTIFIER)
  50                                 throw_at(ParseError(format("Syntax error at token '%s' (expected an identifier)", token.str)), get_location());
  51                         result.keyword = token.str;
  52                         result.valid = true;
  53                         result.source = src;
  54                         result.line = in.get_line_number();
  55                 }
  56                 else if(sub)
  57                 {
  58                         if(token.str=="}")
  59                         {
  60                                 sub = false;
  61                                 finish = true;
  62                         }
  63                         else
  64                         {
  65                                 Statement ss = parse_statement(&token);
  66                                 result.sub.push_back(ss);
  67                         }
  68                 }
  69                 else if(finish)
  70                 {
  71                         if(token.str!=";")
  72                                 throw_at(ParseError(format("Syntax error at token '%s' (Expected a ';')", token.str)), get_location());
  73                         break;
  74                 }
  75                 else if(token.str=="{")
  76                         sub = true;
  77                 else if(token.str==";")
  78                         break;
  79                 else if(token.type==Token::INTEGER)
  80                         result.args.push_back(Value(INTEGER, token.str));
  81                 else if(token.type==Token::FLOAT)
  82                         result.args.push_back(Value(FLOAT, token.str));
  83                 else if(token.type==Token::STRING)
  84                         result.args.push_back(Value(STRING, token.str));
  85                 else if(token.type==Token::IDENTIFIER)
  86                 {
  87                         if(token.str=="true")
  88                                 result.args.push_back(Value(BOOLEAN, "1"));
  89                         else if(token.str=="false")
  90                                 result.args.push_back(Value(BOOLEAN, "0"));
  91                         else
  92                                 result.args.push_back(Value(ENUM, token.str));
  93                         //result.args.push_back(resolve_identifiertoken.str);
  94                 }
  95                 else if(token.str=="")
  96                         throw_at(ParseError("Unexcepted end of input"), get_location());
  97                 else
  98                         throw_at(ParseError("Syntax error"), get_location());
  99         }
 100
 101         return result;
 102 }
 103
 104 Token TextParser::parse_token()
 105 {
 106         int c = 0;
 107         int comment = 0;
 108
 109         // Skip over comments and whitespace
 110         while(in && comment>=0)
 111         {
 112                 c = in.get();
 113                 int next = in.peek();
 114
 115                 if(c=='/' && next=='/')
 116                         comment = 1;
 117                 else if(c=='/' && next=='*')
 118                         comment = 2;
 119                 else if(c=='\n' && comment==1)
 120                         comment = 0;
 121                 else if(c=='*' && next=='/' && comment==2)
 122                         comment = 3;
 123                 else if(comment==3)   // Skip the second character of block comment end
 124                         comment = 0;
 125                 else if(!isspace(c) && !comment)
 126                         comment = -1;
 127         }
 128
 129         if(comment>0)  // EOF while in comment
 130                 throw_at(ParseError("Unfinished comment at end of input"), get_location());
 131         else if(comment==0)  // Didn't hit any non-whitespace
 132                 return Token(Token::SPECIAL, "");
 133
 134         enum ParseState
 135         {
 136                 INIT,
 137                 SIGN,
 138                 FLOATEXPINIT,
 139                 FLOATEXPSIGN,
 140                 STRING,
 141                 ACCEPT,
 142                 ZERO,
 143                 DECIMAL,
 144                 HEXADECIMAL,
 145                 OCTAL,
 146                 FLOAT,
 147                 FLOATEXP,
 148                 IDENTIFIER
 149         };
 150
 151         static Token::Type token_type[]=
 152         {
 153                 Token::SPECIAL,
 154                 Token::SPECIAL,
 155                 Token::SPECIAL,
 156                 Token::SPECIAL,
 157                 Token::STRING,
 158                 Token::SPECIAL,
 159                 Token::INTEGER,
 160                 Token::INTEGER,
 161                 Token::INTEGER,
 162                 Token::INTEGER,
 163                 Token::FLOAT,
 164                 Token::FLOAT,
 165                 Token::IDENTIFIER
 166         };
 167
 168         ParseState state = INIT;
 169         string     buf;
 170         bool       escape = false;
 171
 172         while(in || state==INIT)
 173         {
 174                 if(state!=INIT)
 175                         c = in.get();
 176                 int next = in.peek();
 177
 178                 buf += c;
 179
 180                 switch(state)
 181                 {
 182                 case INIT:
 183                         if(c=='0')
 184                                 state = ZERO;
 185                         else if(c=='-' || c=='+')
 186                                 state = SIGN;
 187                         else if(c=='.')
 188                                 state = FLOAT;
 189                         else if(c=='"')
 190                                 state = STRING;
 191                         else if(c=='{' || c=='}' || c==';')
 192                                 return Token(Token::SPECIAL, string(1, c));
 193                         else if(isdigit(c))
 194                                 state = DECIMAL;
 195                         else if(isalpha(c) || c=='_')
 196                                 state = IDENTIFIER;
 197                         else
 198                                 parse_error(c, "0-9A-Za-z_.\"{};+-");
 199                         break;
 200
 201                 case SIGN:
 202                         if(c=='0')
 203                                 state = ZERO;
 204                         else if(isdigit(c))
 205                                 state = DECIMAL;
 206                         else if(c=='.')
 207                                 state = FLOAT;
 208                         else
 209                                 parse_error(c, "0-9.");
 210                         break;
 211
 212                 case ZERO:
 213                         if(c=='x')
 214                                 state = HEXADECIMAL;
 215                         else if(isdigit(c))
 216                                 state = OCTAL;
 217                         else if(c=='.')
 218                                 state = FLOAT;
 219                         else
 220                                 parse_error(c, "0-9A-Fa-f.");
 221                         break;
 222
 223                 case DECIMAL:
 224                         if(c=='.')
 225                                 state = FLOAT;
 226                         else if(!isdigit(c))
 227                                 parse_error(c, "0-9.");
 228                         break;
 229
 230                 case HEXADECIMAL:
 231                         if(!isxdigit(c))
 232                                 parse_error(c, "0-9A-Fa-f");
 233                         break;
 234
 235                 case OCTAL:
 236                         if(!isodigit(c))
 237                                 parse_error(c, "0-7");
 238                         break;
 239
 240                 case FLOAT:
 241                         if(c=='e' || c=='E')
 242                                 state = FLOATEXPINIT;
 243                         else if(!isdigit(c))
 244                                 parse_error(c, "0-9Ee");
 245                         break;
 246
 247                 case FLOATEXPINIT:
 248                         if(c=='+' || c=='-')
 249                                 state = FLOATEXPSIGN;
 250                         else if(isdigit(c))
 251                                 state = FLOATEXP;
 252                         else
 253                                 parse_error(c, "0-9+-");
 254                         break;
 255
 256                 case FLOATEXPSIGN:
 257                         if(isdigit(c))
 258                                 state = FLOATEXP;
 259                         else
 260                                 parse_error(c, "0-9");
 261                         break;
 262
 263                 case FLOATEXP:
 264                         if(!isdigit(c))
 265                                 parse_error(c, "0-9");
 266                         break;
 267
 268                 case STRING:
 269                         if(c=='\\')
 270                                 escape = !escape;
 271                         else if(c=='"' && !escape)
 272                         {
 273                                 try
 274                                 {
 275                                         return Token(Token::STRING, c_unescape(buf.substr(1, buf.size()-2)));
 276                                 }
 277                                 catch(Exception &e)
 278                                 {
 279                                         e.at(get_location());
 280                                         throw;
 281                                 }
 282                         }
 283                         else
 284                                 escape = false;
 285                         break;
 286
 287                 case IDENTIFIER:
 288                         if(!isalpha(c) && !isdigit(c) && c!='_')
 289                                 parse_error(c, "0-9A-Za-z_");
 290                         break;
 291
 292                 default:
 293                         throw_at(InvalidState("Internal error (bad state)"), get_location());
 294                 }
 295
 296                 if(is_delimiter(next) && state>=ACCEPT)
 297                         return Token(token_type[state], buf);
 298         }
 299
 300         return Token(Token::SPECIAL, "");
 301 }
 302
 303 bool TextParser::is_delimiter(int c)
 304 {
 305         return (isspace(c) || c=='{' || c=='}' || c==';' || c=='/');
 306 }
 307
 308 bool TextParser::isodigit(int c)
 309 {
 310         return (c>='0' && c<='7');
 311 }
 312
 313 string TextParser::get_location()
 314 {
 315         ostringstream ss;
 316         ss<<src<<':'<<in.get_line_number();
 317         return ss.str();
 318 }
 319
 320 void TextParser::parse_error(int c, const char *e)
 321 {
 322         throw_at(ParseError(format("Parse error at '%c', expected one of \"%s\"", static_cast<char>(c), e)), get_location());
 323 }
 324
 325 } // namespace DataFile
 326 } // namespace Msp