source/textparser.cpp

   1 /* $Id$
   2
   3 This file is part of libmspdatafile
   4 Copyright © 2007-2008, 2010  Mikko Rasa, Mikkosoft Productions
   5 Distributed under the LGPL
   6 */
   7
   8 #include <msp/strings/formatter.h>
   9 #include <msp/strings/utils.h>
  10 #include "input.h"
  11 #include "textparser.h"
  12 #include "token.h"
  13
  14 using namespace std;
  15
  16 namespace Msp {
  17 namespace DataFile {
  18
  19 TextParser::TextParser(Input &i, const string &s):
  20         ParserMode(i, s)
  21 { }
  22
  23 Statement TextParser::parse()
  24 {
  25         return parse_statement(0);
  26 }
  27
  28 Statement TextParser::parse_statement(const Token *t)
  29 {
  30         Statement result;
  31         bool      sub = false;
  32         bool      finish = false;
  33
  34         while(in)
  35         {
  36                 Token token;
  37                 if(t)
  38                 {
  39                         token = *t;
  40                         t = 0;
  41                 }
  42                 else
  43                         token = parse_token();
  44
  45                 if(result.keyword.empty())
  46                 {
  47                         if(token.str.empty())
  48                                 break;
  49                         else if(token.type!=Token::IDENTIFIER)
  50                                 throw_at(ParseError(format("Syntax error at token '%s' (expected an identifier)", token.str)), get_location());
  51                         result.keyword = token.str;
  52                         result.valid = true;
  53                         result.source = src;
  54                         result.line = in.get_line_number();
  55                 }
  56                 else if(sub)
  57                 {
  58                         if(token.str=="}")
  59                         {
  60                                 sub = false;
  61                                 finish = true;
  62                         }
  63                         else
  64                         {
  65                                 Statement ss = parse_statement(&token);
  66                                 result.sub.push_back(ss);
  67                         }
  68                 }
  69                 else if(finish)
  70                 {
  71                         if(token.str!=";")
  72                                 throw_at(ParseError(format("Syntax error at token '%s' (Expected a ';')", token.str)), get_location());
  73                         break;
  74                 }
  75                 else if(token.str=="{")
  76                         sub = true;
  77                 else if(token.str==";")
  78                         break;
  79                 else if(token.type==Token::INTEGER)
  80                         result.append(lexical_cast<IntType::Store>(token.str));
  81                 else if(token.type==Token::FLOAT)
  82                         result.append(lexical_cast<FloatType::Store>(token.str));
  83                 else if(token.type==Token::STRING)
  84                         result.append(token.str);
  85                 else if(token.type==Token::IDENTIFIER)
  86                 {
  87                         if(token.str=="true")
  88                                 result.append(true);
  89                         else if(token.str=="false")
  90                                 result.append(false);
  91                         else
  92                                 result.append(Symbol(token.str));
  93                 }
  94                 else if(token.str=="")
  95                         throw_at(ParseError("Unexcepted end of input"), get_location());
  96                 else
  97                         throw_at(ParseError("Syntax error"), get_location());
  98         }
  99
 100         return result;
 101 }
 102
 103 Token TextParser::parse_token()
 104 {
 105         int c = 0;
 106         int comment = 0;
 107
 108         // Skip over comments and whitespace
 109         while(in && comment>=0)
 110         {
 111                 c = in.get();
 112                 int next = in.peek();
 113
 114                 if(c=='/' && next=='/')
 115                         comment = 1;
 116                 else if(c=='/' && next=='*')
 117                         comment = 2;
 118                 else if(c=='\n' && comment==1)
 119                         comment = 0;
 120                 else if(c=='*' && next=='/' && comment==2)
 121                         comment = 3;
 122                 else if(comment==3)   // Skip the second character of block comment end
 123                         comment = 0;
 124                 else if(c!=-1 && !isspace(c) && !comment)
 125                         comment = -1;
 126         }
 127
 128         if(comment>0)  // EOF while in comment
 129                 throw_at(ParseError("Unfinished comment at end of input"), get_location());
 130         else if(comment==0)  // Didn't hit any non-whitespace
 131                 return Token(Token::SPECIAL, "");
 132
 133         enum ParseState
 134         {
 135                 INIT,
 136                 SIGN,
 137                 FLOATEXPINIT,
 138                 FLOATEXPSIGN,
 139                 STRING,
 140                 ACCEPT,
 141                 ZERO,
 142                 DECIMAL,
 143                 HEXADECIMAL,
 144                 OCTAL,
 145                 FLOAT,
 146                 FLOATEXP,
 147                 IDENTIFIER
 148         };
 149
 150         static Token::Type token_type[]=
 151         {
 152                 Token::SPECIAL,
 153                 Token::SPECIAL,
 154                 Token::SPECIAL,
 155                 Token::SPECIAL,
 156                 Token::STRING,
 157                 Token::SPECIAL,
 158                 Token::INTEGER,
 159                 Token::INTEGER,
 160                 Token::INTEGER,
 161                 Token::INTEGER,
 162                 Token::FLOAT,
 163                 Token::FLOAT,
 164                 Token::IDENTIFIER
 165         };
 166
 167         ParseState state = INIT;
 168         string     buf;
 169         bool       escape = false;
 170
 171         while(in || state==INIT)
 172         {
 173                 if(state!=INIT)
 174                         c = in.get();
 175                 int next = in.peek();
 176
 177                 buf += c;
 178
 179                 switch(state)
 180                 {
 181                 case INIT:
 182                         if(c=='0')
 183                                 state = ZERO;
 184                         else if(c=='-' || c=='+')
 185                                 state = SIGN;
 186                         else if(c=='.')
 187                                 state = FLOAT;
 188                         else if(c=='"')
 189                                 state = STRING;
 190                         else if(c=='{' || c=='}' || c==';')
 191                                 return Token(Token::SPECIAL, string(1, c));
 192                         else if(isdigit(c))
 193                                 state = DECIMAL;
 194                         else if(isalpha(c) || c=='_')
 195                                 state = IDENTIFIER;
 196                         else
 197                                 parse_error(c, "0-9A-Za-z_.\"{};+-");
 198                         break;
 199
 200                 case SIGN:
 201                         if(c=='0')
 202                                 state = ZERO;
 203                         else if(isdigit(c))
 204                                 state = DECIMAL;
 205                         else if(c=='.')
 206                                 state = FLOAT;
 207                         else
 208                                 parse_error(c, "0-9.");
 209                         break;
 210
 211                 case ZERO:
 212                         if(c=='x')
 213                                 state = HEXADECIMAL;
 214                         else if(isdigit(c))
 215                                 state = OCTAL;
 216                         else if(c=='.')
 217                                 state = FLOAT;
 218                         else
 219                                 parse_error(c, "0-9A-Fa-f.");
 220                         break;
 221
 222                 case DECIMAL:
 223                         if(c=='.')
 224                                 state = FLOAT;
 225                         else if(!isdigit(c))
 226                                 parse_error(c, "0-9.");
 227                         break;
 228
 229                 case HEXADECIMAL:
 230                         if(!isxdigit(c))
 231                                 parse_error(c, "0-9A-Fa-f");
 232                         break;
 233
 234                 case OCTAL:
 235                         if(!isodigit(c))
 236                                 parse_error(c, "0-7");
 237                         break;
 238
 239                 case FLOAT:
 240                         if(c=='e' || c=='E')
 241                                 state = FLOATEXPINIT;
 242                         else if(!isdigit(c))
 243                                 parse_error(c, "0-9Ee");
 244                         break;
 245
 246                 case FLOATEXPINIT:
 247                         if(c=='+' || c=='-')
 248                                 state = FLOATEXPSIGN;
 249                         else if(isdigit(c))
 250                                 state = FLOATEXP;
 251                         else
 252                                 parse_error(c, "0-9+-");
 253                         break;
 254
 255                 case FLOATEXPSIGN:
 256                         if(isdigit(c))
 257                                 state = FLOATEXP;
 258                         else
 259                                 parse_error(c, "0-9");
 260                         break;
 261
 262                 case FLOATEXP:
 263                         if(!isdigit(c))
 264                                 parse_error(c, "0-9");
 265                         break;
 266
 267                 case STRING:
 268                         if(c=='\\')
 269                                 escape = !escape;
 270                         else if(c=='"' && !escape)
 271                         {
 272                                 try
 273                                 {
 274                                         return Token(Token::STRING, c_unescape(buf.substr(1, buf.size()-2)));
 275                                 }
 276                                 catch(Exception &e)
 277                                 {
 278                                         e.at(get_location());
 279                                         throw;
 280                                 }
 281                         }
 282                         else
 283                                 escape = false;
 284                         break;
 285
 286                 case IDENTIFIER:
 287                         if(!isalpha(c) && !isdigit(c) && c!='_')
 288                                 parse_error(c, "0-9A-Za-z_");
 289                         break;
 290
 291                 default:
 292                         throw_at(InvalidState("Internal error (bad state)"), get_location());
 293                 }
 294
 295                 if(is_delimiter(next) && state>=ACCEPT)
 296                         return Token(token_type[state], buf);
 297         }
 298
 299         return Token(Token::SPECIAL, "");
 300 }
 301
 302 bool TextParser::is_delimiter(int c)
 303 {
 304         return (isspace(c) || c=='{' || c=='}' || c==';' || c=='/');
 305 }
 306
 307 bool TextParser::isodigit(int c)
 308 {
 309         return (c>='0' && c<='7');
 310 }
 311
 312 string TextParser::get_location()
 313 {
 314         ostringstream ss;
 315         ss<<src<<':'<<in.get_line_number();
 316         return ss.str();
 317 }
 318
 319 void TextParser::parse_error(int c, const char *e)
 320 {
 321         throw_at(ParseError(format("Parse error at '%c', expected one of \"%s\"", static_cast<char>(c), e)), get_location());
 322 }
 323
 324 } // namespace DataFile
 325 } // namespace Msp