source/textparser.cpp

   1 #include <msp/strings/format.h>
   2 #include <msp/strings/utils.h>
   3 #include "input.h"
   4 #include "textparser.h"
   5 #include "token.h"
   6
   7 using namespace std;
   8
   9 namespace Msp {
  10 namespace DataFile {
  11
  12 TextParser::TextParser(Input &i, const string &s):
  13         ParserMode(i, s)
  14 { }
  15
  16 Statement TextParser::parse()
  17 {
  18         return parse_statement(0);
  19 }
  20
  21 Statement TextParser::parse_statement(const Token *t)
  22 {
  23         Statement result;
  24         bool      sub = false;
  25         bool      finish = false;
  26
  27         while(in)
  28         {
  29                 Token token;
  30                 if(t)
  31                 {
  32                         token = *t;
  33                         t = 0;
  34                 }
  35                 else
  36                         token = parse_token();
  37
  38                 if(result.keyword.empty())
  39                 {
  40                         if(token.str.empty())
  41                                 break;
  42                         else if(token.type!=Token::IDENTIFIER)
  43                                 throw_at(ParseError(format("Syntax error at token '%s' (expected an identifier)", token.str)), get_location());
  44                         result.keyword = token.str;
  45                         result.valid = true;
  46                         result.source = src;
  47                         result.line = in.get_line_number();
  48                 }
  49                 else if(sub)
  50                 {
  51                         if(token.str=="}")
  52                         {
  53                                 sub = false;
  54                                 finish = true;
  55                         }
  56                         else
  57                         {
  58                                 Statement ss = parse_statement(&token);
  59                                 result.sub.push_back(ss);
  60                         }
  61                 }
  62                 else if(finish)
  63                 {
  64                         if(token.str!=";")
  65                                 throw_at(ParseError(format("Syntax error at token '%s' (Expected a ';')", token.str)), get_location());
  66                         break;
  67                 }
  68                 else if(token.str=="{")
  69                         sub = true;
  70                 else if(token.str==";")
  71                         break;
  72                 else if(token.type==Token::INTEGER)
  73                         result.append(lexical_cast<IntType::Store>(token.str));
  74                 else if(token.type==Token::FLOAT)
  75                         result.append(lexical_cast<FloatType::Store>(token.str));
  76                 else if(token.type==Token::STRING)
  77                         result.append(token.str);
  78                 else if(token.type==Token::IDENTIFIER)
  79                 {
  80                         if(token.str=="true")
  81                                 result.append(true);
  82                         else if(token.str=="false")
  83                                 result.append(false);
  84                         else
  85                                 result.append(Symbol(token.str));
  86                 }
  87                 else if(token.str=="")
  88                         throw_at(ParseError("Unexcepted end of input"), get_location());
  89                 else
  90                         throw_at(ParseError("Syntax error"), get_location());
  91         }
  92
  93         return result;
  94 }
  95
  96 Token TextParser::parse_token()
  97 {
  98         int c = 0;
  99         int comment = 0;
 100
 101         // Skip over comments and whitespace
 102         while(in && comment>=0)
 103         {
 104                 c = in.get();
 105                 int next = in.peek();
 106
 107                 if(c=='/' && next=='/')
 108                         comment = 1;
 109                 else if(c=='/' && next=='*')
 110                         comment = 2;
 111                 else if(c=='\n' && comment==1)
 112                         comment = 0;
 113                 else if(c=='*' && next=='/' && comment==2)
 114                         comment = 3;
 115                 else if(comment==3)   // Skip the second character of block comment end
 116                         comment = 0;
 117                 else if(c!=-1 && !isspace(c) && !comment)
 118                         comment = -1;
 119         }
 120
 121         if(comment>0)  // EOF while in comment
 122                 throw_at(ParseError("Unfinished comment at end of input"), get_location());
 123         else if(comment==0)  // Didn't hit any non-whitespace
 124                 return Token(Token::SPECIAL, "");
 125
 126         enum ParseState
 127         {
 128                 INIT,
 129                 SIGN,
 130                 FLOATEXPINIT,
 131                 FLOATEXPSIGN,
 132                 STRING,
 133                 ACCEPT,
 134                 ZERO,
 135                 DECIMAL,
 136                 HEXADECIMAL,
 137                 OCTAL,
 138                 FLOAT,
 139                 FLOATEXP,
 140                 STRING_END,
 141                 IDENTIFIER
 142         };
 143
 144         static Token::Type token_type[]=
 145         {
 146                 Token::SPECIAL,
 147                 Token::SPECIAL,
 148                 Token::SPECIAL,
 149                 Token::SPECIAL,
 150                 Token::SPECIAL,
 151                 Token::SPECIAL,
 152                 Token::INTEGER,
 153                 Token::INTEGER,
 154                 Token::INTEGER,
 155                 Token::INTEGER,
 156                 Token::FLOAT,
 157                 Token::FLOAT,
 158                 Token::STRING,
 159                 Token::IDENTIFIER
 160         };
 161
 162         ParseState state = INIT;
 163         string     buf;
 164         bool       escape = false;
 165
 166         while(in || state==INIT)
 167         {
 168                 if(state!=INIT)
 169                         c = in.get();
 170                 int next = in.peek();
 171
 172                 buf += c;
 173
 174                 switch(state)
 175                 {
 176                 case INIT:
 177                         if(c=='0')
 178                                 state = ZERO;
 179                         else if(c=='-' || c=='+')
 180                                 state = SIGN;
 181                         else if(c=='.')
 182                                 state = FLOAT;
 183                         else if(c=='"')
 184                                 state = STRING;
 185                         else if(c=='{' || c=='}' || c==';')
 186                                 return Token(Token::SPECIAL, string(1, c));
 187                         else if(isdigit(c))
 188                                 state = DECIMAL;
 189                         else if(isalpha(c) || c=='_' || c=='\\')
 190                                 state = IDENTIFIER;
 191                         else
 192                                 parse_error(c, "0-9A-Za-z_\\.\"{};+-");
 193                         break;
 194
 195                 case SIGN:
 196                         if(c=='0')
 197                                 state = ZERO;
 198                         else if(isdigit(c))
 199                                 state = DECIMAL;
 200                         else if(c=='.')
 201                                 state = FLOAT;
 202                         else
 203                                 parse_error(c, "0-9.");
 204                         break;
 205
 206                 case ZERO:
 207                         if(c=='x')
 208                                 state = HEXADECIMAL;
 209                         else if(isdigit(c))
 210                                 state = OCTAL;
 211                         else if(c=='.')
 212                                 state = FLOAT;
 213                         else
 214                                 parse_error(c, "0-9A-Fa-f.");
 215                         break;
 216
 217                 case DECIMAL:
 218                         if(c=='.')
 219                                 state = FLOAT;
 220                         else if(!isdigit(c))
 221                                 parse_error(c, "0-9.");
 222                         break;
 223
 224                 case HEXADECIMAL:
 225                         if(!isxdigit(c))
 226                                 parse_error(c, "0-9A-Fa-f");
 227                         break;
 228
 229                 case OCTAL:
 230                         if(!isodigit(c))
 231                                 parse_error(c, "0-7");
 232                         break;
 233
 234                 case FLOAT:
 235                         if(c=='e' || c=='E')
 236                                 state = FLOATEXPINIT;
 237                         else if(!isdigit(c))
 238                                 parse_error(c, "0-9Ee");
 239                         break;
 240
 241                 case FLOATEXPINIT:
 242                         if(c=='+' || c=='-')
 243                                 state = FLOATEXPSIGN;
 244                         else if(isdigit(c))
 245                                 state = FLOATEXP;
 246                         else
 247                                 parse_error(c, "0-9+-");
 248                         break;
 249
 250                 case FLOATEXPSIGN:
 251                         if(isdigit(c))
 252                                 state = FLOATEXP;
 253                         else
 254                                 parse_error(c, "0-9");
 255                         break;
 256
 257                 case FLOATEXP:
 258                         if(!isdigit(c))
 259                                 parse_error(c, "0-9");
 260                         break;
 261
 262                 case STRING:
 263                         if(c=='\\')
 264                                 escape = !escape;
 265                         else if(c=='"' && !escape)
 266                                 state = STRING_END;
 267                         else
 268                                 escape = false;
 269                         break;
 270
 271                 case IDENTIFIER:
 272                         if(!isalpha(c) && !isdigit(c) && c!='_' && c!='-' && c!='/')
 273                                 parse_error(c, "0-9A-Za-z_/-");
 274                         break;
 275
 276                 case STRING_END:
 277                         throw_at(ParseError("Garbage after string"), get_location());
 278
 279                 default:
 280                         throw_at(InvalidState("Internal error (bad state)"), get_location());
 281                 }
 282
 283                 if(is_delimiter(next) && state>=ACCEPT)
 284                 {
 285                         if(state==IDENTIFIER && buf[0]=='\\')
 286                                 return Token(Token::IDENTIFIER, buf.substr(1));
 287                         else if(state==STRING_END)
 288                         {
 289                                 try
 290                                 {
 291                                         return Token(Token::STRING, c_unescape(buf.substr(1, buf.size()-2)));
 292                                 }
 293                                 catch(Exception &e)
 294                                 {
 295                                         e.at(get_location());
 296                                         throw;
 297                                 }
 298                         }
 299                         else
 300                                 return Token(token_type[state], buf);
 301                 }
 302         }
 303
 304         return Token(Token::SPECIAL, "");
 305 }
 306
 307 bool TextParser::is_delimiter(int c)
 308 {
 309         return (isspace(c) || c=='{' || c=='}' || c==';' || c=='/');
 310 }
 311
 312 bool TextParser::isodigit(int c)
 313 {
 314         return (c>='0' && c<='7');
 315 }
 316
 317 string TextParser::get_location()
 318 {
 319         ostringstream ss;
 320         ss<<src<<':'<<in.get_line_number();
 321         return ss.str();
 322 }
 323
 324 void TextParser::parse_error(int c, const char *e)
 325 {
 326         throw_at(ParseError(format("Parse error at '%c', expected one of \"%s\"", static_cast<char>(c), e)), get_location());
 327 }
 328
 329 } // namespace DataFile
 330 } // namespace Msp