00001
00002
00003
00004
00005
00006 #include "lexer.h"
00007
00008 #include <string.h>
00009 #include <ctype.h>
00010 #include <stdlib.h>
00011 #include <assert.h>
00012
00013 #include "fityk.h"
00014 #include "common.h"
00015
00016 using namespace std;
00017 using fityk::SyntaxError;
00018
00019 string Lexer::get_string(const Token& token)
00020 {
00021 switch (token.type) {
00022 case kTokenString:
00023 return string(token.str+1, token.length - 2);
00024 case kTokenVarname:
00025 return string(token.str+1, token.length - 1);
00026 case kTokenFuncname:
00027 return string(token.str+1, token.length - 1);
00028 default:
00029
00030 return token.as_string();
00031 }
00032 }
00033
00034 const char* tokentype2str(TokenType tt)
00035 {
00036 switch (tt) {
00037 case kTokenLname: return "lower_case_name";
00038 case kTokenCname: return "CamelCaseName";
00039 case kTokenUletter: return "Upper-case-letter";
00040 case kTokenString: return "'quoted-string'";
00041 case kTokenVarname: return "$variable_name";
00042 case kTokenFuncname: return "%func_name";
00043 case kTokenNumber: return "number";
00044 case kTokenDataset: return "@dataset";
00045 case kTokenFilename: return "filename";
00046 case kTokenExpr: return "expr";
00047 case kTokenEVar: return "var-expr";
00048 case kTokenRest: return "rest-of-line";
00049
00050 case kTokenLE: return "<=";
00051 case kTokenGE: return ">=";
00052 case kTokenNE: return "!=";
00053 case kTokenEQ: return "==";
00054 case kTokenAppend: return ">>";
00055 case kTokenDots: return "..";
00056 case kTokenPlusMinus: return "+-";
00057 case kTokenAddAssign: return "+=";
00058 case kTokenSubAssign: return "-=";
00059
00060 case kTokenOpen: return "(";
00061 case kTokenClose: return ")";
00062 case kTokenLSquare: return "[";
00063 case kTokenRSquare: return "]";
00064 case kTokenLCurly: return "{";
00065 case kTokenRCurly: return "}";
00066 case kTokenPlus: return "+";
00067 case kTokenMinus: return "-";
00068 case kTokenMult: return "*";
00069 case kTokenDiv: return "/";
00070 case kTokenPower: return "^";
00071 case kTokenLT: return "<";
00072 case kTokenGT: return ">";
00073 case kTokenAssign: return "=";
00074 case kTokenComma: return ",";
00075 case kTokenSemicolon: return ";";
00076 case kTokenDot: return ".";
00077 case kTokenColon: return ":";
00078 case kTokenTilde: return "~";
00079 case kTokenQMark: return "?";
00080 case kTokenBang: return "!";
00081
00082 case kTokenNop: return "Nop";
00083 }
00084 return NULL;
00085 }
00086
00087 string token2str(const Token& token)
00088 {
00089 string s = tokentype2str(token.type);
00090 switch (token.type) {
00091 case kTokenString:
00092 case kTokenVarname:
00093 case kTokenFuncname:
00094 case kTokenLname:
00095 case kTokenCname:
00096 case kTokenUletter:
00097 case kTokenFilename:
00098 case kTokenRest:
00099 return s + " \"" + token.as_string() + "\"";
00100 case kTokenExpr:
00101 return s + " \"" + token.as_string() + "\" ("+S(token.value.d)+")";
00102 case kTokenEVar:
00103 return s + " \"" + token.as_string() + "\"";
00104 case kTokenNumber:
00105 return s + " " + S(token.value.d);
00106 case kTokenDataset:
00107 if (token.value.i == Lexer::kAll)
00108 return s + " '*'";
00109 else if (token.value.i == Lexer::kNew)
00110 return s + " '+'";
00111 else
00112 return s + " " + S(token.value.i);
00113 default:
00114 return s;
00115 }
00116 }
00117
00118 void Lexer::read_token(bool allow_glob)
00119 {
00120 tok_.str = cur_;
00121 while (isspace(*tok_.str))
00122 ++tok_.str;
00123 const char* ptr = tok_.str;
00124
00125 switch (*ptr) {
00126 case '\0':
00127 case '#':
00128 tok_.type = kTokenNop;
00129 break;
00130 case '\'': {
00131 tok_.type = kTokenString;
00132 const char* end = strchr(ptr + 1, '\'');
00133 if (end == NULL)
00134 throw SyntaxError("unfinished string");
00135 ptr = end + 1;
00136 break;
00137 }
00138 case '>':
00139 ++ptr;
00140 if (*ptr == '=') {
00141 tok_.type = kTokenGE;
00142 ++ptr;
00143 }
00144 else if (*ptr == '>') {
00145 tok_.type = kTokenAppend;
00146 ++ptr;
00147 }
00148 else
00149 tok_.type = kTokenGT;
00150 break;
00151 case '<':
00152 ++ptr;
00153 if (*ptr == '=') {
00154 tok_.type = kTokenLE;
00155 ++ptr;
00156 }
00157 else if (*ptr == '>') {
00158 tok_.type = kTokenNE;
00159 ++ptr;
00160 }
00161 else
00162 tok_.type = kTokenLT;
00163 break;
00164 case '=':
00165 ++ptr;
00166 if (*ptr == '=') {
00167 tok_.type = kTokenEQ;
00168 ++ptr;
00169 }
00170 else
00171 tok_.type = kTokenAssign;
00172 break;
00173 case '+':
00174 ++ptr;
00175 if (*ptr == '-') {
00176 tok_.type = kTokenPlusMinus;
00177 ++ptr;
00178 }
00179 else if (*ptr == '=') {
00180 tok_.type = kTokenAddAssign;
00181 ++ptr;
00182 }
00183 else
00184 tok_.type = kTokenPlus;
00185 break;
00186 case '-':
00187 ++ptr;
00188 if (*ptr == '=') {
00189 tok_.type = kTokenSubAssign;
00190 ++ptr;
00191 }
00192 else
00193 tok_.type = kTokenMinus;
00194 break;
00195
00196 case '!':
00197 ++ptr;
00198 if (*ptr == '=') {
00199 tok_.type = kTokenNE;
00200 ++ptr;
00201 }
00202 else
00203 tok_.type = kTokenBang;
00204 break;
00205
00206 case '.':
00207 ++ptr;
00208 if (isdigit(*ptr)) {
00209 char* endptr;
00210 tok_.value.d = strtod(ptr-1, &endptr);
00211 ptr = endptr;
00212 tok_.type = kTokenNumber;
00213 }
00214 else if (*ptr == '.') {
00215 ++ptr;
00216 if (*ptr == '.')
00217 ++ptr;
00218 tok_.type = kTokenDots;
00219 }
00220 else
00221 tok_.type = kTokenDot;
00222 break;
00223 case '@':
00224 ++ptr;
00225 tok_.type = kTokenDataset;
00226 if (*ptr == '*') {
00227 tok_.value.i = kAll;
00228 ++ptr;
00229 }
00230 else if (*ptr == '+') {
00231 tok_.value.i = kNew;
00232 ++ptr;
00233 }
00234 else if (isdigit(*ptr)) {
00235 char *endptr;
00236 tok_.value.i = strtol(ptr, &endptr, 10);
00237 ptr = endptr;
00238 }
00239 else
00240 throw SyntaxError("unexpected character after '@'");
00241 break;
00242 case '$':
00243 ++ptr;
00244
00245
00246
00247 if (! (isalpha(*ptr) || *ptr == '_' || *ptr == '*'))
00248 throw SyntaxError("unexpected character after '$'");
00249 tok_.type = kTokenVarname;
00250 while (isalnum(*ptr) || *ptr == '_' || (allow_glob && *ptr == '*'))
00251 ++ptr;
00252 break;
00253 case '%':
00254 ++ptr;
00255
00256 if (! (isalpha(*ptr) || *ptr == '_' || *ptr == '*'))
00257 throw SyntaxError("unexpected character after '%'");
00258 tok_.type = kTokenFuncname;
00259 while (isalnum(*ptr) || *ptr == '_' || (allow_glob && *ptr == '*'))
00260 ++ptr;
00261 break;
00262
00263 case '(': tok_.type = kTokenOpen; ++ptr; break;
00264 case ')': tok_.type = kTokenClose; ++ptr; break;
00265 case '[': tok_.type = kTokenLSquare; ++ptr; break;
00266 case ']': tok_.type = kTokenRSquare; ++ptr; break;
00267 case '{': tok_.type = kTokenLCurly; ++ptr; break;
00268 case '}': tok_.type = kTokenRCurly; ++ptr; break;
00269 case '*': tok_.type = kTokenMult; ++ptr; break;
00270 case '/': tok_.type = kTokenDiv; ++ptr; break;
00271 case '^': tok_.type = kTokenPower; ++ptr; break;
00272 case ',': tok_.type = kTokenComma; ++ptr; break;
00273 case ';': tok_.type = kTokenSemicolon; ++ptr; break;
00274 case ':': tok_.type = kTokenColon; ++ptr; break;
00275 case '~': tok_.type = kTokenTilde; ++ptr; break;
00276 case '?': tok_.type = kTokenQMark; ++ptr; break;
00277
00278 default:
00279 if (isdigit(*ptr)) {
00280 char* endptr;
00281 tok_.value.d = strtod(ptr, &endptr);
00282 ptr = endptr;
00283 tok_.type = kTokenNumber;
00284 }
00285 else if (isupper(*ptr)) {
00286 ++ptr;
00287 if (isalnum(*ptr)) {
00288 while (isalnum(*ptr))
00289 ++ptr;
00290 tok_.type = kTokenCname;
00291 }
00292 else
00293 tok_.type = kTokenUletter;
00294 }
00295 else if (isalpha(*ptr) || *ptr == '_') {
00296 while (isalnum(*ptr) || *ptr == '_')
00297 ++ptr;
00298 tok_.type = kTokenLname;
00299 }
00300 else
00301 throw SyntaxError("unexpected character: " + string(ptr, 1));
00302 }
00303 tok_.length = ptr - tok_.str;
00304 cur_ = ptr;
00305 }
00306
00307 Token Lexer::get_token()
00308 {
00309 if (!peeked_)
00310 read_token();
00311 peeked_ = false;
00312 return tok_;
00313 }
00314
00315 const Token& Lexer::peek_token()
00316 {
00317 if (!peeked_)
00318 read_token();
00319 peeked_ = true;
00320 return tok_;
00321 }
00322
00323 void Lexer::go_back(const Token& token)
00324 {
00325 cur_ = token.str;
00326 peeked_ = false;
00327 }
00328
00329 Token Lexer::get_glob_token()
00330 {
00331 if (peeked_) {
00332
00333 cur_ = tok_.str;
00334 peeked_ = false;
00335 }
00336 read_token(true);
00337 return tok_;
00338 }
00339
00340 Token Lexer::get_filename_token()
00341 {
00342 Token t = get_token();
00343 if (t.type == kTokenString || t.type == kTokenNop)
00344 return t;
00345 while (*cur_ != '\0' && !isspace(*cur_) && *cur_ != ';' && *cur_ != '#')
00346 ++cur_;
00347 t.type = kTokenFilename;
00348 t.length = cur_ - t.str;
00349 return t;
00350 }
00351
00352 Token Lexer::get_rest_of_line()
00353 {
00354 Token t = get_token();
00355 while (*cur_ != '\0')
00356 ++cur_;
00357 t.type = kTokenRest;
00358 t.length = cur_ - t.str;
00359 return t;
00360 }
00361
00362 Token Lexer::get_expected_token(const string& raw)
00363 {
00364 TokenType p = peek_token().type;
00365 string s = peek_token().as_string();
00366 if (s != raw) {
00367 string msg = "expected `" + raw + "'";
00368 throw_syntax_error(p == kTokenNop ? msg
00369 : msg + " instead of `" + s + "'");
00370 }
00371 return get_token();
00372 }
00373
00374 Token Lexer::get_expected_token(TokenType tt)
00375 {
00376 TokenType p = peek_token().type;
00377 if (p != tt) {
00378 string msg = S("expected ") + tokentype2str(tt);
00379 throw_syntax_error(p == kTokenNop ? msg
00380 : msg + " instead of " + tokentype2str(p));
00381 }
00382 return get_token();
00383 }
00384
00385 Token Lexer::get_expected_token(TokenType tt1, TokenType tt2)
00386 {
00387 TokenType p = peek_token().type;
00388 if (p != tt1 && p != tt2) {
00389 string msg = S("expected ") + tokentype2str(tt1)
00390 + " or " + tokentype2str(tt2);
00391 throw_syntax_error(p == kTokenNop ? msg
00392 : msg + " instead of " + tokentype2str(p));
00393 }
00394 return get_token();
00395 }
00396
00397 Token Lexer::get_expected_token(TokenType tt, const string& raw)
00398 {
00399 TokenType p = peek_token().type;
00400 string s = peek_token().as_string();
00401 if (p != tt && s != raw) {
00402 string msg = S("expected ") + tokentype2str(tt) + " or `" + raw + "'";
00403 throw_syntax_error(p == kTokenNop ? msg
00404 : msg + " instead of `" + s + "'");
00405 }
00406 return get_token();
00407 }
00408
00409 Token Lexer::get_expected_token(const string& raw1, const string& raw2)
00410 {
00411 TokenType p = peek_token().type;
00412 string s = peek_token().as_string();
00413 if (s != raw1 && s != raw2) {
00414 string msg = "expected `" + raw1 + "' or `" + raw2 + "'";
00415 throw_syntax_error(p == kTokenNop ? msg
00416 : msg + " instead of `" + s + "'");
00417 }
00418 return get_token();
00419 }
00420
00421 Token Lexer::get_token_if(TokenType tt)
00422 {
00423 if (peek_token().type == tt)
00424 return get_token();
00425 else {
00426 Token token;
00427 token.type = kTokenNop;
00428 token.str = cur_;
00429 token.length = 0;
00430 return token;
00431 }
00432 }
00433
00434 void Lexer::throw_syntax_error(const string& msg)
00435 {
00436 int pos = cur_ - input_;
00437 string s = S(pos);
00438 if (pos >= 10)
00439 s += ", near `" + string(cur_ - 10, cur_) + "'";
00440 throw SyntaxError("at " + s + ": " + msg);
00441 }
00442