// File indexing completed on 2024-05-12 15:43:24
0001 /* 0002 * This file is part of the KDE libraries 0003 * Copyright (C) 2012 Bernd Buschinski (b.buschinski@googlemail.com) 0004 * 0005 * This library is free software; you can redistribute it and/or 0006 * modify it under the terms of the GNU Library General Public 0007 * License as published by the Free Software Foundation; either 0008 * version 2 of the License, or (at your option) any later version. 0009 * 0010 * This library is distributed in the hope that it will be useful, 0011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 0012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0013 * Library General Public License for more details. 0014 * 0015 * You should have received a copy of the GNU Library General Public License 0016 * along with this library; see the file COPYING.LIB. If not, write to 0017 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0018 * Boston, MA 02110-1301, USA. 0019 * 0020 */ 0021 0022 #include "jsonlexer.h" 0023 0024 #include <stack> 0025 0026 #include "lexer.h" 0027 #include "object.h" 0028 0029 #include "wtf/Assertions.h" 0030 0031 // #define JSONLEXER_DEBUG_VERBOSE 0032 0033 namespace KJS 0034 { 0035 0036 using namespace JSONParserState; 0037 0038 static const unsigned short InvalidJSONUnicode = 0x001F; 0039 0040 static inline bool isDecimalDigit(const UChar &c) 0041 { 0042 return (c.uc >= '0' && c.uc <= '9'); 0043 } 0044 0045 static inline bool isHexDigit(const UChar &c) 0046 { 0047 return (isDecimalDigit(c) || 0048 (c.uc >= 'a' && c.uc <= 'f') || 0049 (c.uc >= 'A' && c.uc <= 'F')); 0050 } 0051 0052 static inline bool isJSONWhiteSpace(const UChar &c) 0053 { 0054 //ECMA Edition 5.1r6 - 15.12.1.1 - Syntax 0055 switch (c.uc) { 0056 case 0x0020: //SP 0057 case 0x0009: //TAB 0058 case 0x000A: //LF 0059 case 0x000D: //CR 0060 return true; 0061 default: 0062 return false; 0063 } 0064 } 0065 0066 #ifdef JSONLEXER_DEBUG_VERBOSE 0067 static inline UString tokenToString(TokenType type) 
0068 { 0069 switch (type) { 0070 case TokLBracket: return UString("TokLBracket"); 0071 case TokRBracket: return UString("TokRBracket"); 0072 case TokLBrace: return UString("TokLBrace"); 0073 case TokRBrace: return UString("TokRBrace"); 0074 case TokString: return UString("TokString"); 0075 case TokIdentifier: return UString("TokIdentifier"); 0076 case TokNumber: return UString("TokNumber"); 0077 case TokColon: return UString("TokColon"); 0078 case TokLParen: return UString("TokLParen"); 0079 case TokRParen: return UString("TokRParen"); 0080 case TokComma: return UString("TokComma"); 0081 case TokTrue: return UString("TokTrue"); 0082 case TokFalse: return UString("TokFalse"); 0083 case TokNull: return UString("TokNull"); 0084 case TokEnd: return UString("TokEnd"); 0085 case TokError: return UString("TokError"); 0086 } 0087 ASSERT_NOT_REACHED(); 0088 return UString("Default"); 0089 } 0090 0091 static inline UString parserStateToString(ParserState state) 0092 { 0093 switch (state) { 0094 case JSONValue: return UString("JSONValue"); 0095 case JSONObject: return UString("JSONObject"); 0096 case JSONArray: return UString("JSONArray"); 0097 } 0098 ASSERT_NOT_REACHED(); 0099 return UString("Default"); 0100 } 0101 #endif 0102 0103 // ------------------------------ JSONParser -------------------------------- 0104 0105 JSONParser::JSONParser(const UString &code) 0106 : m_lexer(code) 0107 { 0108 #ifdef JSONLEXER_DEBUG_VERBOSE 0109 fprintf(stderr, "=============== new JSONParser ===============\n%s\n===============\n", code.ascii()); 0110 #endif 0111 } 0112 0113 JSValue *JSONParser::tryParse(ExecState *exec) 0114 { 0115 JSValue *ret = parse(exec); 0116 // If the syntax is correct, we never see the EOF, the last used token may be '}'. 
0117 // But Syntax like "{} xyz" is also invalid, so we have to check if the next(last) token is EOF 0118 if (ret && nextParseIsEOF()) { 0119 return ret; 0120 } 0121 return nullptr; 0122 } 0123 0124 // helper function for adding a value to the object. 0125 // the arrayStack saves all added values and gives the correct array position. 0126 // This function will return false for NULL value or on exception. 0127 static inline bool addArrayItem(ExecState *exec, std::stack<JSValue *> *arrayStack, JSValue *value, JSObject *object) 0128 { 0129 if (exec->hadException()) { 0130 return false; 0131 } 0132 0133 if (!value) { 0134 return false; 0135 } 0136 0137 arrayStack->push(value); 0138 object->put(exec, arrayStack->size() - 1, value); 0139 return true; 0140 } 0141 0142 JSValue *JSONParser::parse(ExecState *exec, ParserState state) 0143 { 0144 if (exec->hadException()) { 0145 return nullptr; 0146 } 0147 0148 ParserState tState = state; 0149 TokenType type = m_lexer.next(); 0150 0151 JSObject *object = nullptr; 0152 std::stack<JSValue *> arrayObjectStack; 0153 UString propertyName; 0154 0155 // For parsing the Object, did we found a propertyName? 0156 // NOTE: empty propertynames are allowed. 
0157 bool havePropertyName = false; 0158 // For parsing the Object/Array, checks if we really added/found a propertyName 0159 // before we find the comma ',' 0160 bool propAdded = false; 0161 // For parsing the Array, remember if last found token is Comma 0162 bool lastFoundIsTokComma = false; 0163 0164 while (type != TokEnd && type != TokError) { 0165 #ifdef JSONLEXER_DEBUG_VERBOSE 0166 fprintf(stderr, "TokenType: %s \t State: %s\n", tokenToString(type).ascii(), parserStateToString(tState).ascii()); 0167 #endif 0168 0169 switch (tState) { 0170 case JSONValue: 0171 switch (type) { 0172 case TokLBracket: 0173 object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty())); 0174 tState = JSONArray; 0175 break; 0176 case TokLBrace: 0177 object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty())); 0178 tState = JSONObject; 0179 break; 0180 case TokString: 0181 return jsString(m_lexer.currentString()); 0182 case TokNull: 0183 return jsNull(); 0184 case TokTrue: 0185 return jsBoolean(true); 0186 case TokFalse: 0187 return jsBoolean(false); 0188 case TokNumber: 0189 return jsNumber(m_lexer.currentNumber()); 0190 default: 0191 // This can only happen on invalid syntax and with 0 return 0192 // we tell the caller we got a syntax error. 0193 0194 // ASSERT_NOT_REACHED(); 0195 return nullptr; 0196 } 0197 break; 0198 case JSONObject: { 0199 // if we got called from JSONArray-TokLBrace we did not create an object. 0200 0201 // In more detail for the following JSON String "[{}]" 0202 // If we are in parse with type=JSONArray and state=TokLBrace, 0203 // means we just found the "{" in the Array, and call parse(exec, JSONObject), 0204 // now in this call type=JSONObject, state=TokRBrace ("}") and, our new, object=0 (!) 0205 // We will finish the object and return it, but as object is null, we return 0. 0206 // which would be wrong, as empty objects are allowed. 
0207 // In this case we just report invalid data. 0208 // But for JSON String like "[{"a":1}]", we end up using object(0)->putDirect 0209 // and crash. 0210 0211 // In short, remove this line and we will crash. 0212 object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty())); 0213 switch (type) { 0214 case TokString: // PropertyName 0215 if (havePropertyName) { 0216 return nullptr; 0217 } 0218 propertyName = m_lexer.currentString(); 0219 havePropertyName = true; 0220 break; 0221 case TokColon: { 0222 if (!havePropertyName) { 0223 return nullptr; 0224 } 0225 JSValue *val = parse(exec, JSONValue); 0226 if (!val) { 0227 return nullptr; 0228 } 0229 0230 // use putDirect to by-pass __proto__ 0231 object->putDirect(Identifier(propertyName), val); 0232 propertyName = ""; 0233 havePropertyName = false; 0234 propAdded = true; 0235 break; 0236 } 0237 case TokRBrace: //Finish Object 0238 if (havePropertyName) { 0239 return nullptr; 0240 } 0241 return object; 0242 case TokComma: // Next Property 0243 if (!propAdded) { 0244 return nullptr; 0245 } 0246 propAdded = false; 0247 break; 0248 default: 0249 // This can only happen on invalid syntax and with 0 return 0250 // we tell the caller we got a syntax error. 0251 0252 // ASSERT_NOT_REACHED(); 0253 return nullptr; 0254 } 0255 break; 0256 } 0257 case JSONArray: { 0258 // if we got called from JSONArray-TokLBracket we did not create an object 0259 object = object ? 
object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty())); 0260 0261 // Check for invalid Array syntax, like ["1" "2"] 0262 switch (type) { 0263 case TokNumber: 0264 case TokString: 0265 case TokNull: 0266 case TokTrue: 0267 case TokFalse: 0268 case TokLBrace: 0269 case TokLBracket: 0270 if (propAdded) { 0271 return nullptr; 0272 } 0273 propAdded = true; 0274 lastFoundIsTokComma = false; 0275 break; 0276 default: 0277 break; 0278 } 0279 0280 switch (type) { 0281 case TokRBracket: // Finish array 0282 // Check for invalid syntax like "[1,]" 0283 if (lastFoundIsTokComma) { 0284 return nullptr; 0285 } 0286 return object; 0287 case TokNumber: 0288 if (!addArrayItem(exec, &arrayObjectStack, jsNumber(m_lexer.currentNumber()), object)) { 0289 return nullptr; 0290 } 0291 break; 0292 case TokString: 0293 if (!addArrayItem(exec, &arrayObjectStack, jsString(m_lexer.currentString()), object)) { 0294 return nullptr; 0295 } 0296 break; 0297 case TokNull: 0298 if (!addArrayItem(exec, &arrayObjectStack, jsNull(), object)) { 0299 return nullptr; 0300 } 0301 break; 0302 case TokTrue: 0303 if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(true), object)) { 0304 return nullptr; 0305 } 0306 break; 0307 case TokFalse: 0308 if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(false), object)) { 0309 return nullptr; 0310 } 0311 break; 0312 case TokLBrace: 0313 if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONObject), object)) { 0314 return nullptr; 0315 } 0316 break; 0317 case TokLBracket: 0318 if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONArray), object)) { 0319 return nullptr; 0320 } 0321 break; 0322 case TokComma: // Skip Comma and parse next Array Element 0323 // if we found a comma without a leading property, this is invalid syntax 0324 if (!propAdded) { 0325 return nullptr; 0326 } 0327 propAdded = false; 0328 lastFoundIsTokComma = true; 0329 break; 0330 default: 0331 // This can only happen on invalid 
syntax and with 0 return 0332 // we tell the caller we got a syntax error. 0333 0334 // ASSERT_NOT_REACHED(); 0335 return nullptr; 0336 } 0337 break; 0338 } 0339 default: 0340 ASSERT_NOT_REACHED(); 0341 return nullptr; 0342 } 0343 type = m_lexer.next(); 0344 } 0345 0346 if (type == TokError) { 0347 #ifdef JSONLEXER_DEBUG_VERBOSE 0348 fprintf(stderr, "WARNING: JSONParse ending with error!\n"); 0349 #endif 0350 return nullptr; 0351 } 0352 if (type == TokEnd) { 0353 #ifdef JSONLEXER_DEBUG_VERBOSE 0354 fprintf(stderr, "WARNING: JSONParse ending with unexpected END!\n"); 0355 #endif 0356 return nullptr; 0357 } 0358 ASSERT_NOT_REACHED(); 0359 return nullptr; 0360 } 0361 0362 // ------------------------------ JSONLexer -------------------------------- 0363 0364 JSONLexer::JSONLexer(const UString &code) 0365 : m_code(code), 0366 m_pos(0) 0367 { 0368 } 0369 0370 TokenType JSONLexer::current() 0371 { 0372 return m_type; 0373 } 0374 0375 double JSONLexer::currentNumber() const 0376 { 0377 ASSERT(m_type == TokNumber); 0378 return m_numberToken; 0379 } 0380 0381 UString JSONLexer::currentString() const 0382 { 0383 ASSERT(m_type == TokString); 0384 return m_stringToken; 0385 } 0386 0387 TokenType JSONLexer::lexString() 0388 { 0389 UString string; 0390 const int codeSize = m_code.size(); 0391 //skip first detected '"' 0392 ++m_pos; 0393 0394 if (m_pos >= codeSize) { 0395 m_type = TokError; 0396 return m_type; 0397 } 0398 0399 //while not at the end of the string '"' 0400 while (!(m_code[m_pos] == '"')) { 0401 UChar cur = m_code[m_pos]; 0402 if (cur == UChar('\\')) { 0403 ++m_pos; 0404 bool error = false; 0405 string.append(parseEscapeChar(&error)); 0406 if (error) { 0407 m_type = TokError; 0408 return m_type; 0409 } 0410 } else { 0411 if (cur.uc <= InvalidJSONUnicode) { 0412 m_type = TokError; 0413 return m_type; 0414 } 0415 string.append(cur); 0416 ++m_pos; 0417 } 0418 0419 if (m_pos >= codeSize) { 0420 m_type = TokError; 0421 return m_type; 0422 } 0423 } 0424 0425 m_type = 
TokString; 0426 m_stringToken = string; 0427 ++m_pos; 0428 #ifdef JSONLEXER_DEBUG_VERBOSE 0429 fprintf(stderr, "JSONLexer::lexString: Pos:%d stringlength:%d string:%s\n", m_pos, string.size(), string.ascii()); 0430 #endif 0431 return m_type; 0432 } 0433 0434 TokenType JSONLexer::lexNumber() 0435 { 0436 const int start = m_pos; 0437 const int codeSize = m_code.size(); 0438 0439 // -?(0 | [1-9][0-9]*) ('.' [0-9]+)? ([eE][+-]? [0-9]+)? 0440 0441 // -? 0442 if (m_pos < codeSize && m_code[m_pos] == '-') { 0443 ++m_pos; 0444 } 0445 0446 // (0 | [1-9][0-9]*) 0447 if (m_pos < codeSize && m_code[m_pos] == '0') { 0448 ++m_pos; 0449 } else if (m_pos < codeSize) { 0450 while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) { 0451 ++m_pos; 0452 } 0453 } else { 0454 m_type = TokError; 0455 return m_type; 0456 } 0457 0458 // ('.' [0-9]+)? 0459 if (m_pos < codeSize && m_code[m_pos] == '.') { 0460 ++m_pos; 0461 // [0-9]+ 0462 if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) { 0463 m_type = TokError; 0464 return m_type; 0465 } 0466 ++m_pos; 0467 0468 while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) { 0469 ++m_pos; 0470 } 0471 } 0472 0473 // ([eE][+-]? [0-9]+)? 0474 if (m_pos < codeSize && (m_code[m_pos] == 'e' || m_code[m_pos] == 'E')) { // [eE] 0475 ++m_pos; 0476 0477 // [-+]? 
0478 if (m_pos < codeSize && (m_code[m_pos] == '-' || m_code[m_pos] == '+')) { 0479 ++m_pos; 0480 } 0481 0482 // [0-9]+ 0483 if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) { 0484 m_type = TokError; 0485 return m_type; 0486 } 0487 0488 ++m_pos; 0489 while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) { 0490 ++m_pos; 0491 } 0492 } 0493 0494 m_numberToken = m_code.substr(start, m_pos - start).toDouble(false, false); 0495 m_type = TokNumber; 0496 #ifdef JSONLEXER_DEBUG_VERBOSE 0497 fprintf(stderr, "Number: %f\n", m_numberToken); 0498 #endif 0499 return m_type; 0500 } 0501 0502 UChar JSONLexer::parseEscapeChar(bool *error) 0503 { 0504 UChar cur = m_code[m_pos]; 0505 switch (cur.uc) { 0506 case '"': 0507 case '\\': 0508 case '/': 0509 ++m_pos; 0510 return cur; 0511 case 'b': 0512 ++m_pos; 0513 return UChar('\b'); 0514 case 'f': 0515 ++m_pos; 0516 return UChar('\f'); 0517 case 'n': 0518 ++m_pos; 0519 return UChar('\n'); 0520 case 'r': 0521 ++m_pos; 0522 return UChar('\r'); 0523 case 't': 0524 ++m_pos; 0525 return UChar('\t'); 0526 case 'u': { 0527 if ((m_code.size() - (m_pos + 1)) < 4) { 0528 *error = true; 0529 return UChar(' '); 0530 } 0531 if (!isHexDigit(m_code[m_pos + 1]) || !isHexDigit(m_code[m_pos + 2]) || 0532 !isHexDigit(m_code[m_pos + 3]) || !isHexDigit(m_code[m_pos + 4])) { 0533 *error = true; 0534 return UChar(' '); 0535 } 0536 0537 UChar next = Lexer::convertUnicode(m_code[m_pos + 1].uc, m_code[m_pos + 2].uc, m_code[m_pos + 3].uc, m_code[m_pos + 4].uc); 0538 0539 *error = false; 0540 m_pos += 5; 0541 return next; 0542 } 0543 default: 0544 *error = true; 0545 return UChar(' '); 0546 } 0547 } 0548 0549 //helper function, checks if "word" is in the "code" at "pos". 
0550 static inline bool isStringSequence(int pos, const UString &code, const UString &word) 0551 { 0552 const int wordSize = word.size(); 0553 if (pos + wordSize > code.size()) { 0554 return false; 0555 } 0556 0557 //Skip first, we already checked it 0558 for (int i = 1; i < wordSize; ++i) { 0559 if (code[pos + i].uc != word[i].uc) { 0560 return false; 0561 } 0562 } 0563 return true; 0564 } 0565 0566 TokenType JSONLexer::next() 0567 { 0568 while (true) { 0569 if (m_pos >= m_code.size()) { 0570 m_type = TokEnd; 0571 return m_type; 0572 } 0573 0574 if (!isJSONWhiteSpace(m_code[m_pos])) { 0575 break; 0576 } 0577 ++m_pos; 0578 } 0579 0580 m_type = TokError; 0581 0582 #ifdef JSONLEXER_DEBUG_VERBOSE 0583 fprintf(stderr, "JSONLexer::next current: %c \t\t pos: %d/%d\n", char(m_code[m_pos].uc), m_pos, m_code.size()); 0584 #endif 0585 0586 switch (m_code[m_pos].uc) { 0587 case '[': 0588 m_type = TokLBracket; 0589 ++m_pos; 0590 return m_type; 0591 case ']': 0592 m_type = TokRBracket; 0593 ++m_pos; 0594 return m_type; 0595 case '(': 0596 m_type = TokLParen; 0597 ++m_pos; 0598 return m_type; 0599 case ')': 0600 m_type = TokRParen; 0601 ++m_pos; 0602 return m_type; 0603 case '{': 0604 m_type = TokLBrace; 0605 ++m_pos; 0606 return m_type; 0607 case '}': 0608 m_type = TokRBrace; 0609 ++m_pos; 0610 return m_type; 0611 case ',': 0612 m_type = TokComma; 0613 ++m_pos; 0614 return m_type; 0615 case ':': 0616 m_type = TokColon; 0617 ++m_pos; 0618 return m_type; 0619 case '"': 0620 return lexString(); 0621 0622 case 't': 0623 if (isStringSequence(m_pos, m_code, "true")) { 0624 m_type = TokTrue; 0625 m_pos += 4; 0626 return m_type; 0627 } 0628 break; 0629 case 'f': 0630 if (isStringSequence(m_pos, m_code, "false")) { 0631 m_type = TokFalse; 0632 m_pos += 5; 0633 return m_type; 0634 } 0635 break; 0636 case 'n': 0637 if (isStringSequence(m_pos, m_code, "null")) { 0638 m_type = TokNull; 0639 m_pos += 4; 0640 return m_type; 0641 } 0642 break; 0643 case '-': 0644 case '0': 0645 case '1': 0646 
case '2': 0647 case '3': 0648 case '4': 0649 case '5': 0650 case '6': 0651 case '7': 0652 case '8': 0653 case '9': 0654 return lexNumber(); 0655 } 0656 return m_type; 0657 } 0658 0659 bool JSONParser::nextParseIsEOF() 0660 { 0661 return m_lexer.next() == TokEnd; 0662 } 0663 0664 } // namespace KJS