src/kjs/jsonlexer.cpp

0001 /*
0002  *  This file is part of the KDE libraries
0003  *  Copyright (C) 2012 Bernd Buschinski (b.buschinski@googlemail.com)
0004  *
0005  *  This library is free software; you can redistribute it and/or
0006  *  modify it under the terms of the GNU Library General Public
0007  *  License as published by the Free Software Foundation; either
0008  *  version 2 of the License, or (at your option) any later version.
0009  *
0010  *  This library is distributed in the hope that it will be useful,
0011  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
0012  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0013  *  Library General Public License for more details.
0014  *
0015  *  You should have received a copy of the GNU Library General Public License
0016  *  along with this library; see the file COPYING.LIB.  If not, write to
0017  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0018  *  Boston, MA 02110-1301, USA.
0019  *
0020  */
0021
0022 #include "jsonlexer.h"
0023
0024 #include <stack>
0025
0026 #include "lexer.h"
0027 #include "object.h"
0028
0029 #include "wtf/Assertions.h"
0030
0031 // #define JSONLEXER_DEBUG_VERBOSE
0032
0033 namespace KJS
0034 {
0035
0036 using namespace JSONParserState;
0037
// Upper bound of the code units that lexString() refuses to accept unescaped:
// any raw character with a value <= 0x001F makes the string token invalid.
static const unsigned short InvalidJSONUnicode = 0x001F;
0039
0040 static inline bool isDecimalDigit(const UChar &c)
0041 {
0042     return (c.uc >= '0' && c.uc <= '9');
0043 }
0044
0045 static inline bool isHexDigit(const UChar &c)
0046 {
0047     return (isDecimalDigit(c) ||
0048             (c.uc >= 'a' && c.uc <= 'f') ||
0049             (c.uc >= 'A' && c.uc <= 'F'));
0050 }
0051
0052 static inline bool isJSONWhiteSpace(const UChar &c)
0053 {
0054     //ECMA Edition 5.1r6 - 15.12.1.1 - Syntax
0055     switch (c.uc) {
0056     case 0x0020: //SP
0057     case 0x0009: //TAB
0058     case 0x000A: //LF
0059     case 0x000D: //CR
0060         return true;
0061     default:
0062         return false;
0063     }
0064 }
0065
0066 #ifdef JSONLEXER_DEBUG_VERBOSE
0067 static inline UString tokenToString(TokenType type)
0068 {
0069     switch (type) {
0070     case TokLBracket: return UString("TokLBracket");
0071     case TokRBracket: return UString("TokRBracket");
0072     case TokLBrace: return UString("TokLBrace");
0073     case TokRBrace: return UString("TokRBrace");
0074     case TokString: return UString("TokString");
0075     case TokIdentifier: return UString("TokIdentifier");
0076     case TokNumber: return UString("TokNumber");
0077     case TokColon: return UString("TokColon");
0078     case TokLParen: return UString("TokLParen");
0079     case TokRParen: return UString("TokRParen");
0080     case TokComma: return UString("TokComma");
0081     case TokTrue: return UString("TokTrue");
0082     case TokFalse: return UString("TokFalse");
0083     case TokNull: return UString("TokNull");
0084     case TokEnd: return UString("TokEnd");
0085     case TokError:  return UString("TokError");
0086     }
0087     ASSERT_NOT_REACHED();
0088     return UString("Default");
0089 }
0090
0091 static inline UString parserStateToString(ParserState state)
0092 {
0093     switch (state) {
0094     case JSONValue: return UString("JSONValue");
0095     case JSONObject: return UString("JSONObject");
0096     case JSONArray: return UString("JSONArray");
0097     }
0098     ASSERT_NOT_REACHED();
0099     return UString("Default");
0100 }
0101 #endif
0102
0103 // ------------------------------ JSONParser --------------------------------
0104
// Creates a parser whose lexer reads its input from `code`.
// When JSONLEXER_DEBUG_VERBOSE is defined, the input text is also dumped to stderr.
JSONParser::JSONParser(const UString &code)
    : m_lexer(code)
{
#ifdef JSONLEXER_DEBUG_VERBOSE
    fprintf(stderr, "=============== new JSONParser ===============\n%s\n===============\n", code.ascii());
#endif
}
0112
0113 JSValue *JSONParser::tryParse(ExecState *exec)
0114 {
0115     JSValue *ret = parse(exec);
0116     // If the syntax is correct, we never see the EOF, the last used token may be '}'.
0117     // But Syntax like "{} xyz" is also invalid, so we have to check if the next(last) token is EOF
0118     if (ret && nextParseIsEOF()) {
0119         return ret;
0120     }
0121     return nullptr;
0122 }
0123
0124 // helper function for adding a value to the object.
0125 // the arrayStack saves all added values and gives the correct array position.
0126 // This function will return false for NULL value or on exception.
0127 static inline bool addArrayItem(ExecState *exec, std::stack<JSValue *> *arrayStack, JSValue *value, JSObject *object)
0128 {
0129     if (exec->hadException()) {
0130         return false;
0131     }
0132
0133     if (!value) {
0134         return false;
0135     }
0136
0137     arrayStack->push(value);
0138     object->put(exec, arrayStack->size() - 1, value);
0139     return true;
0140 }
0141
0142 JSValue *JSONParser::parse(ExecState *exec, ParserState state)
0143 {
0144     if (exec->hadException()) {
0145         return nullptr;
0146     }
0147
0148     ParserState tState = state;
0149     TokenType type = m_lexer.next();
0150
0151     JSObject *object = nullptr;
0152     std::stack<JSValue *> arrayObjectStack;
0153     UString propertyName;
0154
0155     // For parsing the Object, did we found a propertyName?
0156     // NOTE: empty propertynames are allowed.
0157     bool havePropertyName = false;
0158     // For parsing the Object/Array, checks if we really added/found a propertyName
0159     // before we find the comma ','
0160     bool propAdded = false;
0161     // For parsing the Array, remember if last found token is Comma
0162     bool lastFoundIsTokComma = false;
0163
0164     while (type != TokEnd && type != TokError) {
0165 #ifdef JSONLEXER_DEBUG_VERBOSE
0166         fprintf(stderr, "TokenType: %s \t State: %s\n", tokenToString(type).ascii(), parserStateToString(tState).ascii());
0167 #endif
0168
0169         switch (tState) {
0170         case JSONValue:
0171             switch (type) {
0172             case TokLBracket:
0173                 object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty()));
0174                 tState = JSONArray;
0175                 break;
0176             case TokLBrace:
0177                 object = static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty()));
0178                 tState = JSONObject;
0179                 break;
0180             case TokString:
0181                 return jsString(m_lexer.currentString());
0182             case TokNull:
0183                 return jsNull();
0184             case TokTrue:
0185                 return jsBoolean(true);
0186             case TokFalse:
0187                 return jsBoolean(false);
0188             case TokNumber:
0189                 return jsNumber(m_lexer.currentNumber());
0190             default:
0191                 // This can only happen on invalid syntax and with 0 return
0192                 // we tell the caller we got a syntax error.
0193
0194                 // ASSERT_NOT_REACHED();
0195                 return nullptr;
0196             }
0197             break;
0198         case JSONObject: {
0199             // if we got called from JSONArray-TokLBrace we did not create an object.
0200
0201             // In more detail for the following JSON String "[{}]"
0202             // If we are in parse with type=JSONArray and state=TokLBrace,
0203             // means we just found the "{" in the Array, and call parse(exec, JSONObject),
0204             // now in this call type=JSONObject, state=TokRBrace ("}") and, our new, object=0 (!)
0205             // We will finish the object and return it, but as object is null, we return 0.
0206             // which would be wrong, as empty objects are allowed.
0207             // In this case we just report invalid data.
0208             // But for JSON String like "[{"a":1}]", we end up using object(0)->putDirect
0209             // and crash.
0210
0211             // In short, remove this line and we will crash.
0212             object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinObject()->construct(exec, List::empty()));
0213             switch (type) {
0214             case TokString: // PropertyName
0215                 if (havePropertyName) {
0216                     return nullptr;
0217                 }
0218                 propertyName = m_lexer.currentString();
0219                 havePropertyName = true;
0220                 break;
0221             case TokColon: {
0222                 if (!havePropertyName) {
0223                     return nullptr;
0224                 }
0225                 JSValue *val = parse(exec, JSONValue);
0226                 if (!val) {
0227                     return nullptr;
0228                 }
0229
0230                 // use putDirect to by-pass __proto__
0231                 object->putDirect(Identifier(propertyName), val);
0232                 propertyName = "";
0233                 havePropertyName = false;
0234                 propAdded = true;
0235                 break;
0236             }
0237             case TokRBrace: //Finish Object
0238                 if (havePropertyName) {
0239                     return nullptr;
0240                 }
0241                 return object;
0242             case TokComma: // Next Property
0243                 if (!propAdded) {
0244                     return nullptr;
0245                 }
0246                 propAdded = false;
0247                 break;
0248             default:
0249                 // This can only happen on invalid syntax and with 0 return
0250                 // we tell the caller we got a syntax error.
0251
0252                 // ASSERT_NOT_REACHED();
0253                 return nullptr;
0254             }
0255             break;
0256         }
0257         case JSONArray: {
0258             // if we got called from JSONArray-TokLBracket we did not create an object
0259             object = object ? object : static_cast<JSObject *>(exec->lexicalInterpreter()->builtinArray()->construct(exec, List::empty()));
0260
0261             // Check for invalid Array syntax, like ["1" "2"]
0262             switch (type) {
0263             case TokNumber:
0264             case TokString:
0265             case TokNull:
0266             case TokTrue:
0267             case TokFalse:
0268             case TokLBrace:
0269             case TokLBracket:
0270                 if (propAdded) {
0271                     return nullptr;
0272                 }
0273                 propAdded = true;
0274                 lastFoundIsTokComma = false;
0275                 break;
0276             default:
0277                 break;
0278             }
0279
0280             switch (type) {
0281             case TokRBracket: // Finish array
0282                 // Check for invalid syntax like "[1,]"
0283                 if (lastFoundIsTokComma) {
0284                     return nullptr;
0285                 }
0286                 return object;
0287             case TokNumber:
0288                 if (!addArrayItem(exec, &arrayObjectStack, jsNumber(m_lexer.currentNumber()), object)) {
0289                     return nullptr;
0290                 }
0291                 break;
0292             case TokString:
0293                 if (!addArrayItem(exec, &arrayObjectStack, jsString(m_lexer.currentString()), object)) {
0294                     return nullptr;
0295                 }
0296                 break;
0297             case TokNull:
0298                 if (!addArrayItem(exec, &arrayObjectStack, jsNull(), object)) {
0299                     return nullptr;
0300                 }
0301                 break;
0302             case TokTrue:
0303                 if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(true), object)) {
0304                     return nullptr;
0305                 }
0306                 break;
0307             case TokFalse:
0308                 if (!addArrayItem(exec, &arrayObjectStack, jsBoolean(false), object)) {
0309                     return nullptr;
0310                 }
0311                 break;
0312             case TokLBrace:
0313                 if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONObject), object)) {
0314                     return nullptr;
0315                 }
0316                 break;
0317             case TokLBracket:
0318                 if (!addArrayItem(exec, &arrayObjectStack, parse(exec, JSONArray), object)) {
0319                     return nullptr;
0320                 }
0321                 break;
0322             case TokComma: // Skip Comma and parse next Array Element
0323                 // if we found a comma without a leading property, this is invalid syntax
0324                 if (!propAdded) {
0325                     return nullptr;
0326                 }
0327                 propAdded = false;
0328                 lastFoundIsTokComma = true;
0329                 break;
0330             default:
0331                 // This can only happen on invalid syntax and with 0 return
0332                 // we tell the caller we got a syntax error.
0333
0334                 // ASSERT_NOT_REACHED();
0335                 return nullptr;
0336             }
0337             break;
0338         }
0339         default:
0340             ASSERT_NOT_REACHED();
0341             return nullptr;
0342         }
0343         type = m_lexer.next();
0344     }
0345
0346     if (type == TokError) {
0347 #ifdef JSONLEXER_DEBUG_VERBOSE
0348         fprintf(stderr, "WARNING: JSONParse ending with error!\n");
0349 #endif
0350         return nullptr;
0351     }
0352     if (type == TokEnd) {
0353 #ifdef JSONLEXER_DEBUG_VERBOSE
0354         fprintf(stderr, "WARNING: JSONParse ending with unexpected END!\n");
0355 #endif
0356         return nullptr;
0357     }
0358     ASSERT_NOT_REACHED();
0359     return nullptr;
0360 }
0361
0362 // ------------------------------ JSONLexer --------------------------------
0363
0364 JSONLexer::JSONLexer(const UString &code)
0365     : m_code(code),
0366       m_pos(0)
0367 {
0368 }
0369
// Returns the type of the current token, i.e. the value most recently
// stored in m_type (next(), lexString() and lexNumber() all set it).
TokenType JSONLexer::current()
{
    return m_type;
}
0374
// Returns the numeric value stored by lexNumber().
// Must only be called while the current token is TokNumber (asserted).
double JSONLexer::currentNumber() const
{
    ASSERT(m_type == TokNumber);
    return m_numberToken;
}
0380
// Returns the string contents stored by lexString().
// Must only be called while the current token is TokString (asserted).
UString JSONLexer::currentString() const
{
    ASSERT(m_type == TokString);
    return m_stringToken;
}
0386
0387 TokenType JSONLexer::lexString()
0388 {
0389     UString string;
0390     const int codeSize = m_code.size();
0391     //skip first detected '"'
0392     ++m_pos;
0393
0394     if (m_pos >= codeSize) {
0395         m_type = TokError;
0396         return m_type;
0397     }
0398
0399     //while not at the end of the string '"'
0400     while (!(m_code[m_pos] == '"')) {
0401         UChar cur = m_code[m_pos];
0402         if (cur == UChar('\\')) {
0403             ++m_pos;
0404             bool error = false;
0405             string.append(parseEscapeChar(&error));
0406             if (error) {
0407                 m_type = TokError;
0408                 return m_type;
0409             }
0410         } else {
0411             if (cur.uc <= InvalidJSONUnicode) {
0412                 m_type = TokError;
0413                 return m_type;
0414             }
0415             string.append(cur);
0416             ++m_pos;
0417         }
0418
0419         if (m_pos >= codeSize) {
0420             m_type = TokError;
0421             return m_type;
0422         }
0423     }
0424
0425     m_type = TokString;
0426     m_stringToken = string;
0427     ++m_pos;
0428 #ifdef JSONLEXER_DEBUG_VERBOSE
0429     fprintf(stderr, "JSONLexer::lexString: Pos:%d stringlength:%d string:%s\n", m_pos, string.size(), string.ascii());
0430 #endif
0431     return m_type;
0432 }
0433
0434 TokenType JSONLexer::lexNumber()
0435 {
0436     const int start = m_pos;
0437     const int codeSize = m_code.size();
0438
0439     // -?(0 | [1-9][0-9]*) ('.' [0-9]+)? ([eE][+-]? [0-9]+)?
0440
0441     // -?
0442     if (m_pos < codeSize && m_code[m_pos] == '-') {
0443         ++m_pos;
0444     }
0445
0446     // (0 | [1-9][0-9]*)
0447     if (m_pos < codeSize && m_code[m_pos] == '0') {
0448         ++m_pos;
0449     } else if (m_pos < codeSize) {
0450         while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) {
0451             ++m_pos;
0452         }
0453     } else {
0454         m_type = TokError;
0455         return m_type;
0456     }
0457
0458     // ('.' [0-9]+)?
0459     if (m_pos < codeSize && m_code[m_pos] == '.') {
0460         ++m_pos;
0461         // [0-9]+
0462         if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) {
0463             m_type = TokError;
0464             return m_type;
0465         }
0466         ++m_pos;
0467
0468         while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) {
0469             ++m_pos;
0470         }
0471     }
0472
0473     //  ([eE][+-]? [0-9]+)?
0474     if (m_pos < codeSize && (m_code[m_pos] == 'e' || m_code[m_pos] == 'E')) { // [eE]
0475         ++m_pos;
0476
0477         // [-+]?
0478         if (m_pos < codeSize && (m_code[m_pos] == '-' || m_code[m_pos] == '+')) {
0479             ++m_pos;
0480         }
0481
0482         // [0-9]+
0483         if (m_pos >= codeSize || !isDecimalDigit(m_code[m_pos])) {
0484             m_type = TokError;
0485             return m_type;
0486         }
0487
0488         ++m_pos;
0489         while (m_pos < codeSize && isDecimalDigit(m_code[m_pos])) {
0490             ++m_pos;
0491         }
0492     }
0493
0494     m_numberToken = m_code.substr(start, m_pos - start).toDouble(false, false);
0495     m_type = TokNumber;
0496 #ifdef JSONLEXER_DEBUG_VERBOSE
0497     fprintf(stderr, "Number: %f\n", m_numberToken);
0498 #endif
0499     return m_type;
0500 }
0501
0502 UChar JSONLexer::parseEscapeChar(bool *error)
0503 {
0504     UChar cur = m_code[m_pos];
0505     switch (cur.uc) {
0506     case '"':
0507     case '\\':
0508     case '/':
0509         ++m_pos;
0510         return cur;
0511     case 'b':
0512         ++m_pos;
0513         return UChar('\b');
0514     case 'f':
0515         ++m_pos;
0516         return UChar('\f');
0517     case 'n':
0518         ++m_pos;
0519         return UChar('\n');
0520     case 'r':
0521         ++m_pos;
0522         return UChar('\r');
0523     case 't':
0524         ++m_pos;
0525         return UChar('\t');
0526     case 'u': {
0527         if ((m_code.size() - (m_pos + 1)) < 4) {
0528             *error = true;
0529             return UChar(' ');
0530         }
0531         if (!isHexDigit(m_code[m_pos + 1]) || !isHexDigit(m_code[m_pos + 2]) ||
0532                 !isHexDigit(m_code[m_pos + 3]) || !isHexDigit(m_code[m_pos + 4])) {
0533             *error = true;
0534             return UChar(' ');
0535         }
0536
0537         UChar next = Lexer::convertUnicode(m_code[m_pos + 1].uc, m_code[m_pos + 2].uc, m_code[m_pos + 3].uc, m_code[m_pos + 4].uc);
0538
0539         *error = false;
0540         m_pos += 5;
0541         return next;
0542     }
0543     default:
0544         *error = true;
0545         return UChar(' ');
0546     }
0547 }
0548
0549 //helper function, checks if "word" is in the "code" at "pos".
0550 static inline bool isStringSequence(int pos, const UString &code, const UString &word)
0551 {
0552     const int wordSize = word.size();
0553     if (pos + wordSize > code.size()) {
0554         return false;
0555     }
0556
0557     //Skip first, we already checked it
0558     for (int i = 1; i < wordSize; ++i) {
0559         if (code[pos + i].uc != word[i].uc) {
0560             return false;
0561         }
0562     }
0563     return true;
0564 }
0565
0566 TokenType JSONLexer::next()
0567 {
0568     while (true) {
0569         if (m_pos >= m_code.size()) {
0570             m_type = TokEnd;
0571             return m_type;
0572         }
0573
0574         if (!isJSONWhiteSpace(m_code[m_pos])) {
0575             break;
0576         }
0577         ++m_pos;
0578     }
0579
0580     m_type = TokError;
0581
0582 #ifdef JSONLEXER_DEBUG_VERBOSE
0583     fprintf(stderr, "JSONLexer::next current: %c \t\t pos: %d/%d\n", char(m_code[m_pos].uc), m_pos, m_code.size());
0584 #endif
0585
0586     switch (m_code[m_pos].uc) {
0587     case '[':
0588         m_type = TokLBracket;
0589         ++m_pos;
0590         return m_type;
0591     case ']':
0592         m_type = TokRBracket;
0593         ++m_pos;
0594         return m_type;
0595     case '(':
0596         m_type = TokLParen;
0597         ++m_pos;
0598         return m_type;
0599     case ')':
0600         m_type = TokRParen;
0601         ++m_pos;
0602         return m_type;
0603     case '{':
0604         m_type = TokLBrace;
0605         ++m_pos;
0606         return m_type;
0607     case '}':
0608         m_type = TokRBrace;
0609         ++m_pos;
0610         return m_type;
0611     case ',':
0612         m_type = TokComma;
0613         ++m_pos;
0614         return m_type;
0615     case ':':
0616         m_type = TokColon;
0617         ++m_pos;
0618         return m_type;
0619     case '"':
0620         return lexString();
0621
0622     case 't':
0623         if (isStringSequence(m_pos, m_code, "true")) {
0624             m_type = TokTrue;
0625             m_pos += 4;
0626             return m_type;
0627         }
0628         break;
0629     case 'f':
0630         if (isStringSequence(m_pos, m_code, "false")) {
0631             m_type = TokFalse;
0632             m_pos += 5;
0633             return m_type;
0634         }
0635         break;
0636     case 'n':
0637         if (isStringSequence(m_pos, m_code, "null")) {
0638             m_type = TokNull;
0639             m_pos += 4;
0640             return m_type;
0641         }
0642         break;
0643     case '-':
0644     case '0':
0645     case '1':
0646     case '2':
0647     case '3':
0648     case '4':
0649     case '5':
0650     case '6':
0651     case '7':
0652     case '8':
0653     case '9':
0654         return lexNumber();
0655     }
0656     return m_type;
0657 }
0658
0659 bool JSONParser::nextParseIsEOF()
0660 {
0661     return m_lexer.next() == TokEnd;
0662 }
0663
0664 } // namespace KJS