src/kjs/lexer.cpp

0001 /*
0002  *  This file is part of the KDE libraries
0003  *  Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
0004  *  Copyright (C) 2006 Apple Computer, Inc.
0005  *  Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
0006  *
0007  *  This library is free software; you can redistribute it and/or
0008  *  modify it under the terms of the GNU Library General Public
0009  *  License as published by the Free Software Foundation; either
0010  *  version 2 of the License, or (at your option) any later version.
0011  *
0012  *  This library is distributed in the hope that it will be useful,
0013  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
0014  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0015  *  Library General Public License for more details.
0016  *
0017  *  You should have received a copy of the GNU Library General Public License
0018  *  along with this library; see the file COPYING.LIB.  If not, write to
0019  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0020  *  Boston, MA 02110-1301, USA.
0021  *
0022  */
0023
0024 #include "lexer.h"
0025 #include <string.h>
0026 #include <limits.h>
0027
0028 #include "dtoa.h"
0029 #include "function.h"
0030 #include "interpreter.h"
0031 #include "nodes.h"
0032 #include "commonunicode.h"
0033 #include "wtf/ASCIICType.h"
0034 #include "wtf/DisallowCType.h"
0035 #include <wtf/unicode/libc/UnicodeLibC.h>
0036
0037 using namespace WTF;
0038 using namespace Unicode;
0039
0040 // GCC cstring uses these automatically, but not all implementations do.
0041 using std::strlen;
0042 using std::strcpy;
0043 using std::strncpy;
0044 using std::memset;
0045 using std::memcpy;
0046
0047 // we can't specify the namespace in yacc's C output, so do it here
0048 using namespace KJS;
0049
0050 #include "grammar.h"
0051
0052 #include "lookup.h"
0053 #include "lexer.lut.h"
0054
0055 extern YYLTYPE kjsyylloc; // global bison variable holding token info
0056
0057 // a bridge for yacc from the C world to C++
0058 int kjsyylex()
0059 {
0060     return lexer().lex();
0061 }
0062
0063 namespace KJS
0064 {
0065
// Forward declaration; the definition sits with the other character-class
// helpers further down in this file.
static bool isDecimalDigit(int c);

// Capacities reserved up front for the 8-bit/16-bit scan buffers and for the
// tables that own the strings and identifiers handed to the parser.
static constexpr size_t initialReadBufferCapacity = 32;
static constexpr size_t initialStringTableCapacity = 64;
0070
0071 Lexer &lexer()
0072 {
0073     // ASSERT(JSLock::currentThreadIsHoldingLock());
0074
0075     // FIXME: We'd like to avoid calling new here, but we don't currently
0076     // support tearing down the Lexer at app quit time, since that would involve
0077     // tearing down its UString data members without holding the JSLock.
0078     static Lexer *staticLexer = new Lexer;
0079     return *staticLexer;
0080 }
0081
0082 Lexer::Lexer()
0083     : yylineno(0)
0084     , restrKeyword(false)
0085     , eatNextIdentifier(false)
0086     , stackToken(-1)
0087     , lastToken(-1)
0088     , pos(0)
0089     , code(nullptr)
0090     , length(0)
0091 #ifndef KJS_PURE_ECMA
0092     , bol(true)
0093 #endif
0094     , current(0)
0095     , next1(0)
0096     , next2(0)
0097     , next3(0)
0098 {
0099     m_buffer8.reserveCapacity(initialReadBufferCapacity);
0100     m_buffer16.reserveCapacity(initialReadBufferCapacity);
0101     m_strings.reserveCapacity(initialStringTableCapacity);
0102     m_identifiers.reserveCapacity(initialStringTableCapacity);
0103 }
0104
0105 void Lexer::setCode(const UString &sourceURL, int startingLineNumber, const KJS::UChar *c, unsigned int len)
0106 {
0107     yylineno = startingLineNumber;
0108     m_sourceURL = sourceURL;
0109     restrKeyword = false;
0110     delimited = false;
0111     eatNextIdentifier = false;
0112     stackToken = -1;
0113     lastToken = -1;
0114     pos = 0;
0115     code = c;
0116     length = len;
0117     skipLF = false;
0118     skipCR = false;
0119     error = false;
0120 #ifndef KJS_PURE_ECMA
0121     bol = true;
0122 #endif
0123
0124     // read first characters
0125     current = (length > 0) ? code[0].uc : -1;
0126     next1 = (length > 1) ? code[1].uc : -1;
0127     next2 = (length > 2) ? code[2].uc : -1;
0128     next3 = (length > 3) ? code[3].uc : -1;
0129 }
0130
0131 void Lexer::shift(unsigned int p)
0132 {
0133     // Here would be a good place to strip Cf characters, but that has caused compatibility problems:
0134     // <http://bugs.webkit.org/show_bug.cgi?id=10183>.
0135     while (p--) {
0136         current = next1;
0137         next1 = next2;
0138         next2 = next3;
0139         pos++;
0140         next3 = (pos + 3 < length) ? code[pos + 3].uc : -1;
0141     }
0142 }
0143
0144 // called on each new line
0145 void Lexer::nextLine()
0146 {
0147     yylineno++;
0148 #ifndef KJS_PURE_ECMA
0149     bol = true;
0150 #endif
0151 }
0152
0153 void Lexer::setDone(State s)
0154 {
0155     state = s;
0156     done = true;
0157 }
0158
0159 int Lexer::lex()
0160 {
0161     int token = 0;
0162     state = Start;
0163     unsigned short stringType = 0; // either single or double quotes
0164     m_buffer8.clear();
0165     m_buffer16.clear();
0166     done = false;
0167     terminator = false;
0168     skipLF = false;
0169     skipCR = false;
0170
0171     // did we push a token on the stack previously ?
0172     // (after an automatic semicolon insertion)
0173     if (stackToken >= 0) {
0174         setDone(Other);
0175         token = stackToken;
0176         stackToken = 0;
0177     }
0178
0179     while (!done) {
0180         if (skipLF && current != '\n') { // found \r but not \n afterwards
0181             skipLF = false;
0182         }
0183         if (skipCR && current != '\r') { // found \n but not \r afterwards
0184             skipCR = false;
0185         }
0186         if (skipLF || skipCR) { // found \r\n or \n\r -> eat the second one
0187             skipLF = false;
0188             skipCR = false;
0189             shift(1);
0190         }
0191         switch (state) {
0192         case Start:
0193             if (isWhiteSpace()) {
0194                 // do nothing
0195             } else if (current == '/' && next1 == '/') {
0196                 shift(1);
0197                 state = InSingleLineComment;
0198             } else if (current == '/' && next1 == '*') {
0199                 shift(1);
0200                 state = InMultiLineComment;
0201             } else if (current == -1) {
0202                 if (!terminator && !delimited) {
0203                     // automatic semicolon insertion if program incomplete
0204                     token = ';';
0205                     stackToken = 0;
0206                     setDone(Other);
0207                 } else {
0208                     setDone(Eof);
0209                 }
0210             } else if (isLineTerminator()) {
0211                 nextLine();
0212                 terminator = true;
0213                 if (restrKeyword) {
0214                     token = ';';
0215                     setDone(Other);
0216                 }
0217             } else if (current == '"' || current == '\'') {
0218                 state = InString;
0219                 stringType = static_cast<unsigned short>(current);
0220             } else if (isIdentStart(current)) {
0221                 record16(current);
0222                 state = InIdentifierOrKeyword;
0223             } else if (current == '\\') {
0224                 state = InIdentifierStartUnicodeEscapeStart;
0225             } else if (current == '0') {
0226                 record8(current);
0227                 state = InNum0;
0228             } else if (isDecimalDigit(current)) {
0229                 record8(current);
0230                 state = InNum;
0231             } else if (current == '.' && isDecimalDigit(next1)) {
0232                 record8(current);
0233                 state = InDecimal;
0234 #ifndef KJS_PURE_ECMA
0235                 // <!-- marks the beginning of a line comment (for www usage)
0236             } else if (current == '<' && next1 == '!' &&
0237                        next2 == '-' && next3 == '-') {
0238                 shift(3);
0239                 state = InSingleLineComment;
0240                 // same for -->
0241             } else if (bol && current == '-' && next1 == '-' &&  next2 == '>') {
0242                 shift(2);
0243                 state = InSingleLineComment;
0244 #endif
0245             } else {
0246                 token = matchPunctuator(current, next1, next2, next3);
0247                 if (token != -1) {
0248                     setDone(Other);
0249                 } else {
0250                     //      cerr << "encountered unknown character" << endl;
0251                     setDone(Bad);
0252                 }
0253             }
0254             break;
0255         case InString:
0256             switch (current) {
0257             case '\'':
0258             case '"':
0259                 if (current == stringType) {
0260                     shift(1);
0261                     setDone(String);
0262                 } else {
0263                     record16(current);
0264                 }
0265                 break;
0266             case '\\':
0267                 state = InEscapeSequence;
0268                 break;
0269             case '\n':
0270             case '\r':
0271             case 0x2028:
0272             case 0x2029:
0273             case -1:
0274                 // encountered newline or eof
0275                 setDone(Bad);
0276                 break;
0277             default:
0278                 record16(current);
0279                 break;
0280             }
0281             break;
0282         // Escape Sequences inside of strings
0283         case InEscapeSequence:
0284             if (isOctalDigit(current)) {
0285                 if (current >= '0' && current <= '3' &&
0286                         isOctalDigit(next1) && isOctalDigit(next2)) {
0287                     record16(convertOctal(current, next1, next2));
0288                     shift(2);
0289                     state = InString;
0290                 } else if (isOctalDigit(current) && isOctalDigit(next1)) {
0291                     record16(convertOctal('0', current, next1));
0292                     shift(1);
0293                     state = InString;
0294                 } else if (isOctalDigit(current)) {
0295                     record16(convertOctal('0', '0', current));
0296                     state = InString;
0297                 } else {
0298                     setDone(Bad);
0299                 }
0300             } else if (current == 'x') {
0301                 state = InHexEscape;
0302             } else if (current == 'u') {
0303                 state = InUnicodeEscape;
0304             } else if (isLineTerminator()) {
0305                 nextLine();
0306                 state = InString;
0307             } else {
0308                 record16(singleEscape(static_cast<unsigned short>(current)));
0309                 state = InString;
0310             }
0311             break;
0312         case InHexEscape:
0313             if (isHexDigit(current) && isHexDigit(next1)) {
0314                 state = InString;
0315                 record16(convertHex(current, next1));
0316                 shift(1);
0317             } else {
0318                 setDone(Bad);
0319             }
0320             break;
0321         case InUnicodeEscape:
0322             if (isHexDigit(current) && isHexDigit(next1) && isHexDigit(next2) && isHexDigit(next3)) {
0323                 record16(convertUnicode(current, next1, next2, next3));
0324                 shift(3);
0325                 state = InString;
0326             } else if (current == stringType) {
0327                 record16('u');
0328                 shift(1);
0329                 setDone(String);
0330             } else {
0331                 setDone(Bad);
0332             }
0333             break;
0334         case InSingleLineComment:
0335             if (isLineTerminator()) {
0336                 nextLine();
0337                 terminator = true;
0338                 if (restrKeyword) {
0339                     token = ';';
0340                     setDone(Other);
0341                 } else {
0342                     state = Start;
0343                 }
0344             } else if (current == -1) {
0345                 setDone(Eof);
0346             }
0347             break;
0348         case InMultiLineComment:
0349             if (current == -1) {
0350                 setDone(Bad);
0351             } else if (isLineTerminator()) {
0352                 nextLine();
0353             } else if (current == '*' && next1 == '/') {
0354                 state = Start;
0355                 shift(1);
0356             }
0357             break;
0358         case InIdentifierOrKeyword:
0359         case InIdentifier:
0360             if (isIdentPart(current)) {
0361                 record16(current);
0362             } else if (current == '\\') {
0363                 state = InIdentifierPartUnicodeEscapeStart;
0364             } else {
0365                 setDone(state == InIdentifierOrKeyword ? IdentifierOrKeyword : Identifier);
0366             }
0367             break;
0368         case InNum0:
0369             if (current == 'x' || current == 'X') {
0370                 m_buffer8.clear();
0371                 state = InHex;
0372             } else if (current == 'b' || current == 'B') {
0373                 m_buffer8.clear();
0374                 state = InBinary;
0375             } else if (current == 'o' || current == 'O') {
0376                 m_buffer8.clear();
0377                 state = InOctal;
0378             } else if (current == '.') {
0379                 record8(current);
0380                 state = InDecimal;
0381             } else if (current == 'e' || current == 'E') {
0382                 record8(current);
0383                 state = InExponentIndicator;
0384             } else if (isOctalDigit(current)) {
0385                 record8(current);
0386                 state = InLegacyOctal;
0387             } else if (isDecimalDigit(current)) {
0388                 record8(current);
0389                 state = InDecimal;
0390             } else {
0391                 setDone(Number);
0392             }
0393             break;
0394         case InHex:
0395             if (isHexDigit(current)) {
0396                 record8(current);
0397             } else {
0398                 setDone(Hex);
0399             }
0400             break;
0401         case InOctal:
0402             if (isOctalDigit(current)) {
0403                 record8(current);
0404             } else if (isDecimalDigit(current)) {
0405                 setDone(Bad);
0406             } else {
0407                 setDone(Octal);
0408             }
0409             break;
0410         case InLegacyOctal:
0411             if (isOctalDigit(current)) {
0412                 record8(current);
0413             } else if (isDecimalDigit(current)) {
0414                 record8(current);
0415                 state = InDecimal;
0416             } else {
0417                 setDone(Octal);
0418             }
0419             break;
0420         case InBinary:
0421             if (isBinaryDigit(current)) {
0422                 record8(current);
0423             } else if (isDecimalDigit(current)) {
0424                 setDone(Bad);
0425             } else {
0426                 setDone(Binary);
0427             }
0428             break;
0429         case InNum:
0430             if (isDecimalDigit(current)) {
0431                 record8(current);
0432             } else if (current == '.') {
0433                 record8(current);
0434                 state = InDecimal;
0435             } else if (current == 'e' || current == 'E') {
0436                 record8(current);
0437                 state = InExponentIndicator;
0438             } else {
0439                 setDone(Number);
0440             }
0441             break;
0442         case InDecimal:
0443             if (isDecimalDigit(current)) {
0444                 record8(current);
0445             } else if (current == 'e' || current == 'E') {
0446                 record8(current);
0447                 state = InExponentIndicator;
0448             } else {
0449                 setDone(Number);
0450             }
0451             break;
0452         case InExponentIndicator:
0453             if (current == '+' || current == '-') {
0454                 record8(current);
0455             } else if (isDecimalDigit(current)) {
0456                 record8(current);
0457                 state = InExponent;
0458             } else {
0459                 setDone(Bad);
0460             }
0461             break;
0462         case InExponent:
0463             if (isDecimalDigit(current)) {
0464                 record8(current);
0465             } else {
0466                 setDone(Number);
0467             }
0468             break;
0469         case InIdentifierStartUnicodeEscapeStart:
0470             if (current == 'u') {
0471                 state = InIdentifierStartUnicodeEscape;
0472             } else {
0473                 setDone(Bad);
0474             }
0475             break;
0476         case InIdentifierPartUnicodeEscapeStart:
0477             if (current == 'u') {
0478                 state = InIdentifierPartUnicodeEscape;
0479             } else {
0480                 setDone(Bad);
0481             }
0482             break;
0483         case InIdentifierStartUnicodeEscape:
0484             if (!isHexDigit(current) || !isHexDigit(next1) || !isHexDigit(next2) || !isHexDigit(next3)) {
0485                 setDone(Bad);
0486                 break;
0487             }
0488             token = convertUnicode(current, next1, next2, next3).uc;
0489             shift(3);
0490             if (!isIdentStart(token)) {
0491                 setDone(Bad);
0492                 break;
0493             }
0494             record16(token);
0495             state = InIdentifier;
0496             break;
0497         case InIdentifierPartUnicodeEscape:
0498             if (!isHexDigit(current) || !isHexDigit(next1) || !isHexDigit(next2) || !isHexDigit(next3)) {
0499                 setDone(Bad);
0500                 break;
0501             }
0502             token = convertUnicode(current, next1, next2, next3).uc;
0503             shift(3);
0504             if (!isIdentPart(token)) {
0505                 setDone(Bad);
0506                 break;
0507             }
0508             record16(token);
0509             state = InIdentifier;
0510             break;
0511         default:
0512             assert(!"Unhandled state in switch statement");
0513         }
0514
0515         // move on to the next character
0516         if (!done) {
0517             shift(1);
0518         }
0519 #ifndef KJS_PURE_ECMA
0520         if (state != Start && state != InMultiLineComment) {
0521             bol = false;
0522         }
0523 #endif
0524     }
0525
0526     // no identifiers allowed directly after numeric literal, e.g. "3in" is bad
0527     if ((state == Number || state == Octal || state == Hex || state == Binary) &&
0528     isIdentStart(current)) {
0529         state = Bad;
0530     }
0531
0532     // terminate string
0533     m_buffer8.append('\0');
0534
0535 #ifdef KJS_DEBUG_LEX
0536     fprintf(stderr, "line: %d ", lineNo());
0537     fprintf(stderr, "yytext (%x): ", m_buffer8[0]);
0538     fprintf(stderr, "%s ", m_buffer8.data());
0539 #endif
0540
0541     double dval = 0;
0542     if (state == Number) {
0543         dval = kjs_strtod(m_buffer8.data(), nullptr);
0544     } else if (state == Hex) { // scan hex numbers
0545         // buffer contains "...\0" found after 0x
0546         if (m_buffer8.size() > 1) {
0547             const char *p = m_buffer8.data();
0548             while (char c = *p++) {
0549                 dval *= 16;
0550                 dval += convertHex(c);
0551             }
0552             if (dval >= mantissaOverflowLowerBound) {
0553                 dval = parseIntOverflow(m_buffer8.data(), m_buffer8.size() - 1, 16);
0554             }
0555             state = Number;
0556         } else {
0557             // no digits seen after 0x
0558             state = Bad;
0559         }
0560     } else if (state == Octal) {   // scan octal number
0561         // buffer contains "...\0" found after 0o
0562         if (m_buffer8.size() > 1) {
0563             const char *p = m_buffer8.data();
0564             while (char c = *p++) {
0565                 dval *= 8;
0566                 dval += c - '0';
0567             }
0568             if (dval >= mantissaOverflowLowerBound) {
0569                 dval = parseIntOverflow(m_buffer8.data(), m_buffer8.size() - 1, 8);
0570             }
0571             state = Number;
0572         } else {
0573             // no octal digits after 0o
0574             state = Bad;
0575         }
0576     } else if (state == Binary) { // scan binary numbers
0577         // buffer contains the binary digits after "0b". E.g. "1010\0"
0578         if (m_buffer8.size () > 1) {
0579             const char *p = m_buffer8.data();
0580             while (char c = *p++) {
0581                 dval *= 2;
0582                 dval += convertHex(c);
0583             }
0584             if (dval >= mantissaOverflowLowerBound) {
0585                 dval = parseIntOverflow(m_buffer8.data() + 2, p - (m_buffer8.data() + 3), 2);
0586             }
0587             state = Number;
0588         } else {
0589             state = Bad;
0590         }
0591     }
0592
0593 #ifdef KJS_DEBUG_LEX
0594     switch (state) {
0595     case Eof:
0596         printf("(EOF)\n");
0597         break;
0598     case Other:
0599         printf("(Other)\n");
0600         break;
0601     case Identifier:
0602         printf("(Identifier)/(Keyword)\n");
0603         break;
0604     case String:
0605         printf("(String)\n");
0606         break;
0607     case Number:
0608         printf("(Number)\n");
0609         break;
0610     default:
0611         printf("(unknown)");
0612     }
0613 #endif
0614
0615     if (state != Identifier && eatNextIdentifier) {
0616         eatNextIdentifier = false;
0617     }
0618
0619     restrKeyword = false;
0620     delimited = false;
0621     kjsyylloc.first_line = yylineno; // ???
0622     kjsyylloc.last_line = yylineno;
0623
0624     switch (state) {
0625     case Eof:
0626         token = 0;
0627         break;
0628     case Other:
0629         if (token == '}' || token == ';') {
0630             delimited = true;
0631         }
0632         break;
0633     case IdentifierOrKeyword:
0634         if ((token = Lookup::find(&mainTable, m_buffer16.data(), m_buffer16.size())) < 0) {
0635         case Identifier:
0636             // Lookup for keyword failed, means this is an identifier
0637             // Apply anonymous-function hack below (eat the identifier)
0638             if (eatNextIdentifier) {
0639                 eatNextIdentifier = false;
0640                 token = lex();
0641                 break;
0642             }
0643             kjsyylval.ident = makeIdentifier(m_buffer16);
0644             token = IDENT;
0645             break;
0646         }
0647
0648         eatNextIdentifier = false;
0649         // Hack for "f = function somename() { ... }", too hard to get into the grammar
0650         if (token == FUNCTION && lastToken == '=') {
0651             eatNextIdentifier = true;
0652         }
0653
0654         if (token == CONTINUE || token == BREAK ||
0655                 token == RETURN || token == THROW) {
0656             restrKeyword = true;
0657         }
0658         break;
0659     case String:
0660         kjsyylval.ustr = makeUString(m_buffer16);
0661         token = STRING;
0662         break;
0663     case Number:
0664         kjsyylval.dval = dval;
0665         token = NUMBER;
0666         break;
0667     case Bad:
0668 #ifdef KJS_DEBUG_LEX
0669         fprintf(stderr, "KJS: yylex: ERROR.\n");
0670 #endif
0671         error = true;
0672         return -1;
0673     default:
0674         assert(!"unhandled numeration value in switch");
0675         error = true;
0676         return -1;
0677     }
0678     lastToken = token;
0679     return token;
0680 }
0681
0682 bool Lexer::isWhiteSpace() const
0683 {
0684     return CommonUnicode::isWhiteSpace(current);
0685 }
0686
0687 bool Lexer::isLineTerminator()
0688 {
0689     bool cr = (current == '\r');
0690     bool lf = (current == '\n');
0691     if (cr) {
0692         skipLF = true;
0693     } else if (lf) {
0694         skipCR = true;
0695     }
0696     return cr || lf || current == 0x2028 || current == 0x2029;
0697 }
0698
0699 typedef bool (CharacterCheck)(int c);
0700
0701 static bool isIdentStartLibC(int c)
0702 {
0703     return (category(c) & (Letter_Uppercase | Letter_Lowercase |
0704                            Letter_Titlecase | Letter_Modifier | Letter_Other))
0705            || c == '$' || c == '_';
0706 }
0707
0708 static bool isIdentPartLibC(int c)
0709 {
0710     return (category(c) & (Letter_Uppercase | Letter_Lowercase |
0711                            Letter_Titlecase | Letter_Modifier | Letter_Other |
0712                            Mark_NonSpacing | Mark_SpacingCombining |
0713                            Number_DecimalDigit | Punctuation_Connector))
0714            || c == '$' || c == '_';
0715 }
0716
0717 static CharacterCheck *identStart = ::isIdentStartLibC;
0718 static CharacterCheck *identPart = ::isIdentPartLibC;
0719
0720 void Lexer::setIdentStartChecker(bool (*f)(int c))
0721 {
0722     identStart = f;
0723 }
0724
0725 void Lexer::setIdentPartChecker(bool (*f)(int c))
0726 {
0727     identPart = f;
0728 }
0729
0730 bool Lexer::isIdentStart(int c)
0731 {
0732     return (*identStart)(c);
0733 }
0734
0735 bool Lexer::isIdentPart(int c)
0736 {
0737     return (*identPart)(c);
0738 }
0739
// True for the ASCII digits '0' through '9' only.
static bool isDecimalDigit(int c)
{
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
0744
0745 bool Lexer::isHexDigit(int c)
0746 {
0747     return ((c >= '0' && c <= '9') ||
0748             (c >= 'a' && c <= 'f') ||
0749             (c >= 'A' && c <= 'F'));
0750 }
0751
0752 bool Lexer::isBinaryDigit(int c)
0753 {
0754     return c == '0' || c == '1';
0755 }
0756
0757 bool Lexer::isOctalDigit(int c)
0758 {
0759     return (c >= '0' && c <= '7');
0760 }
0761
0762 int Lexer::matchPunctuator(int c1, int c2, int c3, int c4)
0763 {
0764     if (c1 == '>' && c2 == '>' && c3 == '>' && c4 == '=') {
0765         shift(4);
0766         return URSHIFTEQUAL;
0767     } else if (c1 == '=' && c2 == '=' && c3 == '=') {
0768         shift(3);
0769         return STREQ;
0770     } else if (c1 == '!' && c2 == '=' && c3 == '=') {
0771         shift(3);
0772         return STRNEQ;
0773     } else if (c1 == '>' && c2 == '>' && c3 == '>') {
0774         shift(3);
0775         return URSHIFT;
0776     } else if (c1 == '<' && c2 == '<' && c3 == '=') {
0777         shift(3);
0778         return LSHIFTEQUAL;
0779     } else if (c1 == '>' && c2 == '>' && c3 == '=') {
0780         shift(3);
0781         return RSHIFTEQUAL;
0782     } else if (c1 == '<' && c2 == '=') {
0783         shift(2);
0784         return LE;
0785     } else if (c1 == '>' && c2 == '=') {
0786         shift(2);
0787         return GE;
0788     } else if (c1 == '!' && c2 == '=') {
0789         shift(2);
0790         return NE;
0791     } else if (c1 == '+' && c2 == '+') {
0792         shift(2);
0793         if (terminator) {
0794             return AUTOPLUSPLUS;
0795         } else {
0796             return PLUSPLUS;
0797         }
0798     } else if (c1 == '-' && c2 == '-') {
0799         shift(2);
0800         if (terminator) {
0801             return AUTOMINUSMINUS;
0802         } else {
0803             return MINUSMINUS;
0804         }
0805     } else if (c1 == '=' && c2 == '=') {
0806         shift(2);
0807         return EQEQ;
0808     } else if (c1 == '+' && c2 == '=') {
0809         shift(2);
0810         return PLUSEQUAL;
0811     } else if (c1 == '-' && c2 == '=') {
0812         shift(2);
0813         return MINUSEQUAL;
0814     } else if (c1 == '*' && c2 == '=') {
0815         shift(2);
0816         return MULTEQUAL;
0817     } else if (c1 == '/' && c2 == '=') {
0818         shift(2);
0819         return DIVEQUAL;
0820     } else if (c1 == '*' && c2 == '*' && c3 == '=') {
0821         shift(3);
0822         return EXPEQUAL;
0823     } else if (c1 == '&' && c2 == '=') {
0824         shift(2);
0825         return ANDEQUAL;
0826     } else if (c1 == '^' && c2 == '=') {
0827         shift(2);
0828         return XOREQUAL;
0829     } else if (c1 == '%' && c2 == '=') {
0830         shift(2);
0831         return MODEQUAL;
0832     } else if (c1 == '|' && c2 == '=') {
0833         shift(2);
0834         return OREQUAL;
0835     } else if (c1 == '<' && c2 == '<') {
0836         shift(2);
0837         return LSHIFT;
0838     } else if (c1 == '>' && c2 == '>') {
0839         shift(2);
0840         return RSHIFT;
0841     } else if (c1 == '&' && c2 == '&') {
0842         shift(2);
0843         return AND;
0844     } else if (c1 == '|' && c2 == '|') {
0845         shift(2);
0846         return OR;
0847     } else if (c1 == '*' && c2 == '*') {
0848     shift(2);
0849     return T_EXP;
0850     }
0851
0852     switch (c1) {
0853     case '=':
0854     case '>':
0855     case '<':
0856     case ',':
0857     case '!':
0858     case '~':
0859     case '?':
0860     case ':':
0861     case '.':
0862     case '+':
0863     case '-':
0864     case '*':
0865     case '/':
0866     case '&':
0867     case '|':
0868     case '^':
0869     case '%':
0870     case '(':
0871     case ')':
0872     case '{':
0873     case '}':
0874     case '[':
0875     case ']':
0876     case ';':
0877         shift(1);
0878         return static_cast<int>(c1);
0879     default:
0880         return -1;
0881     }
0882 }
0883
0884 unsigned short Lexer::singleEscape(unsigned short c)
0885 {
0886     switch (c) {
0887     case 'b':
0888         return 0x08;
0889     case 't':
0890         return 0x09;
0891     case 'n':
0892         return 0x0A;
0893     case 'v':
0894         return 0x0B;
0895     case 'f':
0896         return 0x0C;
0897     case 'r':
0898         return 0x0D;
0899     case '"':
0900         return 0x22;
0901     case '\'':
0902         return 0x27;
0903     case '\\':
0904         return 0x5C;
0905     default:
0906         return c;
0907     }
0908 }
0909
0910 unsigned short Lexer::convertOctal(int c1, int c2, int c3)
0911 {
0912     return static_cast<unsigned short>((c1 - '0') * 64 + (c2 - '0') * 8 + c3 - '0');
0913 }
0914
0915 unsigned char Lexer::convertHex(int c)
0916 {
0917     if (c >= '0' && c <= '9') {
0918         return static_cast<unsigned char>(c - '0');
0919     }
0920     if (c >= 'a' && c <= 'f') {
0921         return static_cast<unsigned char>(c - 'a' + 10);
0922     }
0923     return static_cast<unsigned char>(c - 'A' + 10);
0924 }
0925
0926 unsigned char Lexer::convertHex(int c1, int c2)
0927 {
0928     return ((convertHex(c1) << 4) + convertHex(c2));
0929 }
0930
0931 KJS::UChar Lexer::convertUnicode(int c1, int c2, int c3, int c4)
0932 {
0933     return KJS::UChar((convertHex(c1) << 4) + convertHex(c2),
0934                       (convertHex(c3) << 4) + convertHex(c4));
0935 }
0936
0937 void Lexer::record8(int c)
0938 {
0939     ASSERT(c >= 0);
0940     ASSERT(c <= 0xff);
0941     m_buffer8.append(c);
0942 }
0943
0944 void Lexer::record16(int c)
0945 {
0946     ASSERT(c >= 0);
0947     ASSERT(c <= USHRT_MAX);
0948     record16(UChar(static_cast<unsigned short>(c)));
0949 }
0950
0951 void Lexer::record16(KJS::UChar c)
0952 {
0953     m_buffer16.append(c);
0954 }
0955
0956 bool Lexer::scanRegExp()
0957 {
0958     m_buffer16.clear();
0959     bool lastWasEscape = false;
0960     bool inBrackets = false;
0961
0962     while (1) {
0963         if (isLineTerminator() || current == -1) {
0964             return false;
0965         } else if (current != '/' || lastWasEscape == true || inBrackets == true) {
0966             // keep track of '[' and ']'
0967             if (!lastWasEscape) {
0968                 if (current == '[' && !inBrackets) {
0969                     inBrackets = true;
0970                 }
0971                 if (current == ']' && inBrackets) {
0972                     inBrackets = false;
0973                 }
0974             }
0975             record16(current);
0976             lastWasEscape =
0977                 !lastWasEscape && (current == '\\');
0978         } else { // end of regexp
0979             m_pattern = UString(m_buffer16);
0980             m_buffer16.clear();
0981             shift(1);
0982             break;
0983         }
0984         shift(1);
0985     }
0986
0987     while (isIdentPart(current)) {
0988         record16(current);
0989         shift(1);
0990     }
0991     m_flags = UString(m_buffer16);
0992
0993     return true;
0994 }
0995
0996 void Lexer::clear()
0997 {
0998     deleteAllValues(m_strings);
0999     Vector<UString *> newStrings;
1000     newStrings.reserveCapacity(initialStringTableCapacity);
1001     m_strings.swap(newStrings);
1002     deleteAllValues(m_identifiers);
1003     Vector<KJS::Identifier *> newIdentifiers;
1004     newIdentifiers.reserveCapacity(initialStringTableCapacity);
1005     m_identifiers.swap(newIdentifiers);
1006
1007     Vector<char> newBuffer8;
1008     newBuffer8.reserveCapacity(initialReadBufferCapacity);
1009     m_buffer8.swap(newBuffer8);
1010
1011     Vector<UChar> newBuffer16;
1012     newBuffer16.reserveCapacity(initialReadBufferCapacity);
1013     m_buffer16.swap(newBuffer16);
1014
1015     m_pattern = nullptr;
1016     m_flags = nullptr;
1017     m_sourceURL = nullptr;
1018 }
1019
1020 Identifier *Lexer::makeIdentifier(const Vector<KJS::UChar> &buffer)
1021 {
1022     KJS::Identifier *identifier = new KJS::Identifier(buffer.data(), buffer.size());
1023     m_identifiers.append(identifier);
1024     return identifier;
1025 }
1026
1027 UString *Lexer::makeUString(const Vector<KJS::UChar> &buffer)
1028 {
1029     UString *string = new UString(buffer);
1030     m_strings.append(string);
1031     return string;
1032 }
1033
1034 } // namespace KJS