File indexing completed on 2024-03-24 16:04:29

0001 /*
0002     SPDX-FileCopyrightText: 2008 Niko Sams <niko.sams@gmail.com>
0003 
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006 
0007 #include "phplexer.h"
0008 
0009 #include "phpparser.h"
0010 #include "tokenstream.h"
0011 
0012 #include <QString>
0013 #include <QStringList>
0014 #include <QRegExp>
0015 #include <QDebug>
0016 
0017 #include "parserdebug.h"
0018 
0019 namespace Php
0020 {
0021 
0022 Lexer::Lexer(TokenStream* tokenStream, const QString& content, int initialState):
0023         m_content(content), m_tokenStream(tokenStream),
0024         m_curpos(0), m_contentSize(m_content.size()),
0025         m_tokenBegin(0), m_tokenEnd(0), m_haltCompiler(0)
0026 {
0027     pushState(ErrorState);
0028     if (initialState == DefaultState) {
0029         pushState(HtmlState);
0030     }
0031     pushState(initialState);
0032 }
0033 
0034 int Lexer::state(int deepness) const
0035 {
0036     return m_state.at(m_state.size() - deepness - 1);
0037 }
0038 void Lexer::printState()
0039 {
0040     int s = state();
0041     if (s == ErrorState)
0042         qDebug() << "ErrorState";
0043     else if (s == HtmlState)
0044         qDebug() << "HtmlState";
0045     else if (s == DefaultState)
0046         qDebug() << "DefaultState";
0047     else if (s == String)
0048         qDebug() << "String";
0049     else if (s == StringVariable)
0050         qDebug() << "StringVariable";
0051     else if (s == StringVariableBracket)
0052         qDebug() << "StringVariableBracket";
0053     else if (s == StringVariableObjectOperator)
0054         qDebug() << "StringVariableObjectOperator";
0055     else if (s == StringVariableCurly)
0056         qDebug() << "StringVariableCurly";
0057     else if (s == StringVarname)
0058         qDebug() << "StringVarname";
0059     else if (s == StringHeredoc)
0060         qDebug() << "StringHeredoc";
0061     else if (s == StringBacktick)
0062         qDebug() << "StringBacktick";
0063 }
0064 
0065 void Lexer::pushState(int state)
0066 {
0067     m_state.push(state);
0068 }
0069 
0070 void Lexer::popState()
0071 {
0072     m_state.pop();
0073 }
0074 
0075 int Lexer::nextTokenKind()
0076 {
0077     int token = Parser::Token_INVALID;
0078     if (m_curpos >= m_contentSize) {
0079         m_tokenBegin = -1;
0080         m_tokenEnd = -1;
0081         createNewline(m_curpos);
0082         return 0;
0083     }
0084 
0085     const QChar* it = m_content.constData();
0086     it += m_curpos;
0087     m_tokenBegin = m_curpos;
0088     switch (state()) {
0089     case HtmlState:
0090         if (it->unicode() == '<' && (it + 1)->unicode() == '?'
0091             ///TODO: per-project configuration to set whether we use shortags
0092             ///      or not. In the former case we'd need to rise an error here
0093             && !( (it + 2)->toLower().unicode() == 'x'
0094                  && (it + 3)->toLower().unicode() == 'm'
0095                  && (it + 4)->toLower().unicode() == 'l' ) )
0096         {
0097             token = Parser::Token_OPEN_TAG;
0098             if ((it + 2)->unicode() == '=') {
0099                 token = Parser::Token_OPEN_TAG_WITH_ECHO;
0100                 m_curpos++;
0101                 it++;
0102             } else if ((it + 2)->toLower().unicode() == 'p'
0103                     && (it + 3)->toLower().unicode() == 'h'
0104                     && (it + 4)->toLower().unicode() == 'p'
0105                     && (it + 5)->isSpace()) {
0106                 m_curpos += 4;
0107                 if ((it + 5)->unicode() == '\n') createNewline(m_curpos + 1);
0108             }
0109             m_curpos++;
0110             pushState(DefaultState);
0111         } else {
0112             token = Parser::Token_INLINE_HTML;
0113             while (m_curpos < m_contentSize) {
0114                 if (it->unicode() == '\n') createNewline(m_curpos);
0115                 if ((it + 1)->unicode() == '<' && (it + 2)->unicode() == '?') {
0116                     break;
0117                 }
0118                 it++;
0119                 m_curpos++;
0120             }
0121         }
0122         break;
0123     case DefaultState:
0124     case StringVariableCurly: {
0125         if (it->isSpace()) {
0126             token = Parser::Token_WHITESPACE;
0127             while (m_curpos < m_contentSize && it->isSpace()) {
0128                 if (it->unicode() == '\n') createNewline(m_curpos);
0129                 it++;
0130                 m_curpos++;
0131             }
0132             m_curpos--;
0133         } else if (it->isDigit() || (it->unicode() == '.' && (it + 1)->isDigit())) {
0134             QString num;bool hasPoint = false;
0135             bool hex = false;
0136             bool bin = false;
0137             if (it->unicode() == '0' && (it + 1)->toLower() == 'x') {
0138                 it += 2;
0139                 m_curpos += 2;
0140                 hex = true;
0141             }
0142             if (it->unicode() == '0' && (it + 1)->toLower() == 'b') {
0143                 it += 2;
0144                 m_curpos += 2;
0145                 bin = true;
0146             }
0147             while (m_curpos < m_contentSize && (
0148                         it->isDigit()
0149                         || (!hex && !hasPoint && it->unicode() == '.')
0150                         || (bin && (it->unicode() == '0' || it->unicode() == '1'))
0151                         || (hex && (it->toLower() == 'a' || it->toLower() == 'b' ||
0152                                     it->toLower() == 'c' || it->toLower() == 'd' ||
0153                                     it->toLower() == 'e' || it->toLower() == 'f')))) {
0154                 if (it->unicode() == '.') hasPoint = true;
0155                 num.append(*it);
0156                 it++;
0157                 m_curpos++;
0158             }
0159             if (!hex && !bin && it->toLower() == 'e' &&
0160                     ((it + 1)->isDigit() ||
0161                      (((it + 1)->unicode() == '-' || (it + 1)->unicode() == '+') && (it + 2)->isDigit()))) {
0162                 //exponential number
0163                 token = Parser::Token_DNUMBER;
0164                 m_curpos++;
0165                 it++;
0166                 if (it->unicode() == '-' || it->unicode() == '+') {
0167                     it++;
0168                     m_curpos++;
0169                 }
0170                 while (m_curpos < m_contentSize && (it->isDigit())) {
0171                     it++;
0172                     m_curpos++;
0173                 }
0174                 m_curpos--;
0175             } else {
0176                 m_curpos--;
0177                 if (hasPoint) {
0178                     token = Parser::Token_DNUMBER;
0179                 } else {
0180                     bool ok;
0181                     //check if string can be converted to long
0182                     //if we get an overflow use double
0183                     num.toLong(&ok, hex ? 16 : 10);
0184                     if (ok) {
0185                         token = Parser::Token_LNUMBER;
0186                     } else {
0187                         token = Parser::Token_DNUMBER;
0188                     }
0189                 }
0190             }
0191 
0192         } else if (processVariable(it)) {
0193             token = Parser::Token_VARIABLE;
0194         } else if (it->unicode() == '$') {
0195             //when it was not recognized as variable
0196             token = Parser::Token_DOLLAR;
0197         } else if (it->unicode() == '}') {
0198             token = Parser::Token_RBRACE;
0199             if (state() == StringVariableCurly) {
0200                 popState();
0201             }
0202         } else if (it->unicode() == '{') {
0203             token = Parser::Token_LBRACE;
0204             if (state() == StringVariableCurly) {
0205                 pushState(StringVariableCurly);
0206             }
0207         } else if (it->unicode() == ')') {
0208             token = Parser::Token_RPAREN;
0209         } else if (it->unicode() == '(') {
0210             it++;
0211             int pos = m_curpos + 1;
0212             while (pos < m_contentSize && it->isSpace()) {
0213                 it++;
0214                 pos++;
0215             }
0216             const int nameStart = pos;
0217             while (pos < m_contentSize && it->isLetter()) {
0218                 it++;
0219                 pos++;
0220             }
0221             const auto name = m_content.midRef(nameStart, pos - nameStart);
0222             while (pos < m_contentSize && it->isSpace()) {
0223                 it++;
0224                 pos++;
0225             }
0226             if (it->unicode() == ')') {
0227                 if (name.compare(QLatin1String("int"), Qt::CaseInsensitive) == 0
0228                     || name.compare(QLatin1String("integer"), Qt::CaseInsensitive) == 0)
0229                 {
0230                     token = Parser::Token_INT_CAST;
0231                 } else if (name.compare(QLatin1String("real"), Qt::CaseInsensitive) == 0
0232                     || name.compare(QLatin1String("double"), Qt::CaseInsensitive) == 0
0233                     || name.compare(QLatin1String("float"), Qt::CaseInsensitive) == 0)
0234                 {
0235                     token = Parser::Token_DOUBLE_CAST;
0236                 } else if (name.compare(QLatin1String("string"), Qt::CaseInsensitive) == 0) {
0237                     token = Parser::Token_STRING_CAST;
0238                 } else if (name.compare(QLatin1String("binary"), Qt::CaseInsensitive) == 0) {
0239                     //as in php
0240                     token = Parser::Token_STRING_CAST;
0241                 } else if (name.compare(QLatin1String("array"), Qt::CaseInsensitive) == 0) {
0242                     token = Parser::Token_ARRAY_CAST;
0243                 } else if (name.compare(QLatin1String("object"), Qt::CaseInsensitive) == 0) {
0244                     token = Parser::Token_OBJECT_CAST;
0245                 } else if (name.compare(QLatin1String("bool"), Qt::CaseInsensitive) == 0
0246                     || name.compare(QLatin1String("boolean"), Qt::CaseInsensitive) == 0)
0247                 {
0248                     token = Parser::Token_BOOL_CAST;
0249                 } else if (name.compare(QLatin1String("unset"), Qt::CaseInsensitive) == 0) {
0250                     token = Parser::Token_UNSET_CAST;
0251                 } else {
0252                     token = Parser::Token_LPAREN;
0253                 }
0254 
0255                 if (token != Parser::Token_LPAREN) {
0256                     m_curpos = pos;
0257                 }
0258             } else {
0259                 token = Parser::Token_LPAREN;
0260             }
0261         } else if (it->unicode() == ']') {
0262             token = Parser::Token_RBRACKET;
0263         } else if (it->unicode() == '[') {
0264             token = Parser::Token_LBRACKET;
0265         } else if (it->unicode() == ',') {
0266             token = Parser::Token_COMMA;
0267         } else if (it->unicode() == '@') {
0268             token = Parser::Token_AT;
0269         } else if (it->unicode() == '!') {
0270             if ((it + 1)->unicode() == '=') {
0271                 m_curpos++;
0272                 if ((it + 2)->unicode() == '=') {
0273                     m_curpos++;
0274                     token = Parser::Token_IS_NOT_IDENTICAL;
0275                 } else {
0276                     token = Parser::Token_IS_NOT_EQUAL;
0277                 }
0278             } else {
0279                 token = Parser::Token_BANG;
0280             }
0281         } else if (it->unicode() == '<') {
0282             if ((it + 1)->unicode() == '<') {
0283                 m_curpos++;
0284                 if ((it + 2)->unicode() == '<' && state() != StringVariableCurly) {
0285                     //HEREDOC string (<<< EOD\nfoo\nEOD;\n)
0286                     int pos = 3;
0287                     while (m_curpos + pos < m_contentSize &&
0288                             ((it + pos)->unicode() == ' ' || (it + pos)->unicode() == '\t')) {
0289                         pos++;
0290                     }
0291                     bool isNowdoc = (it + pos)->unicode() == '\'';
0292                     bool foundQuote = isNowdoc || (it + pos)->unicode() == '"';
0293                     if (foundQuote) {
0294                         ++pos;
0295                     }
0296                     if ((it + pos)->isLetter() || (it + pos)->unicode() == '_') { //identifier must start with a letter
0297                         m_hereNowDocIdentifier.clear();
0298                         while (m_curpos + pos < m_contentSize &&
0299                                 ((it + pos)->isDigit() || (it + pos)->isLetter() || (it + pos)->unicode() == '_')) {
0300                             m_hereNowDocIdentifier.append(*(it + pos));
0301                             pos++;
0302                         }
0303                         if (foundQuote && (m_curpos + pos) < m_contentSize) {
0304                             if (isNowdoc && (it+pos)->unicode() == '\'') {
0305                                 ++pos;
0306                             } else if ((it+pos)->unicode() == '"') {
0307                                 ++pos;
0308                             }
0309                         }
0310                         if (m_curpos + pos < m_contentSize && (it + pos)->unicode() == '\n') {
0311                             //identifier must be followed by newline, newline is part of HEREDOC token
0312                             if (isNowdoc) {
0313                                 token = Parser::Token_START_NOWDOC;
0314                                 pushState(StringNowdoc);
0315                             } else {
0316                                 token = Parser::Token_START_HEREDOC;
0317                                 pushState(StringHeredoc);
0318                             }
0319                             m_curpos += pos - 1;
0320                             createNewline(m_curpos);
0321                         }
0322                     }
0323                 }
0324 
0325                 if (token != Parser::Token_START_HEREDOC && token != Parser::Token_START_NOWDOC) {
0326                     if ((it + 2)->unicode() == '=') {
0327                         m_curpos++;
0328                         token = Parser::Token_SL_ASSIGN;
0329                     } else {
0330                         token = Parser::Token_SL;
0331                     }
0332                 }
0333             } else if ((it + 1)->unicode() == '=') {
0334                 if ((it + 2)->unicode() == '>') {
0335                     m_curpos += 2;
0336                     token = Parser::Token_SPACESHIP;
0337                 } else {
0338                     m_curpos++;
0339                     token = Parser::Token_IS_SMALLER_OR_EQUAL;
0340                 }
0341             } else if ((it + 1)->unicode() == '>') {
0342                 m_curpos++;
0343                 token = Parser::Token_IS_NOT_EQUAL;
0344             } else {
0345                 token = Parser::Token_IS_SMALLER;
0346             }
0347         } else if (it->unicode() == '>') {
0348             if ((it + 1)->unicode() == '>') {
0349                 m_curpos++;
0350                 if ((it + 2)->unicode() == '=') {
0351                     m_curpos++;
0352                     token = Parser::Token_SR_ASSIGN;
0353                 } else {
0354                     token = Parser::Token_SR;
0355                 }
0356             } else if ((it + 1)->unicode() == '=') {
0357                 m_curpos++;
0358                 token = Parser::Token_IS_GREATER_OR_EQUAL;
0359             } else {
0360                 token = Parser::Token_IS_GREATER;
0361             }
0362         } else if (it->unicode() == '~') {
0363             token = Parser::Token_TILDE;
0364         } else if (it->unicode() == ':') {
0365             if ((it + 1)->unicode() == ':') {
0366                 m_curpos++;
0367                 token = Parser::Token_PAAMAYIM_NEKUDOTAYIM;
0368             } else {
0369                 token = Parser::Token_COLON;
0370             }
0371         } else if (it->unicode() == '?') {
0372             if ((it + 1)->unicode() == '>') {
0373                 //accept CLOSE_TAG inside StringVariableCurly too, as php does
0374                 token = Parser::Token_CLOSE_TAG;
0375                 m_curpos++;
0376                 while (state() != HtmlState) popState();
0377             } else if ((it + 1)->unicode() == '?') {
0378                 token = Parser::Token_NULL_COALESCE;
0379                 m_curpos++;
0380             } else {
0381                 token = Parser::Token_QUESTION;
0382             }
0383         } else if (it->unicode() == '-' && (it + 1)->unicode() == '>') {
0384             m_curpos++;
0385             token = Parser::Token_OBJECT_OPERATOR;
0386             if (isValidVariableIdentifier(it + 2)) {
0387                 pushState(StringVariableObjectOperator);
0388             }
0389         } else if (it->unicode() == '%') {
0390             if ((it + 1)->unicode() == '=') {
0391                 m_curpos++;
0392                 token = Parser::Token_MOD_ASSIGN;
0393             } else {
0394                 token = Parser::Token_MOD;
0395             }
0396         } else if (it->unicode() == '/') {
0397             if ((it + 1)->unicode() == '=') {
0398                 m_curpos++;
0399                 token = Parser::Token_DIV_ASSIGN;
0400             } else if ((it + 1)->unicode() == '/') {
0401                 //accept COMMENT inside StringVariableCurly too, as php does
0402                 if ((it + 2)->unicode() == '/') {
0403                     token = Parser::Token_DOC_COMMENT;
0404                 } else {
0405                     token = Parser::Token_COMMENT;
0406                 }
0407                 while (m_curpos < m_contentSize) {
0408                     if (m_curpos + 1 < m_contentSize && it->unicode() == '?' && (it + 1)->unicode() == '>') {
0409                         --it;
0410                         --m_curpos;
0411                         break;
0412                     }
0413                     if ( it->unicode() == '\n' ) {
0414                         createNewline(m_curpos);
0415                         if ( token == Parser::Token_COMMENT ) {
0416                             break;
0417                         } else {
0418                             // lookahead to check whether this doc comment spans multiple lines
0419                             const QChar* it2 = it + 1;
0420                             int pos = m_curpos + 1;
0421                             while ( pos < m_contentSize && (it2)->isSpace() && (it2)->unicode() != '\n' ) {
0422                                 ++it2;
0423                                 ++pos;
0424                             }
0425                             if ( it2->unicode() == '/' && (it2 + 1)->unicode() == '/'
0426                                  && (it2 + 2)->unicode() == '/' ) {
0427                                 // seems to be a multi-line doc-comment
0428                                 it = it2 + 2;
0429                                 m_curpos = pos + 2;
0430                                 continue;
0431                             } else {
0432                                 // not a multi-line doc-comment
0433                                 break;
0434                             }
0435                         }
0436                     }
0437                     it++;
0438                     m_curpos++;
0439                 }
0440             } else if ((it + 1)->unicode() == '*') {
0441                 //accept COMMENT inside StringVariableCurly too, as php does
0442                 if ((it + 2)->unicode() == '*' && (it + 3)->isSpace()) {
0443                     token = Parser::Token_DOC_COMMENT;
0444                 } else {
0445                     token = Parser::Token_COMMENT;
0446                 }
0447                 it += 2;
0448                 m_curpos += 2;
0449                 while (m_curpos < m_contentSize && !(it->unicode() == '*' && (it + 1)->unicode() == '/')) {
0450                     if (it->unicode() == '\n') {
0451                         createNewline(m_curpos);
0452                     }
0453                     it++;
0454                     m_curpos++;
0455                 }
0456                 m_curpos++;
0457             } else {
0458                 token = Parser::Token_DIV;
0459             }
0460         } else if (it->unicode() == '#') {
0461             //accept COMMENT inside StringVariableCurly too, as php does
0462             token = Parser::Token_COMMENT;
0463             while (m_curpos < m_contentSize) {
0464                 if (m_curpos + 1 < m_contentSize && it->unicode() == '?' && (it + 1)->unicode() == '>') {
0465                     --it;
0466                     --m_curpos;
0467                     break;
0468                 }
0469                 if (it->unicode() == '\n') {
0470                     createNewline(m_curpos);
0471                     break;
0472                 }
0473                 it++;
0474                 m_curpos++;
0475             }
0476         } else if (it->unicode() == '^') {
0477             if ((it + 1)->unicode() == '=') {
0478                 m_curpos++;
0479                 token = Parser::Token_XOR_ASSIGN;
0480             } else {
0481                 token = Parser::Token_BIT_XOR;
0482             }
0483         } else if (it->unicode() == '*') {
0484             if ((it + 1)->unicode() == '=') {
0485                 m_curpos++;
0486                 token = Parser::Token_MUL_ASSIGN;
0487             } else if ((it + 1)->unicode() == '*') {
0488                 m_curpos++;
0489                 if ((it + 2)->unicode() == '=') {
0490                     m_curpos++;
0491                     token = Parser::Token_EXP_ASSIGN;
0492                 } else {
0493                     token = Parser::Token_EXP;
0494                 }
0495             } else {
0496                 token = Parser::Token_MUL;
0497             }
0498         } else if (it->unicode() == '|') {
0499             if ((it + 1)->unicode() == '|') {
0500                 m_curpos++;
0501                 token = Parser::Token_BOOLEAN_OR;
0502             } else if ((it + 1)->unicode() == '=') {
0503                 m_curpos++;
0504                 token = Parser::Token_OR_ASSIGN;
0505             } else {
0506                 token = Parser::Token_BIT_OR;
0507             }
0508         } else if (it->unicode() == '&') {
0509             if ((it + 1)->unicode() == '&') {
0510                 m_curpos++;
0511                 token = Parser::Token_BOOLEAN_AND;
0512             } else if ((it + 1)->unicode() == '=') {
0513                 m_curpos++;
0514                 token = Parser::Token_AND_ASSIGN;
0515             } else {
0516                 token = Parser::Token_BIT_AND;
0517             }
0518         } else if (it->unicode() == '+') {
0519             if ((it + 1)->unicode() == '+') {
0520                 m_curpos++;
0521                 token = Parser::Token_INC;
0522             } else if ((it + 1)->unicode() == '=') {
0523                 m_curpos++;
0524                 token = Parser::Token_PLUS_ASSIGN;
0525             } else {
0526                 token = Parser::Token_PLUS;
0527             }
0528         } else if (it->unicode() == '-') {
0529             if ((it + 1)->unicode() == '-') {
0530                 m_curpos++;
0531                 token = Parser::Token_DEC;
0532             } else if ((it + 1)->unicode() == '=') {
0533                 m_curpos++;
0534                 token = Parser::Token_MINUS_ASSIGN;
0535             } else {
0536                 token = Parser::Token_MINUS;
0537             }
0538         } else if (it->unicode() == '.') {
0539             if ((it + 1)->unicode() == '=') {
0540                 m_curpos++;
0541                 token = Parser::Token_CONCAT_ASSIGN;
0542             } else if ((it + 1)->unicode() == '.' && (it + 2)->unicode() == '.') {
0543                  m_curpos = m_curpos + 2;
0544                token = Parser::Token_ELLIPSIS;
0545             } else {
0546                 token = Parser::Token_CONCAT;
0547             }
0548         } else if (it->unicode() == '\\') {
0549             token = Parser::Token_BACKSLASH;
0550         } else if (it->unicode() == ';') {
0551             token = Parser::Token_SEMICOLON;
0552         } else if (it->unicode() == '\'') {
0553             token = Parser::Token_CONSTANT_ENCAPSED_STRING;
0554             it++;
0555             m_curpos++;
0556             int startPos = m_curpos;
0557             while (m_curpos < m_contentSize
0558                     && (it->unicode() != '\'' || isEscapedWithBackslash(it, m_curpos, startPos))) {
0559                 if (it->unicode() == '\n') createNewline(m_curpos);
0560                 it++;
0561                 m_curpos++;
0562             }
0563             // if the string is never terminated, make sure we don't overflow the boundaries
0564             if ( m_curpos == m_contentSize ) {
0565                 --m_curpos;
0566             }
0567         } else if (it->unicode() == '"') {
0568             it++;
0569             m_curpos++;
0570             int stringSize = 0;
0571             bool foundVar = false;
0572             while (m_curpos + stringSize < m_contentSize
0573                     && (it->unicode() != '"' || isEscapedWithBackslash(it, m_curpos + stringSize, m_curpos)))
0574             {
0575                 if (it->unicode() == '$'  && !isEscapedWithBackslash(it, m_curpos + stringSize, m_curpos)
0576                         && ((it + 1)->unicode() == '{'
0577                             || (isValidVariableIdentifier(it + 1) && !(it + 1)->isDigit()))) {
0578                     foundVar = true;
0579                     break;
0580                 }
0581                 it++;
0582                 stringSize++;
0583             }
0584             if (!foundVar) {
0585                 // if the string is never terminated, make sure we don't overflow the boundaries
0586                 if ( m_curpos + stringSize == m_contentSize ) {
0587                     m_curpos--;
0588                 }
0589                 token = Parser::Token_CONSTANT_ENCAPSED_STRING;
0590                 it -= stringSize;
0591                 for (int j = 0; j < stringSize; j++) {
0592                     if (it->unicode() == '\n') {
0593                         createNewline(m_curpos + j);
0594                     }
0595                     it++;
0596                 }
0597                 m_curpos += stringSize;
0598             } else {
0599                 // properly set the token pos to the starting double quote
0600                 m_curpos--;
0601                 token = Parser::Token_DOUBLE_QUOTE;
0602                 pushState(String);
0603             }
0604         } else if (it->unicode() == '`') {
0605             token = Parser::Token_BACKTICK;
0606             pushState(StringBacktick);
0607         } else if (it->unicode() == '=') {
0608             if ((it + 1)->unicode() == '=') {
0609                 m_curpos++;
0610                 if ((it + 2)->unicode() == '=') {
0611                     m_curpos++;
0612                     token = Parser::Token_IS_IDENTICAL;
0613                 } else {
0614                     token = Parser::Token_IS_EQUAL;
0615                 }
0616             } else if ((it + 1)->unicode() == '>') {
0617                 m_curpos++;
0618                 token = Parser::Token_DOUBLE_ARROW;
0619             } else {
0620                 token = Parser::Token_ASSIGN;
0621             }
0622         } else if (isValidVariableIdentifier(it) && !it->isDigit()) {
0623             const int from = m_curpos;
0624             while (m_curpos < m_contentSize && (isValidVariableIdentifier(it))) {
0625                 it++;
0626                 m_curpos++;
0627             }
0628             const QStringRef name = m_content.midRef(from, m_curpos - from);
0629             m_curpos--;
0630             if (name.compare(QLatin1String("echo"), Qt::CaseInsensitive) == 0) {
0631                 token = Parser::Token_ECHO;
0632             } else if (name.compare(QLatin1String("include"), Qt::CaseInsensitive) == 0) {
0633                 token = Parser::Token_INCLUDE;
0634             } else if (name.compare(QLatin1String("include_once"), Qt::CaseInsensitive) == 0) {
0635                 token = Parser::Token_INCLUDE_ONCE;
0636             } else if (name.compare(QLatin1String("require"), Qt::CaseInsensitive) == 0) {
0637                 token = Parser::Token_REQUIRE;
0638             } else if (name.compare(QLatin1String("require_once"), Qt::CaseInsensitive) == 0) {
0639                 token = Parser::Token_REQUIRE_ONCE;
0640             } else if (name.compare(QLatin1String("eval"), Qt::CaseInsensitive) == 0) {
0641                 token = Parser::Token_EVAL;
0642             } else if (name.compare(QLatin1String("print"), Qt::CaseInsensitive) == 0) {
0643                 token = Parser::Token_PRINT;
0644             } else if (name.compare(QLatin1String("abstract"), Qt::CaseInsensitive) == 0) {
0645                 token = Parser::Token_ABSTRACT;
0646             } else if (name.compare(QLatin1String("break"), Qt::CaseInsensitive) == 0) {
0647                 token = Parser::Token_BREAK;
0648             } else if (name.compare(QLatin1String("case"), Qt::CaseInsensitive) == 0) {
0649                 token = Parser::Token_CASE;
0650             } else if (name.compare(QLatin1String("catch"), Qt::CaseInsensitive) == 0) {
0651                 token = Parser::Token_CATCH;
0652             } else if (name.compare(QLatin1String("class"), Qt::CaseInsensitive) == 0) {
0653                 token = Parser::Token_CLASS;
0654             } else if (name.compare(QLatin1String("const"), Qt::CaseInsensitive) == 0) {
0655                 token = Parser::Token_CONST;
0656             } else if (name.compare(QLatin1String("continue"), Qt::CaseInsensitive) == 0) {
0657                 token = Parser::Token_CONTINUE;
0658             } else if (name.compare(QLatin1String("default"), Qt::CaseInsensitive) == 0) {
0659                 token = Parser::Token_DEFAULT;
0660             } else if (name.compare(QLatin1String("do"), Qt::CaseInsensitive) == 0) {
0661                 token = Parser::Token_DO;
0662             } else if (name.compare(QLatin1String("else"), Qt::CaseInsensitive) == 0) {
0663                 token = Parser::Token_ELSE;
0664             } else if (name.compare(QLatin1String("extends"), Qt::CaseInsensitive) == 0) {
0665                 token = Parser::Token_EXTENDS;
0666             } else if (name.compare(QLatin1String("final"), Qt::CaseInsensitive) == 0) {
0667                 token = Parser::Token_FINAL;
0668             } else if (name.compare(QLatin1String("for"), Qt::CaseInsensitive) == 0) {
0669                 token = Parser::Token_FOR;
0670             } else if (name.compare(QLatin1String("if"), Qt::CaseInsensitive) == 0) {
0671                 token = Parser::Token_IF;
0672             } else if (name.compare(QLatin1String("implements"), Qt::CaseInsensitive) == 0) {
0673                 token = Parser::Token_IMPLEMENTS;
0674             } else if (name.compare(QLatin1String("instanceof"), Qt::CaseInsensitive) == 0) {
0675                 token = Parser::Token_INSTANCEOF;
0676             } else if (name.compare(QLatin1String("insteadof"), Qt::CaseInsensitive) == 0) {
0677                 token = Parser::Token_INSTEADOF;
0678             } else if (name.compare(QLatin1String("interface"), Qt::CaseInsensitive) == 0) {
0679                 token = Parser::Token_INTERFACE;
0680             } else if (name.compare(QLatin1String("trait"), Qt::CaseInsensitive) == 0) {
0681                 token = Parser::Token_TRAIT;
0682             } else if (name.compare(QLatin1String("new"), Qt::CaseInsensitive) == 0) {
0683                 token = Parser::Token_NEW;
0684             } else if (name.compare(QLatin1String("private"), Qt::CaseInsensitive) == 0) {
0685                 token = Parser::Token_PRIVATE;
0686             } else if (name.compare(QLatin1String("protected"), Qt::CaseInsensitive) == 0) {
0687                 token = Parser::Token_PROTECTED;
0688             } else if (name.compare(QLatin1String("public"), Qt::CaseInsensitive) == 0) {
0689                 token = Parser::Token_PUBLIC;
0690             } else if (name.compare(QLatin1String("return"), Qt::CaseInsensitive) == 0) {
0691                 token = Parser::Token_RETURN;
0692             } else if (name.compare(QLatin1String("static"), Qt::CaseInsensitive) == 0) {
0693                 const QChar* lookAhead = it;
0694                 int pos = m_curpos;
0695                 while (pos < m_contentSize && lookAhead->isSpace()) {
0696                     ++lookAhead;
0697                     ++pos;
0698                 }
0699                 if (pos + 1 < m_contentSize && lookAhead->unicode() == ':' && (++lookAhead)->unicode() == ':') {
0700                     // PHP 5.3 - late static
0701                     token = Parser::Token_STRING;
0702                 } else {
0703                     token = Parser::Token_STATIC;
0704                 }
0705             } else if (name.compare(QLatin1String("switch"), Qt::CaseInsensitive) == 0) {
0706                 token = Parser::Token_SWITCH;
0707             } else if (name.compare(QLatin1String("throw"), Qt::CaseInsensitive) == 0) {
0708                 token = Parser::Token_THROW;
0709             } else if (name.compare(QLatin1String("try"), Qt::CaseInsensitive) == 0) {
0710                 token = Parser::Token_TRY;
0711             } else if (name.compare(QLatin1String("finally"), Qt::CaseInsensitive) == 0) {
0712                 token = Parser::Token_FINALLY;
0713             } else if (name.compare(QLatin1String("while"), Qt::CaseInsensitive) == 0) {
0714                 token = Parser::Token_WHILE;
0715             } else if (name.compare(QLatin1String("clone"), Qt::CaseInsensitive) == 0) {
0716                 token = Parser::Token_CLONE;
0717             } else if (name.compare(QLatin1String("exit"), Qt::CaseInsensitive) == 0 || name.compare(QLatin1String("die"), Qt::CaseInsensitive) == 0) {
0718                 token = Parser::Token_EXIT;
0719             } else if (name.compare(QLatin1String("elseif"), Qt::CaseInsensitive) == 0) {
0720                 token = Parser::Token_ELSEIF;
0721             } else if (name.compare(QLatin1String("endif"), Qt::CaseInsensitive) == 0) {
0722                 token = Parser::Token_ENDIF;
0723             } else if (name.compare(QLatin1String("endwhile"), Qt::CaseInsensitive) == 0) {
0724                 token = Parser::Token_ENDWHILE;
0725             } else if (name.compare(QLatin1String("endfor"), Qt::CaseInsensitive) == 0) {
0726                 token = Parser::Token_ENDFOR;
0727             } else if (name.compare(QLatin1String("foreach"), Qt::CaseInsensitive) == 0) {
0728                 token = Parser::Token_FOREACH;
0729             } else if (name.compare(QLatin1String("endforeach"), Qt::CaseInsensitive) == 0) {
0730                 token = Parser::Token_ENDFOREACH;
0731             } else if (name.compare(QLatin1String("declare"), Qt::CaseInsensitive) == 0) {
0732                 token = Parser::Token_DECLARE;
0733             } else if (name.compare(QLatin1String("enddeclare"), Qt::CaseInsensitive) == 0) {
0734                 token = Parser::Token_ENDDECLARE;
0735             } else if (name.compare(QLatin1String("as"), Qt::CaseInsensitive) == 0) {
0736                 token = Parser::Token_AS;
0737             } else if (name.compare(QLatin1String("endswitch"), Qt::CaseInsensitive) == 0) {
0738                 token = Parser::Token_ENDSWITCH;
0739             } else if (name.compare(QLatin1String("function"), Qt::CaseInsensitive) == 0) {
0740                 token = Parser::Token_FUNCTION;
0741             } else if (name.compare(QLatin1String("use"), Qt::CaseInsensitive) == 0) {
0742                 token = Parser::Token_USE;
0743             } else if (name.compare(QLatin1String("goto"), Qt::CaseInsensitive) == 0) {
0744                 token = Parser::Token_GOTO;
0745             } else if (name.compare(QLatin1String("global"), Qt::CaseInsensitive) == 0) {
0746                 token = Parser::Token_GLOBAL;
0747             } else if (name.compare(QLatin1String("var"), Qt::CaseInsensitive) == 0) {
0748                 token = Parser::Token_VAR;
0749             } else if (name.compare(QLatin1String("unset"), Qt::CaseInsensitive) == 0) {
0750                 token = Parser::Token_UNSET;
0751             } else if (name.compare(QLatin1String("isset"), Qt::CaseInsensitive) == 0) {
0752                 token = Parser::Token_ISSET;
0753             } else if (name.compare(QLatin1String("empty"), Qt::CaseInsensitive) == 0) {
0754                 token = Parser::Token_EMPTY;
0755             } else if (name.compare(QLatin1String("__halt_compiler"), Qt::CaseInsensitive) == 0) {
0756                 token = Parser::Token_HALT_COMPILER;
0757             } else if (name.compare(QLatin1String("list"), Qt::CaseInsensitive) == 0) {
0758                 token = Parser::Token_LIST;
0759             } else if (name.compare(QLatin1String("array"), Qt::CaseInsensitive) == 0) {
0760                 token = Parser::Token_ARRAY;
0761             } else if (name.compare(QLatin1String("__class__"), Qt::CaseInsensitive) == 0) {
0762                 token = Parser::Token_CLASS_C;
0763             } else if (name.compare(QLatin1String("__trait__"), Qt::CaseInsensitive) == 0) {
0764                 token = Parser::Token_TRAIT_C;
0765             } else if (name.compare(QLatin1String("__method__"), Qt::CaseInsensitive) == 0) {
0766                 token = Parser::Token_METHOD_C;
0767             } else if (name.compare(QLatin1String("__function__"), Qt::CaseInsensitive) == 0) {
0768                 token = Parser::Token_FUNC_C;
0769             } else if (name.compare(QLatin1String("__line__"), Qt::CaseInsensitive) == 0) {
0770                 token = Parser::Token_LINE;
0771             } else if (name.compare(QLatin1String("__file__"), Qt::CaseInsensitive) == 0) {
0772                 token = Parser::Token_FILE;
0773             } else if (name.compare(QLatin1String("__dir__"), Qt::CaseInsensitive) == 0) {
0774                 token = Parser::Token_DIR;
0775             } else if (name.compare(QLatin1String("or"), Qt::CaseInsensitive) == 0) {
0776                 token = Parser::Token_LOGICAL_OR;
0777             } else if (name.compare(QLatin1String("and"), Qt::CaseInsensitive) == 0) {
0778                 token = Parser::Token_LOGICAL_AND;
0779             } else if (name.compare(QLatin1String("xor"), Qt::CaseInsensitive) == 0) {
0780                 token = Parser::Token_LOGICAL_XOR;
0781             } else if (name.compare(QLatin1String("namespace"), Qt::CaseInsensitive) == 0) {
0782                 token = Parser::Token_NAMESPACE;
0783             } else if (name.compare(QLatin1String("__namespace__"), Qt::CaseInsensitive) == 0) {
0784                 token = Parser::Token_NAMESPACE_C;
0785             } else if (name.compare(QLatin1String("callable"), Qt::CaseInsensitive) == 0) {
0786                 token = Parser::Token_CALLABLE;
0787             } else if (name.compare(QLatin1String("void"), Qt::CaseInsensitive) == 0) {
0788                 token = Parser::Token_VOID;
0789             } else if (name.compare(QLatin1String("yield"), Qt::CaseInsensitive) == 0) {
0790                 const QChar* lookAhead = it;
0791                 int pos = m_curpos;
0792                 while (pos < m_contentSize && lookAhead->isSpace()) {
0793                     ++lookAhead;
0794                     ++pos;
0795                 }
0796 
0797                 auto nextToken = QString();
0798                 nextToken += *   lookAhead;
0799                 nextToken += * ++lookAhead;
0800                 nextToken += * ++lookAhead;
0801                 nextToken += * ++lookAhead;
0802                 if (pos + 4 < m_contentSize && nextToken == QStringLiteral("from")) {
0803                     m_curpos = pos + 4;
0804                     token = Parser::Token_YIELD_FROM;
0805                 } else {
0806                     token = Parser::Token_YIELD;
0807                 }
0808             } else {
0809                 token = Parser::Token_STRING;
0810             }
0811         }
0812         break;
0813     }
0814 
0815     case StringVariable:
0816     case String:
0817     case StringHeredoc:
0818     case StringBacktick:
0819         if ((state() == String || state(1) == String) && it->unicode() == '"') {
0820             token = Parser::Token_DOUBLE_QUOTE;
0821             if (state() == StringVariable) popState();
0822             popState();
0823         } else if ((state() == StringBacktick || state(1) == StringBacktick) && it->unicode() == '`') {
0824             token = Parser::Token_BACKTICK;
0825             if (state() == StringVariable) popState();
0826             popState();
0827         } else if ((state() == StringHeredoc || state(1) == StringHeredoc) && isHereNowDocEnd(it)) {
0828             token = Parser::Token_END_HEREDOC;
0829             m_curpos += m_hereNowDocIdentifier.length() - 1;
0830             if (state() == StringVariable) popState();
0831             popState();
0832         } else if (processVariable(it)) {
0833             token = Parser::Token_VARIABLE;
0834             if (state() != StringVariable) pushState(StringVariable);
0835         } else if (state() != StringVariable  && it->unicode() == '$' && (it + 1)->unicode() == '{') {
0836             token = Parser::Token_DOLLAR_OPEN_CURLY_BRACES;
0837             m_curpos++;
0838             it += 2;
0839             //check if a valid variable follows
0840             if ((isValidVariableIdentifier(it) && !it->isDigit())) {
0841                 pushState(StringVarname);
0842             }
0843 
0844         } else if (state() == StringVariable && it->unicode() == '[') {
0845             token = Parser::Token_LBRACKET;
0846             pushState(StringVariableBracket);
0847         } else if (state() != StringVariable && it->unicode() == '{' && (it + 1)->unicode() == '$'
0848                    && ((isValidVariableIdentifier(it + 2) && !(it + 2)->isDigit()) || (it + 2)->unicode() == '{')) {
0849             token = Parser::Token_CURLY_OPEN;
0850             pushState(StringVariableCurly);
0851         } else if (state() == StringVariable
0852                    && it->unicode() == '-' && (it + 1)->unicode() == '>'
0853                    && isValidVariableIdentifier(it + 2) && !(it + 2)->isDigit()) {
0854             token = Parser::Token_OBJECT_OPERATOR;
0855             m_curpos++;
0856             pushState(StringVariableObjectOperator);
0857         } else {
0858             if (state() == StringVariable) popState();
0859             token = Parser::Token_ENCAPSED_AND_WHITESPACE;
0860             int startPos = m_curpos;
0861             while (m_curpos < m_contentSize) {
0862                 if (!isEscapedWithBackslash(it, m_curpos, startPos) &&
0863                         ((it->unicode() == '$' && (it + 1)->unicode() == '{') ||
0864                          (it->unicode() == '{' && (it + 1)->unicode() == '$' && isValidVariableIdentifier(it + 2)) ||
0865                          (it->unicode() == '$' && isValidVariableIdentifier(it + 1) && !(it + 1)->isDigit()))) {
0866                     //variable is next ${var} or {$var}
0867                     break;
0868                 }
0869                 if (state() == String && it->unicode() == '"'
0870                         && !isEscapedWithBackslash(it, m_curpos, startPos)) {
0871                     //end of string
0872                     break;
0873                 }
0874                 if (state() == StringBacktick && it->unicode() == '`'
0875                         && !isEscapedWithBackslash(it, m_curpos, startPos)) {
0876                     //end of string
0877                     break;
0878                 }
0879 
0880                 if (it->unicode() == '\n') createNewline(m_curpos);
0881                 m_curpos++;
0882                 it++;
0883 
0884                 if (state() == StringHeredoc && (it - 1)->unicode() == '\n') {
0885                     //check for end of heredoc (\nEOD;\n)
0886                     if (state() == StringHeredoc && isHereNowDocEnd(it)) {
0887                         break;
0888                     }
0889                 }
0890             }
0891             m_curpos--;
0892         }
0893         break;
0894     case StringNowdoc:
0895         if (isHereNowDocEnd(it)) {
0896             token = Parser::Token_END_NOWDOC;
0897             m_curpos += m_hereNowDocIdentifier.length() - 1;
0898             popState();
0899         } else {
0900             token = Parser::Token_STRING;
0901             while (m_curpos < m_contentSize) {
0902                 if (it->unicode() == '\n') createNewline(m_curpos);
0903                 m_curpos++;
0904                 it++;
0905 
0906                 if ((it - 1)->unicode() == '\n' && isHereNowDocEnd(it)) {
0907                     //check for end of nowdoc (\nEOD;\n)
0908                     break;
0909                 }
0910             }
0911             m_curpos--;
0912         }
0913         break;
0914     case StringVariableBracket:
0915         if (it->unicode() == ']') {
0916             token = Parser::Token_RBRACKET;
0917             popState();
0918             popState();
0919         } else if (it->isDigit()) {
0920             token = Parser::Token_NUM_STRING;
0921             while (m_curpos < m_contentSize && it->isDigit()) {
0922                 it++;
0923                 m_curpos++;
0924             }
0925             m_curpos--;
0926         } else {
0927             token = Parser::Token_STRING;
0928             while (m_curpos < m_contentSize && (it->unicode() != ']')) {
0929                 if (it->unicode() == '\n') createNewline(m_curpos);
0930                 it++;
0931                 m_curpos++;
0932             }
0933             m_curpos--;
0934         }
0935         break;
0936     case StringVariableObjectOperator:
0937         token = Parser::Token_STRING;
0938         while (m_curpos < m_contentSize && isValidVariableIdentifier(it)) {
0939             it++;
0940             m_curpos++;
0941         }
0942         m_curpos--;
0943         popState();
0944         if (state() == StringVariable) popState();
0945         break;
0946     case StringVarname:
0947         popState();
0948         pushState(StringVariableCurly);
0949         token = Parser::Token_STRING_VARNAME;
0950         while (m_curpos < m_contentSize && isValidVariableIdentifier(it)) {
0951             it++;
0952             m_curpos++;
0953         }
0954         m_curpos--;
0955         break;
0956     default:
0957         token = Parser::Token_INVALID;
0958         break;
0959     }
0960     if (m_curpos > m_contentSize) {
0961         m_tokenBegin = -1;
0962         m_tokenEnd = -1;
0963         return 0;
0964     }
0965     m_tokenEnd = m_curpos;
0966     m_curpos++;
0967 
0968     if (m_haltCompiler) {
0969         //look for __halt_compiler(); and stop lexer there
0970         if (m_haltCompiler == 4) {
0971             token = 0; //EOF
0972         } else if (token == Parser::Token_WHITESPACE || token == Parser::Token_COMMENT || token == Parser::Token_DOC_COMMENT) {
0973             //ignore
0974         } else if (m_haltCompiler == 1 && token == Parser::Token_LPAREN) {
0975             m_haltCompiler++;
0976         } else if (m_haltCompiler == 2 && token == Parser::Token_RPAREN) {
0977             m_haltCompiler++;
0978         } else if (m_haltCompiler == 3 && token == Parser::Token_SEMICOLON) {
0979             m_haltCompiler++;
0980         } else {
0981             m_haltCompiler = 0;
0982         }
0983     }
0984     if (token == Parser::Token_HALT_COMPILER && !m_haltCompiler) {
0985         m_haltCompiler = 1;
0986     }
0987     return token;
0988 }
0989 
0990 qint64 Lexer::tokenBegin() const
0991 {
0992     return m_tokenBegin;
0993 }
0994 
0995 qint64 Lexer::tokenEnd() const
0996 {
0997     return m_tokenEnd;
0998 }
0999 
1000 bool Lexer::isHereNowDocEnd(const QChar* it)
1001 {
1002     int identiferLen = m_hereNowDocIdentifier.length();
1003     QString lineStart;
1004     for (int i = 0; i < identiferLen; i++) {
1005         if (m_curpos + i >= m_contentSize) break;
1006         lineStart.append(*(it + i));
1007     }
1008     if (lineStart == m_hereNowDocIdentifier &&
1009             ((it + identiferLen)->unicode() == '\n'
1010              || ((it + identiferLen)->unicode() == ';' &&
1011                  (it + identiferLen + 1)->unicode() == '\n'))) {
1012         return true;
1013     }
1014     return false;
1015 }
1016 
1017 //used for strings, to check if " is escaped (\" is, \\" not)
1018 bool Lexer::isEscapedWithBackslash(const QChar* it, int curPos, int startPos)
1019 {
1020     int cnt = 0;
1021     it--;
1022     while (curPos > startPos && it->unicode() == '\\') {
1023         cnt++;
1024         it--;
1025     }
1026     return (cnt % 2) == 1;
1027 }
1028 
1029 bool Lexer::processVariable(const QChar* it)
1030 {
1031     const QChar* c2 = it + 1;
1032     if (it->unicode() == '$' && (isValidVariableIdentifier(c2) && !c2->isDigit())) {
1033         it++;
1034         m_curpos++;
1035         while (m_curpos < m_contentSize
1036                 && (isValidVariableIdentifier(it))) {
1037             it++;
1038             m_curpos++;
1039         }
1040         m_curpos--;
1041         return true;
1042     } else {
1043         return false;
1044     }
1045 }
1046 bool Lexer::isValidVariableIdentifier(const QChar* it)
1047 {
1048     return it->isLetter() || it->isDigit() || it->unicode() == '_' || it->unicode() > 0x7f;
1049 }
1050 
1051 void Lexer::createNewline(int pos)
1052 {
1053     if (m_tokenStream) m_tokenStream->locationTable()->newline(pos);
1054 }
1055 
1056 }
1057