src/html/htmltokenizer.cpp

0001 /*
0002     This file is part of the KDE libraries
0003
0004     Copyright (C) 1997 Martin Jones (mjones@kde.org)
0005               (C) 1997 Torben Weis (weis@kde.org)
0006               (C) 1998 Waldo Bastian (bastian@kde.org)
0007               (C) 1999 Lars Knoll (knoll@kde.org)
0008               (C) 1999 Antti Koivisto (koivisto@kde.org)
0009               (C) 2001-2003 Dirk Mueller (mueller@kde.org)
0010               (C) 2004-2008 Apple Computer, Inc.
0011               (C) 2006-2008 Germain Garand (germain@ebooksfrance.org)
0012
0013     This library is free software; you can redistribute it and/or
0014     modify it under the terms of the GNU Library General Public
0015     License as published by the Free Software Foundation; either
0016     version 2 of the License, or (at your option) any later version.
0017
0018     This library is distributed in the hope that it will be useful,
0019     but WITHOUT ANY WARRANTY; without even the implied warranty of
0020     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0021     Library General Public License for more details.
0022
0023     You should have received a copy of the GNU Library General Public License
0024     along with this library; see the file COPYING.LIB.  If not, write to
0025     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0026     Boston, MA 02110-1301, USA.
0027 */
0028 //----------------------------------------------------------------------------
0029 //
0030 // KDE HTML Widget - Tokenizers
0031
0032 // #define TOKEN_DEBUG 1
0033 // #define TOKEN_DEBUG 2
0034
0035 #include "htmltokenizer.h"
0036 #include "html_documentimpl.h"
0037 #include "htmlparser.h"
0038 #include "dtd.h"
0039
0040 #include <misc/loader.h>
0041
0042 #include <khtmlview.h>
0043 #include <khtml_part.h>
0044 #include <xml/dom_docimpl.h>
0045 #include <ecma/kjs_proxy.h>
0046 #include <kcharsets.h>
0047 #include <ctype.h>
0048 #include <assert.h>
0049 #include <QVariant>
0050 #include "khtml_debug.h"
0051 #include <stdlib.h>
0052
0053 #include "kentities_p.h"
0054 #include "htmlprospectivetokenizer.h"
0055
0056 #define PROSPECTIVE_TOKENIZER_ENABLED 1
0057
0058 using namespace khtml;
0059
0060 static const QChar commentStart [] = { '<', '!', '-', '-', QChar::Null };
0061 static const char doctypeStart [] = "<!doctype";
0062 static const char publicStart [] = "public";
0063 static const char systemStart [] = "system";
0064
0065 static const char scriptEnd [] = "</script";
0066 static const char xmpEnd [] = "</xmp";
0067 static const char styleEnd [] =  "</style";
0068 static const char textareaEnd [] = "</textarea";
0069 static const char titleEnd [] = "</title";
0070
// Tokenizer tuning knobs. Builds with assertions enabled (NDEBUG undefined)
// use a smaller chunk size and longer yield delays than release builds.
// NOTE(review): the delays are presumably in milliseconds - confirm at the
// point where m_tokenizerYieldDelay is consumed.
#ifdef NDEBUG
static constexpr int sTokenizerChunkSize = 4096;
static constexpr int sTokenizerFastYieldDelay = 180;
static constexpr int sTokenizerYieldDelay = 450;
#else
static constexpr int sTokenizerChunkSize = 2048;
static constexpr int sTokenizerFastYieldDelay = 220;
static constexpr int sTokenizerYieldDelay = 650;
#endif
0080
// malloc()-family helpers for the tokenizer's QChar buffers. The memory is
// obtained with malloc/realloc, so no QChar constructor is run on it, and it
// must be released with KHTML_DELETE_QCHAR_VEC (i.e. free()), never delete[].
// Both allocating macros yield a null pointer when the allocation fails.
//
// Each expansion is wrapped in parentheses so that the macro behaves like a
// single primary expression wherever it is used (e.g. when it is directly
// subscripted or dereferenced).
#define KHTML_ALLOC_QCHAR_VEC( N ) (static_cast<QChar *>(malloc(sizeof(QChar) * (N))))
#define KHTML_REALLOC_QCHAR_VEC(P, N ) (static_cast<QChar *>(realloc((P), sizeof(QChar) * (N))))
#define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
0084
// fixUpChar(x): remap the Microsoft Windows extensions to Latin-1.
//
// When x.unicode() is one of the code points listed below (all within
// 0x80-0x9F), x is overwritten with the replacement code point; any other
// value is left untouched.
//
// Strictly speaking these extensions should only be honoured for pages
// labelled "windows-1252" or "cp1252", but in practice they show up on
// hundreds of thousands of pages regardless of the declared charset, so the
// mapping is applied unconditionally. Pages that rely on non-Latin-1
// Microsoft extensions are not helped by this table.
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
//      http://www.bbsinc.com/iso8859.html
//      http://www.obviously.com/
//
// There may be better equivalents for some of these.
//
// Flip the "#if 0" below to compile the remapping out entirely.
#if 0
#define fixUpChar(x)
#else
#define fixUpChar(x) \
    switch ((x).unicode()) { \
    case 0x80: (x) = 0x20AC; break; \
    case 0x82: (x) = 0x201A; break; \
    case 0x83: (x) = 0x0192; break; \
    case 0x84: (x) = 0x201E; break; \
    case 0x85: (x) = 0x2026; break; \
    case 0x86: (x) = 0x2020; break; \
    case 0x87: (x) = 0x2021; break; \
    case 0x88: (x) = 0x02C6; break; \
    case 0x89: (x) = 0x2030; break; \
    case 0x8A: (x) = 0x0160; break; \
    case 0x8B: (x) = 0x2039; break; \
    case 0x8C: (x) = 0x0152; break; \
    case 0x8E: (x) = 0x017D; break; \
    case 0x91: (x) = 0x2018; break; \
    case 0x92: (x) = 0x2019; break; \
    case 0x93: (x) = 0x201C; break; \
    case 0x94: (x) = 0x201D; break; \
    case 0x95: (x) = 0x2022; break; \
    case 0x96: (x) = 0x2013; break; \
    case 0x97: (x) = 0x2014; break; \
    case 0x98: (x) = 0x02DC; break; \
    case 0x99: (x) = 0x2122; break; \
    case 0x9A: (x) = 0x0161; break; \
    case 0x9B: (x) = 0x203A; break; \
    case 0x9C: (x) = 0x0153; break; \
    case 0x9E: (x) = 0x017E; break; \
    case 0x9F: (x) = 0x0178; break; \
    default: break; \
    }
#endif
0133 // ----------------------------------------------------------------------------
0134
0135 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view)
0136 {
0137     view = _view;
0138     buffer = nullptr;
0139     rawContent = nullptr;
0140     rawContentSize = rawContentMaxSize = rawContentResync = rawContentSinceLastEntity = 0;
0141     charsets = KCharsets::charsets();
0142     parser = new KHTMLParser(_view, _doc);
0143     m_executingScript = 0;
0144     m_externalScriptsTimerId = 0;
0145     m_tokenizerYieldDelay = sTokenizerFastYieldDelay;
0146     m_yieldTimer = 0;
0147     m_prospectiveTokenizer = nullptr;
0148     onHold = false;
0149     m_documentTokenizer = true;
0150     m_hasScriptsWaitingForStylesheets = false;
0151
0152     reset();
0153 }
0154
0155 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, DOM::DocumentFragmentImpl *i)
0156 {
0157     view = nullptr;
0158     buffer = nullptr;
0159     rawContent = nullptr;
0160     rawContentSize = rawContentMaxSize = rawContentResync = rawContentSinceLastEntity = 0;
0161     charsets = KCharsets::charsets();
0162     parser = new KHTMLParser(i, _doc);
0163     m_executingScript = 0;
0164     m_externalScriptsTimerId = 0;
0165     m_tokenizerYieldDelay = sTokenizerFastYieldDelay;
0166     m_yieldTimer = 0;
0167     m_prospectiveTokenizer = nullptr;
0168     onHold = false;
0169     m_documentTokenizer = false;
0170     m_hasScriptsWaitingForStylesheets = false;
0171
0172     reset();
0173 }
0174
0175 void HTMLTokenizer::setNormalYieldDelay()
0176 {
0177     m_tokenizerYieldDelay = sTokenizerYieldDelay;
0178 }
0179
0180 void HTMLTokenizer::reset()
0181 {
0182     assert(m_executingScript == 0);
0183     Q_ASSERT(onHold == false);
0184     m_abort = false;
0185
0186     while (!cachedScript.isEmpty()) {
0187         cachedScript.dequeue()->deref(this);
0188     }
0189
0190     if (buffer) {
0191         KHTML_DELETE_QCHAR_VEC(buffer);
0192     }
0193     buffer = dest = nullptr;
0194     size = 0;
0195
0196     if (rawContent) {
0197         KHTML_DELETE_QCHAR_VEC(rawContent);
0198     }
0199     rawContent = nullptr;
0200     rawContentSize = rawContentMaxSize = rawContentResync = 0;
0201
0202     if (m_yieldTimer > 0) {
0203         killTimer(m_yieldTimer);
0204         m_yieldTimer = 0;
0205     }
0206
0207     if (m_externalScriptsTimerId > 0) {
0208         killTimer(m_externalScriptsTimerId);
0209         m_externalScriptsTimerId = 0;
0210     }
0211     currToken.reset();
0212     doctypeToken.reset();
0213     javascript = false;
0214 }
0215
0216 void HTMLTokenizer::begin()
0217 {
0218     m_executingScript = 0;
0219     onHold = false;
0220     reset();
0221     size = 254;
0222     buffer = KHTML_ALLOC_QCHAR_VEC(255);
0223     dest = buffer;
0224     tag = NoTag;
0225     pending = NonePending;
0226     discard = NoneDiscard;
0227     pre = false;
0228     prePos = 0;
0229     plaintext = false;
0230     xmp = false;
0231     processingInstruction = false;
0232     script = false;
0233     escaped = false;
0234     style = false;
0235     skipLF = false;
0236     select = false;
0237     comment = false;
0238     doctype = false;
0239     doctypeComment = NoDoctypeComment;
0240     doctypeAllowComment = false;
0241     server = false;
0242     textarea = false;
0243     title = false;
0244     startTag = false;
0245     tquote = NoQuote;
0246     searchCount = 0;
0247     doctypeSearchCount = 0;
0248     doctypeSecondarySearchCount = 0;
0249     Entity = NoEntity;
0250     noMoreData = false;
0251     brokenComments = false;
0252     brokenServer = false;
0253     lineno = 0;
0254     scriptStartLineno = 0;
0255     tagStartLineno = 0;
0256 }
0257
0258 void HTMLTokenizer::processListing(TokenizerString list)
0259 {
0260     bool old_pre = pre;
0261
0262     // This function adds the listing 'list' as
0263     // preformatted text-tokens to the token-collection
0264     // thereby converting TABs.
0265     if (!style) {
0266         pre = true;
0267     }
0268     prePos = 0;
0269
0270     while (!list.isEmpty()) {
0271         checkBuffer(3 * TAB_SIZE);
0272
0273         if (skipLF && (list->unicode() != '\n')) {
0274             skipLF = false;
0275         }
0276
0277         if (skipLF) {
0278             skipLF = false;
0279             ++list;
0280         } else if ((list->unicode() == '\n') || (list->unicode() == '\r')) {
0281             if (discard == LFDiscard) {
0282                 // Ignore this LF
0283                 discard = NoneDiscard; // We have discarded 1 LF
0284             } else {
0285                 // Process this LF
0286                 if (pending) {
0287                     addPending();
0288                 }
0289
0290                 // we used to do it not at all and we want to have
0291                 // it fixed for textarea. So here we are
0292                 if (textarea) {
0293                     prePos++;
0294                     *dest++ = *list;
0295                 } else {
0296                     pending = LFPending;
0297                 }
0298             }
0299             /* Check for MS-DOS CRLF sequence */
0300             if (list->unicode() == '\r') {
0301                 skipLF = true;
0302             }
0303             ++list;
0304         } else if ((list->unicode() == ' ') || (list->unicode() == '\t')) {
0305             if (pending) {
0306                 addPending();
0307             }
0308             if (*list == ' ') {
0309                 pending = SpacePending;
0310             } else {
0311                 pending = TabPending;
0312             }
0313
0314             ++list;
0315         } else {
0316             discard = NoneDiscard;
0317             if (pending) {
0318                 addPending();
0319             }
0320
0321             prePos++;
0322             *dest++ = *list;
0323             ++list;
0324         }
0325
0326     }
0327
0328     if ((pending == SpacePending) || (pending == TabPending)) {
0329         addPending();
0330     } else {
0331         pending = NonePending;
0332     }
0333
0334     prePos = 0;
0335     pre = old_pre;
0336 }
0337
// Tokenizes the content of a script, style, xmp, textarea or title element.
//
// Characters are accumulated in rawContent until the closing tag held in
// searchStopper (searchStopperLen characters) is found. The content collected
// so far is then emitted - through scriptHandler() for scripts, through
// processListing()/processToken() otherwise - followed by the close-tag token.
// The function returns either when the closing tag has been handled or when
// 'src' runs out; in the latter case the accumulated state is kept in the
// members so a later call can continue.
void HTMLTokenizer::parseRawContent(TokenizerString &src)
{
    // The 'raw content' mode is a very lax tokenizing mode
    // that will absorb anything but the exact closing tag
    // that made us enter this mode, *except* if it is inside a comment.
    //
    // Any other tag or comment will be passed verbatim to the parser as part
    // of the content. It is used for script, style, and a few others.
    //
    assert(textarea || title || !Entity);
    assert(!tag);
    // Exactly one raw-content mode may be active.
    assert(xmp + textarea + title + style + script == 1);
    if (script) {
        scriptStartLineno = lineno + src.lineCount();
    }

    // A comment was still open when the previous chunk ended: finish it first.
    if (comment) {
        parseComment(src);
    }

    while (!src.isEmpty()) {
        checkRawContentBuffer();
        unsigned char ch = src->toLatin1();
        // Start of a comment: the buffer ends in "<!-" and the current
        // character is the second '-'. Not recognised inside xmp, in
        // broken-comments mode or while resyncing on a closing tag. For
        // textarea/title the last three characters must have been appended
        // after the most recent entity (rawContentSinceLastEntity >= 3).
        if (!rawContentResync && !brokenComments && !xmp && ch == '-' &&
                rawContentSize >= 3 && ((!textarea && !title) || rawContentSinceLastEntity >= 3) && !src.escaped() &&
                QString::fromRawData(rawContent + rawContentSize - 3, 3) == "<!-") {
            comment = true;
            rawContent[ rawContentSize++ ] = ch;
            ++src;
            parseComment(src);
            continue;
        }
        // Resync state and an unquoted '>': the closing tag is complete.
        if (rawContentResync && !tquote && (ch == '>')) {
            ++src;
            // Cut the buffer back to where the closing tag started
            // (rawContentResync holds that index plus one).
            rawContentSize = rawContentResync - 1;
            rawContentResync = 0;
            rawContent[ rawContentSize ] = rawContent[ rawContentSize + 1 ] = 0;
            if (script) {
                scriptHandler();
            } else {
                // Emit the collected text, then the matching close tag.
                processListing(TokenizerString(rawContent, rawContentSize));
                processToken();
                if (style)         {
                    currToken.tid = ID_STYLE + ID_CLOSE_TAG;
                } else if (textarea) {
                    currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG;
                } else if (title) {
                    currToken.tid = ID_TITLE + ID_CLOSE_TAG;
                } else if (xmp)  {
                    currToken.tid = ID_XMP + ID_CLOSE_TAG;
                }
                processToken();
                // Leave raw-content mode.
                script = style = textarea = title = xmp = false;
                tquote = NoQuote;
                rawContentSize = rawContentResync = 0;
            }
            return;
        }
        // possible end of tagname, lets check.
        // The current character must be '>', '/' or a non-NUL character <= ' ',
        // and the buffer must end (case-insensitively) in searchStopper.
        if (!rawContentResync && !escaped && !src.escaped() && (ch == '>' || ch == '/' || ch <= ' ') && ch &&
                rawContentSize >= searchStopperLen && ((!textarea && !title) || rawContentSinceLastEntity >= searchStopperLen) &&
                QString::compare(QString::fromRawData(rawContent + rawContentSize - searchStopperLen, searchStopperLen),
                                 QLatin1String(searchStopper), Qt::CaseInsensitive) == 0) {

            // the purpose of rawContentResync is to look for an end tag that could possibly be of the form:
            // </endtag   junk="more junk>\"><>"     >
            // IOW, once the '</endtag' sequence has been found, the rest of the tag must still be validated,
            // so this micro-tokenizer switches to rawContentResync state until '>' is finally found.
            rawContentResync = rawContentSize - searchStopperLen + 1;
            tquote = NoQuote;
            // 'src' is not advanced: the same character is examined again in
            // resync state on the next iteration.
            continue;
        }
        // Inside the closing tag: track quoting so that a quoted '>' does not
        // end the tag. A quote character toggles its own quote state and is
        // ignored inside the other kind of quote; a line break ends any quote.
        if (rawContentResync && !escaped) {
            if (ch == '\"') {
                tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
            } else if (ch == '\'') {
                tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
            } else if (tquote != NoQuote && (ch == '\r' || ch == '\n')) {
                tquote = NoQuote;
            }
        }
        // A backslash escapes the following character, unless the backslash
        // is itself escaped.
        escaped = (!escaped && ch == '\\');
        if (!rawContentResync && (textarea || title) && !src.escaped() && ch == '&') {
            // textarea and title content may contain entities: parseEntity()
            // writes through rawContentDest, and the new buffer length is
            // derived from where it left that pointer.
            QChar *rawContentDest = rawContent + rawContentSize;
            ++src;
            parseEntity(src, rawContentDest, true);
            rawContentSize = rawContentDest - rawContent;
        } else {
            // Plain character: append it verbatim.
            rawContent[ rawContentSize++ ] = *src;
            ++src;
            ++rawContentSinceLastEntity;
        }
    }
}
0432
// Called from parseRawContent() once the end of a script element has been
// reached. Emits the script text and the </script> token, then either
// requests the external script named by scriptSrc or executes the inline
// script, and finally puts the input that was set aside in pendingQueue back
// where it belongs.
void HTMLTokenizer::scriptHandler()
{
    // Take over the src attribute value and clear the member for the next script.
    QString currentScriptSrc = scriptSrc;
    scriptSrc.clear();

    // Run the collected script text through the listing code; the resulting
    // characters between 'buffer' and 'dest' are the inline script source.
    processListing(TokenizerString(rawContent, rawContentSize));
    QString exScript(buffer, dest - buffer);

    // Emit the current (text) token, then the closing script tag.
    processToken();
    currToken.tid = ID_SCRIPT + ID_CLOSE_TAG;
    processToken();

    // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts.
    bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET);
    bool effectiveScript = !parser->skipMode() && !followingFrameset;
    bool deferredScript = false;

    if (effectiveScript) {
        CachedScript *cs = nullptr;

        // forget what we just got, load from src url instead
        if (!currentScriptSrc.isEmpty() && javascript) {
            const QString completeScriptUrl = parser->doc()->completeURL(currentScriptSrc);
            cs = parser->doc()->docLoader()->requestScript(completeScriptUrl, scriptSrcCharset);
        }

        if (cs) {
            // External script: queue it and set the remaining input aside.
            cachedScript.enqueue(cs);
            pendingQueue.push(src);
            int scriptCount = cachedScript.count();
            setSrc(TokenizerString());
            rawContentSize = rawContentResync = 0;
            cs->ref(this);
            // The queue length is unchanged after ref(), i.e. the script was
            // not taken off the queue during that call.
            // NOTE(review): presumably ref() can notify us synchronously when
            // the script is already available - confirm.
            if (cachedScript.count() == scriptCount) {
                deferredScript = true;
            }
        } else if (currentScriptSrc.isNull()/*no src attribute*/ && view && javascript) {
            // Inline script: set the remaining input aside while it runs.
            pendingQueue.push(src);
            setSrc(TokenizerString());
            rawContentSize = rawContentResync = 0;
            scriptExecution(exScript, QString(), tagStartLineno /*scriptStartLineno*/);
        } else {
            // script was filtered or disallowed
            effectiveScript = false;
        }
    }

    // Leave script mode and drop the raw content.
    script = false;
    rawContentSize = rawContentResync = 0;

    // Nothing was pushed on pendingQueue in this case.
    if (!effectiveScript) {
        return;
    }

    if (!m_executingScript && cachedScript.isEmpty()) {
        // No script running and none outstanding: resume with the saved input.
        src.append(pendingQueue.pop());
    } else if (cachedScript.isEmpty()) {
        // Still inside a script execution: feed the saved input through write().
        write(pendingQueue.pop(), false);
    } else if (!deferredScript && pendingQueue.count() > 1) {
        // Merge the entry pushed above into the one below it.
        TokenizerString t = pendingQueue.pop();
        pendingQueue.top().prepend(t);
    }
#if PROSPECTIVE_TOKENIZER_ENABLED
    // External scripts are outstanding and no script is running: hand the
    // pending input to the prospective tokenizer, creating it on first use.
    if (!cachedScript.isEmpty() && !m_executingScript) {
        if (!m_prospectiveTokenizer) {
            m_prospectiveTokenizer = new ProspectiveTokenizer(parser->docPtr());
        }
        if (!m_prospectiveTokenizer->inProgress() && !pendingQueue.isEmpty()) {
            m_prospectiveTokenizer->begin();
            m_prospectiveTokenizer->write(pendingQueue.top());
        }
    }
#endif

}
0508
0509 void HTMLTokenizer::scriptExecution(const QString &str, const QString &scriptURL,
0510                                     int baseLine)
0511 {
0512     bool oldscript = script;
0513     m_executingScript++;
0514     script = false;
0515     QString url;
0516     if (scriptURL.isNull() && view) {
0517         url = static_cast<DocumentImpl *>(view->part()->document().handle())->URL().url();
0518     } else {
0519         url = scriptURL;
0520     }
0521
0522     if (view) {
0523         view->part()->executeScript(url, baseLine, Node(), str);
0524     }
0525     m_executingScript--;
0526     script = oldscript;
0527 }
0528
0529 void HTMLTokenizer::parseComment(TokenizerString &src)
0530 {
0531     checkRawContentBuffer(src.length());
0532     while (src.length()) {
0533         rawContent[ rawContentSize++ ] = *src;
0534
0535 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
0536         qDebug("comment is now: *%s*", src.toString().left(16).toLatin1().constData());
0537 #endif
0538
0539         if (src->unicode() == '>') {
0540             bool handleBrokenComments =  brokenComments && !(script || style);
0541             bool scriptEnd = false;
0542             if (rawContentSize > 2 && rawContent[rawContentSize - 3] == '-' &&
0543                     rawContent[rawContentSize - 2] == '-') {
0544                 scriptEnd = true;
0545             }
0546
0547             if (handleBrokenComments || scriptEnd) {
0548                 ++src;
0549                 if (!(title || script || xmp || textarea || style)) {
0550                     checkRawContentBuffer();
0551                     rawContent[ rawContentSize ] = 0;
0552                     rawContent[ rawContentSize + 1 ] = 0;
0553                     currToken.tid = ID_COMMENT;
0554                     int size = scriptEnd ? rawContentSize - 3 : rawContentSize - 1;
0555                     processListing(TokenizerString(rawContent, size));
0556                     processToken();
0557                     currToken.tid = ID_COMMENT + ID_CLOSE_TAG;
0558                     processToken();
0559                     rawContentSize = 0;
0560                 }
0561                 comment = false;
0562                 return; // Finished parsing comment
0563             }
0564         }
0565         ++src;
0566     }
0567 }
0568
0569 void HTMLTokenizer::parseDoctypeComment(TokenizerString &src)
0570 {
0571     while (!src.isEmpty()) {
0572         QChar c = *src;
0573         switch (doctypeComment) {
0574         case DoctypeCommentHalfBegin: {
0575             if (c != '-') {
0576                 // Ooops, it's not comment
0577                 doctypeComment = DoctypeCommentBogus;
0578                 return;
0579             } else {
0580                 // Doctype comment begins
0581                 doctypeComment = DoctypeComment;
0582                 ++src;
0583             }
0584             break;
0585         }
0586         case DoctypeComment: {
0587             if (c == '-') {
0588                 // Perhaps this is end of comment
0589                 doctypeComment = DoctypeCommentHalfEnd;
0590                 ++src;
0591             } else {
0592                 // Keep scanning for '--'
0593                 ++src;
0594             }
0595             break;
0596         }
0597         case DoctypeCommentHalfEnd: {
0598             if (c == '-') {
0599                 // Doctype comment ends
0600                 doctypeComment = DoctypeCommentEnd;
0601                 return;
0602             } else {
0603                 // It's not '--'
0604                 ++src;
0605                 doctypeComment = DoctypeComment;
0606             }
0607             break;
0608         }
0609         default: {
0610             assert(!"Undefined doctype comment state");
0611             break;
0612         }
0613         }
0614     }
0615 }
0616
0617 void HTMLTokenizer::parseDoctype(TokenizerString &src)
0618 {
0619     while (!src.isEmpty() && doctype) {
0620         QChar c;
0621         bool isWhitespace = false;
0622         int dontAdvance = 0;
0623         if (doctypeComment == DoctypeCommentEnd) {
0624             doctypeComment = NoDoctypeComment;
0625             isWhitespace = true;
0626         } else if (doctypeComment == DoctypeCommentBogus) {
0627             doctypeComment = NoDoctypeComment;
0628             c = '-';
0629             dontAdvance++;
0630         } else {
0631             c = *src;
0632             if (doctypeAllowComment) {
0633                 if (!doctypeComment && c == '-') {
0634                     doctypeComment = DoctypeCommentHalfBegin;
0635                     ++src;
0636                 }
0637                 if (doctypeComment) {
0638                     parseDoctypeComment(src);
0639                     continue;
0640                 }
0641                 isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
0642             }
0643         }
0644
0645         switch (doctypeToken.state) {
0646         case DoctypeBegin: {
0647             doctypeToken.state = DoctypeBeforeName;
0648             if (isWhitespace) {
0649                 // nothing
0650             }
0651             break;
0652         }
0653         case DoctypeBeforeName: {
0654             if (c == '>') {
0655                 // Malformed. Just exit.
0656                 doctype = false;
0657             } else if (isWhitespace) {
0658                 // nothing
0659             } else {
0660                 dontAdvance++;
0661                 doctypeToken.state = DoctypeName;
0662             }
0663             break;
0664         }
0665         case DoctypeName: {
0666             if (c == '>') {
0667                 // Valid doctype. Emit it.
0668                 doctype = false;
0669                 processDoctypeToken();
0670             } else if (isWhitespace) {
0671                 doctypeSearchCount = 0; // Used now to scan for PUBLIC
0672                 doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
0673                 doctypeToken.state = DoctypeAfterName;
0674             } else {
0675                 doctypeToken.name.append(c);
0676             }
0677             break;
0678         }
0679         case DoctypeAfterName: {
0680             if (c == '>') {
0681                 // Valid doctype. Emit it.
0682                 doctype = false;
0683                 processDoctypeToken();
0684             } else if (c == '[') {
0685                 if (doctypeSearchCount > 0 || doctypeSecondarySearchCount > 0) { // is there any public/system indicator before?
0686                     doctypeSearchCount = doctypeSecondarySearchCount = 0;
0687                     doctypeToken.state = DoctypeBogus;
0688                 }
0689                 // Found internal subset
0690                 doctypeToken.state = DoctypeInternalSubset;
0691                 doctypeAllowComment = false;
0692             } else if (!isWhitespace) {
0693                 if (c.toLower() == publicStart[doctypeSearchCount]) {
0694                     doctypeSearchCount++;
0695                     if (doctypeSearchCount == 6)
0696                         // Found 'PUBLIC' sequence
0697                     {
0698                         doctypeToken.state = DoctypeBeforePublicID;
0699                     }
0700                 } else if (doctypeSearchCount > 0) {
0701                     doctypeSearchCount = 0;
0702                     doctypeToken.state = DoctypeBogus;
0703                 } else if (c.toLower() == systemStart[doctypeSecondarySearchCount]) {
0704                     doctypeSecondarySearchCount++;
0705                     if (doctypeSecondarySearchCount == 6)
0706                         // Found 'SYSTEM' sequence
0707                     {
0708                         doctypeToken.state = DoctypeBeforeSystemID;
0709                     }
0710                 } else {
0711                     doctypeSecondarySearchCount = 0;
0712                     doctypeToken.state = DoctypeBogus;
0713                 }
0714             } else {
0715                 // Whitespace keeps us in the after name state
0716             }
0717             break;
0718         }
0719         case DoctypeBeforePublicID: {
0720             if (c == '\"' || c == '\'') {
0721                 tquote = c == '\"' ? DoubleQuote : SingleQuote;
0722                 doctypeToken.state = DoctypePublicID;
0723                 doctypeAllowComment = false;
0724             } else if (c == '>') {
0725                 // Considered bogus. Don't process the doctype.
0726                 doctype = false;
0727             } else if (isWhitespace) {
0728                 // nothing
0729             } else {
0730                 doctypeToken.state = DoctypeBogus;
0731             }
0732             break;
0733         }
0734         case DoctypePublicID: {
0735             if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
0736                 doctypeToken.state = DoctypeAfterPublicID;
0737                 doctypeAllowComment = true;
0738             } else if (c == '>') {
0739                 // Considered bogus. Don't process the doctype.
0740                 doctype = false;
0741             } else {
0742                 doctypeToken.publicID.append(c);
0743             }
0744             break;
0745         }
0746         case DoctypeAfterPublicID: {
0747             if (c == '\"' || c == '\'') {
0748                 tquote = c == '\"' ? DoubleQuote : SingleQuote;
0749                 doctypeToken.state = DoctypeSystemID;
0750             } else if (c == '>') {
0751                 // Valid doctype. Emit it now.
0752                 doctype = false;
0753                 processDoctypeToken();
0754             } else if (isWhitespace) {
0755                 // nothing
0756             } else if (c == '[') {
0757                 // Found internal subset
0758                 doctypeToken.state = DoctypeInternalSubset;
0759                 doctypeAllowComment = false;
0760             } else {
0761                 doctypeToken.state = DoctypeBogus;
0762             }
0763             break;
0764         }
0765         case DoctypeBeforeSystemID: {
0766             if (c == '\"' || c == '\'') {
0767                 tquote = c == '\"' ? DoubleQuote : SingleQuote;
0768                 doctypeToken.state = DoctypeSystemID;
0769                 doctypeAllowComment = false;
0770             } else if (c == '>') {
0771                 // Considered bogus. Don't process the doctype.
0772                 doctype = false;
0773             } else if (isWhitespace) {
0774                 // nothing
0775             } else {
0776                 doctypeToken.state = DoctypeBogus;
0777             }
0778             break;
0779         }
0780         case DoctypeSystemID: {
0781             if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
0782                 doctypeToken.state = DoctypeAfterSystemID;
0783                 doctypeAllowComment = true;
0784             } else if (c == '>') {
0785                 // Considered bogus. Don't process the doctype.
0786                 doctype = false;
0787             } else {
0788                 doctypeToken.systemID.append(c);
0789             }
0790             break;
0791         }
0792         case DoctypeAfterSystemID: {
0793             if (c == '>') {
0794                 // Valid doctype. Emit it now.
0795                 doctype = false;
0796                 processDoctypeToken();
0797             } else if (isWhitespace) {
0798                 // nothing
0799             } else if (c == '[') {
0800                 // Found internal subset
0801                 doctypeToken.state = DoctypeInternalSubset;
0802                 doctypeAllowComment = false;
0803             } else {
0804                 doctypeToken.state = DoctypeBogus;
0805             }
0806             break;
0807         }
0808         case DoctypeInternalSubset: {
0809             if (c == ']') {
0810                 // Done
0811                 doctypeToken.state = DoctypeAfterInternalSubset;
0812                 doctypeAllowComment = true;
0813             } else {
0814                 doctypeToken.internalSubset.append(c);
0815             }
0816             break;
0817         }
0818         case DoctypeAfterInternalSubset: {
0819             if (c == '>') {
0820                 // Valid doctype. Emit it now.
0821                 doctype = false;
0822                 processDoctypeToken();
0823             } else if (isWhitespace) {
0824                 // nothing
0825             } else {
0826                 doctypeToken.state = DoctypeBogus;
0827             }
0828             break;
0829         }
0830         case DoctypeBogus: {
0831             if (c == '>') {
0832                 // Done with the bogus doctype.
0833                 doctype = false;
0834             } else {
0835                 // Just keep scanning for '>'
0836             }
0837             break;
0838         }
0839         default:
0840             break;
0841         }
0842         if (!dontAdvance) {
0843             ++src;
0844         } else if (dontAdvance == 1) {
0845             continue;
0846         } else { // double dontAdvance++, do workaround
0847             doctypeComment = DoctypeCommentBogus;
0848         }
0849     }
0850 }
0851
0852 void HTMLTokenizer::parseServer(TokenizerString &src)
0853 {
         // Consumes a server-side include. Every character is buffered in rawContent
         // until a '>' that directly follows a '%' is seen; that ends the include,
         // clears the server flag and drops what was buffered. If src runs out first,
         // the function just returns with the state untouched.
0854     checkRawContentBuffer(src.length());
0855     for (; !src.isEmpty(); ++src) {
0856         rawContent[rawContentSize] = *src;
0857         ++rawContentSize;
0858         if (rawContentSize <= 1 || src->unicode() != '>' || rawContent[rawContentSize - 2] != '%') {
0859             continue; // not the closing "%>" yet; the loop header advances src
0860         }
0861         ++src; // step past the '>' that terminates the include
0862         server = false;
0863         rawContentSize = 0;
0864         return;
0865     }
0866 }
0867
0868 void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
0869 {
         // Skips over a processing instruction. Quote characters toggle tquote; the
         // instruction ends at a '>' that is either outside quotes or directly preceded
         // by '?'. Accepting a bare unquoted '>' tolerates pages that leave out the '?'.
         // On completion the '>' is consumed, processingInstruction is cleared and
         // discard is set to LFDiscard.
0870     char previous = 0;
0871     for (; !src.isEmpty(); ++src) {
0872         const unsigned char current = src->toLatin1();
0873         switch (current) {
0874         case '\'':
0875             tquote = (tquote == SingleQuote) ? NoQuote : SingleQuote;
0876             break;
0877         case '\"':
0878             tquote = (tquote == DoubleQuote) ? NoQuote : DoubleQuote;
0879             break;
0880         case '>':
0881             if (!tquote || previous == '?') {
0882                 processingInstruction = false;
0883                 ++src;
0884                 discard = LFDiscard;
0885                 return; // end of the processing instruction
0886             }
0887             break;
0888         }
0889         previous = current;
0890     }
0891 }
0892
0893 void HTMLTokenizer::parseText(TokenizerString &src)
0894 {
0895     for (; !src.isEmpty(); ++src) {
0896         // give the output buffer a chance to grow before writing to it
0897         checkBuffer();
0898
0899         // Latin-1 narrowing is enough: only ASCII line-break characters are tested
0900         const unsigned char current = src->toLatin1();
0901
0902         // a pending "skip LF" request only ever applies to the very next character
0903         const bool swallow = skipLF && (current == '\n');
0904         skipLF = false;
0905
0906         if (swallow) {
0907             // LF directly after a CR: the CR has already produced the newline
0908             continue;
0909         }
0910
0911         if ((current == '\r') || (current == '\n')) {
0912             // every line break is written out as a single '\n'
0913             skipLF = (current == '\r');
0914             *dest = '\n';
0915         } else {
0916             *dest = *src;
0917         }
0918
0919         ++dest;
0920     }
0921 }
0922
0923 void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
0924 {
         // Decodes a character reference. The callers visible in parseTag consume the
         // '&' before calling, so src starts right after it. The work is driven by the
         // Entity state member: when src runs out the function simply returns with the
         // state left in place, and a later call with start == false continues from it.
         //
         // src   - input; consumed as the reference is recognised. A successfully
         //         decoded character is pushed onto src, not written to dest.
         // dest  - output cursor; only advanced when the reference is unknown and its
         //         text is emitted literally (preceded by '&').
         // start - true to begin a fresh reference: resets cBufferPos and entityLen
         //         and enters the SearchEntity state.
0925     if (start) {
0926         cBufferPos = 0;
0927         entityLen = 0;
0928         Entity = SearchEntity;
0929     }
0930
0931     while (!src.isEmpty()) {
0932         ushort cc = src->unicode();
0933         switch (Entity) {
0934         case NoEntity:
0935             return;
0936
0937             break;
             // A leading '#' selects a numeric reference; anything else is handed to
             // EntityName without being consumed here.
0938         case SearchEntity:
0939             if (cc == '#') {
0940                 cBuffer[cBufferPos++] = cc;
0941                 ++src;
0942                 Entity = NumericSearch;
0943             } else {
0944                 Entity = EntityName;
0945             }
0946
0947             break;
0948
             // After "&#": 'x'/'X' selects hexadecimal, a digit selects decimal (the
             // digit is left for the Decimal state), anything else ends the number.
0949         case NumericSearch:
0950             if (cc == 'x' || cc == 'X') {
0951                 cBuffer[cBufferPos++] = cc;
0952                 ++src;
0953                 Entity = Hexadecimal;
0954             } else if (cc >= '0' && cc <= '9') {
0955                 Entity = Decimal;
0956             } else {
0957                 Entity = SearchSemicolon;
0958             }
0959
0960             break;
0961
             // Accumulates at most 8 hex digits per pass into the value already held in
             // EntityChar; stops at the first character that is not a Latin-1 hex digit.
             // NOTE(review): unlike Decimal, this always moves on to SearchSemicolon,
             // even when the input simply ran out in the middle of the digits --
             // confirm this is intended for input that arrives in several chunks.
0962         case Hexadecimal: {
0963             int uc = EntityChar.unicode();
0964             int ll = qMin<uint>(src.length(), 8);
0965             while (ll--) {
0966                 QChar csrc(src->toLower());
0967                 cc = csrc.cell();
0968
0969                 if (csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
0970                     break;
0971                 }
0972                 uc = uc * 16 + (cc - (cc < 'a' ? '0' : 'a' - 10));
0973                 cBuffer[cBufferPos++] = cc;
0974                 ++src;
0975             }
0976             EntityChar = QChar(uc);
0977             Entity = SearchSemicolon;
0978             break;
0979         }
             // Accumulates decimal digits until a non-digit is met or cBuffer holds 9
             // characters (the leading '#' included). If the input runs out earlier the
             // state stays Decimal so the next call can carry on.
0980         case Decimal: {
0981             int uc = EntityChar.unicode();
0982             int ll = qMin(src.length(), 9 - cBufferPos);
0983             while (ll--) {
0984                 cc = src->cell();
0985
0986                 if (src->row() || !(cc >= '0' && cc <= '9')) {
0987                     Entity = SearchSemicolon;
0988                     break;
0989                 }
0990
0991                 uc = uc * 10 + (cc - '0');
0992                 cBuffer[cBufferPos++] = cc;
0993                 ++src;
0994             }
0995             EntityChar = QChar(uc);
0996             if (cBufferPos == 9) {
0997                 Entity = SearchSemicolon;
0998             }
0999             break;
1000         }
             // Collects up to 9 ASCII letters/digits of a named reference into cBuffer.
             // Outside a tag, every prefix is looked up as it grows, so entityLen ends
             // up recording the longest prefix that names an entity with code < 256.
1001         case EntityName: {
1002             int ll = qMin(src.length(), 9 - cBufferPos);
1003             while (ll--) {
1004                 QChar csrc = *src;
1005                 cc = csrc.cell();
1006
1007                 if (csrc.row() || !((cc >= 'a' && cc <= 'z') ||
1008                                     (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
1009                     Entity = SearchSemicolon;
1010                     break;
1011                 }
1012
1013                 cBuffer[cBufferPos++] = cc;
1014                 ++src;
1015
1016                 // be IE compatible and interpret even unterminated entities
1017                 // outside tags. like "foo &nbspstuff bla".
1018                 if (tag == NoTag) {
1019                     int code;
1020                     const bool found = kde_findEntity(cBuffer, cBufferPos, &code);
1021                     if (found && code < 256) {
1022                         EntityChar = code;
1023                         entityLen = cBufferPos;
1024                     }
1025                 }
1026             }
1027             if (cBufferPos == 9) {
1028                 Entity = SearchSemicolon;
1029             }
                 // The name is complete: look up the whole of it. Names of a single
                 // character are never looked up here.
1030             if (Entity == SearchSemicolon) {
1031                 if (cBufferPos > 1) {
1032                     int code;
1033                     const bool found = kde_findEntity(cBuffer, cBufferPos, &code);
1034                     // IE only accepts unterminated entities < 256,
1035                     // Gecko accepts them all, but only outside tags
                         // NOTE(review): src may already be empty here if the loop above
                         // consumed all remaining input -- confirm that evaluating *src
                         // is safe in that case.
1036                     if (found && (tag == NoTag || code < 256 || *src == ';')) {
1037                         EntityChar = code;
1038                         entityLen = cBufferPos;
1039                     }
1040                 }
1041             }
1042             break;
1043         }
             // Final state: consumes an optional ';', then either feeds the decoded
             // character back through src or emits the raw text to dest, and resets
             // Entity/EntityChar before returning.
1044         case SearchSemicolon:
1045 #ifdef TOKEN_DEBUG
1046             qCDebug(KHTML_LOG) << "ENTITY " << EntityChar.unicode();
1047 #endif
1048             fixUpChar(EntityChar);
1049
1050             if (*src == ';') {
1051                 ++src;
1052             }
1053
1054             if (!EntityChar.isNull()) {
1055                 checkBuffer();
                     // Only a prefix of the collected name matched: put the unmatched
                     // tail back in front of the remaining input.
1056                 if (entityLen > 0 && entityLen < cBufferPos) {
1057                     int rem = cBufferPos - entityLen;
1058                     src.prepend(TokenizerString(QString::fromLatin1(cBuffer + entityLen, rem)));
1059                 }
1060                 src.push(EntityChar);
1061                 rawContentSinceLastEntity = -1;
1062             } else {
1063 #ifdef TOKEN_DEBUG
1064                 qCDebug(KHTML_LOG) << "unknown entity!";
1065 #endif
                     // room for '&' plus the collected characters
1066                 checkBuffer(11);
1067                 // ignore the sequence, add it to the buffer as plaintext
1068                 *dest++ = '&';
1069                 for (unsigned int i = 0; i < cBufferPos; i++) {
1070                     dest[i] = cBuffer[i];
1071                 }
1072                 dest += cBufferPos;
1073                 rawContentSinceLastEntity += cBufferPos + 1;
1074                 if (pre) {
1075                     prePos += cBufferPos + 1;
1076                 }
1077             }
1078
1079             Entity = NoEntity;
1080             EntityChar = QChar::Null;
1081             return;
1082         };
1083     }
1084 }
1085
1086 void HTMLTokenizer::parseTag(TokenizerString &src)
1087 {
         // Tokenizes the inside of a tag. Driven by the tag state member: each pass of
         // the loop below handles one state and either consumes input or switches to
         // the next state. The function returns when src is exhausted (the state stays
         // in the members), when the tag turns out to be a comment or a doctype, or
         // once SearchEnd has completed the tag and passed it to processToken().
         // Must not be entered while an entity is being decoded (asserted below).
1088     assert(!Entity);
1089     checkRawContentBuffer(src.length());
1090
1091     while (!src.isEmpty()) {
1092         checkBuffer();
1093 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1094         uint l = 0;
1095         while (l < src.length() && (src.toString()[l]).toLatin1() != '>') {
1096             l++;
1097         }
1098         qDebug("src is now: *%s*, tquote: %d", src.toString().left(l).toLatin1().constData(), tquote);
1099 #endif
1100         switch (tag) {
1101         case NoTag:
1102             return;
             // TagName: while searchCount / doctypeSearchCount are non-zero the input is
             // first matched against commentStart and doctypeStart; a full match hands
             // over to parseComment() or parseDoctype(). Otherwise the tag name is
             // collected into cBuffer and resolved to an id.
1103         case TagName: {
1104 #if defined(TOKEN_DEBUG) &&  TOKEN_DEBUG > 1
1105             qDebug("TagName");
1106 #endif
1107             if (searchCount > 0) {
1108                 if (*src == commentStart[searchCount]) {
1109                     searchCount++;
1110                     if (searchCount == 2) {
1111                         doctypeSearchCount++;    // A '!' is also part of doctype, so we are moving through that still as well
1112                     } else {
1113                         doctypeSearchCount = 0;
1114                     }
1115
1116                     if (searchCount == 4) {
1117 #ifdef TOKEN_DEBUG
1118                         qCDebug(KHTML_LOG) << "Found comment";
1119 #endif
1120                         // Found '<!--' sequence
1121                         ++src;
1122                         dest = buffer; // ignore the previous part of this tag
1123                         tag = NoTag;
1124
1125                         comment = true;
1126                         parseComment(src);
1127                         return; // Finished parsing tag!
1128                     }
1129                     // cuts of high part, is okay
1130                     cBuffer[cBufferPos++] = src->cell();
1131                     ++src;
1132                     break;
1133                 } else {
1134                     searchCount = 0;    // Stop looking for '<!--' sequence
1135                 }
1136             }
1137
                 // Same idea for the doctype prefix, compared case-insensitively.
1138             if (doctypeSearchCount > 0) {
1139                 if ((*src).toLower() == doctypeStart[doctypeSearchCount]) {
1140                     doctypeSearchCount++;
1141                     cBuffer[cBufferPos++] = src->cell();
1142                     ++src;
1143                     if (doctypeSearchCount == 9) {
1144                         // Found '<!DOCTYPE' sequence
1145                         tag = NoTag;
1146                         doctypeAllowComment = true;
1147                         doctypeComment = NoDoctypeComment;
1148                         doctypeToken.reset();
1149                         doctype = true;
1150
1151                         parseDoctype(src);
1152                         return;
1153                     }
1154                     break;
1155                 } else {
1156                     doctypeSearchCount = 0;    // Stop looking for '<!DOCTYPE' sequence
1157                 }
1158             }
1159
                 // Copy name characters into cBuffer until a character <= ' ' or a '>'
                 // is met, or cBuffer is full.
1160             bool finish = false;
1161             unsigned int ll = qMin(src.length(), CBUFLEN - cBufferPos);
1162             while (ll--) {
1163                 ushort curchar = src->unicode();
1164                 if (curchar <= ' ' || curchar == '>') {
1165                     finish = true;
1166                     break;
1167                 }
1168                 // this is a nasty performance trick. will work for the A-Z
1169                 // characters, but not for others. if it contains one,
1170                 // we fail anyway
1171                 char cc = curchar;
1172                 cBuffer[cBufferPos++] = cc | 0x20;
1173                 ++src;
1174             }
1175
1176             // Disadvantage: we add the possible rest of the tag
1177             // as attribute names. ### judge if this causes problems
1178             if (finish || CBUFLEN == cBufferPos) {
1179                 bool beginTag;
1180                 char *ptr = cBuffer;
1181                 unsigned int len = cBufferPos;
1182                 cBuffer[cBufferPos] = '\0';
1183                 if ((cBufferPos > 0) && (*ptr == '/')) {
1184                     // End Tag
1185                     beginTag = false;
1186                     ptr++;
1187                     len--;
1188                 } else
1189                     // Start Tag
1190                 {
1191                     beginTag = true;
1192                 }
1193                 // Accept empty xml tags like <br/>
1194                 if (len > 1 && ptr[len - 1] == '/') {
1195                     ptr[--len] = '\0';
1196                     // if it is like <br/> and not like <input/ value=foo>, take it as flat
1197                     if (*src == '>') {
1198                         currToken.flat = true;
1199                     }
1200                 }
1201
                     // Resolve the name to an id; tagID stays 0 when the name is not a
                     // valid qualified name. End tags are stored as id + ID_CLOSE_TAG.
1202                 uint tagID = 0;
1203                 if (!tagID) {
1204                     DOMString tagName(ptr);
1205                     if (Element::khtmlValidQualifiedName(tagName)) {
1206                         safeLocalName = LocalName::fromString(tagName, IDS_NormalizeLower);
1207                         tagID = safeLocalName.id();
1208                     }
1209 #ifdef TOKEN_DEBUG
1210                     QByteArray tmp(ptr, len + 1);
1211                     qCDebug(KHTML_LOG) << "Unknown tag: \"" << tmp.data() << "\"";
1212 #endif
1213                 }
1214                 if (tagID) {
1215 #ifdef TOKEN_DEBUG
1216                     QByteArray tmp(ptr, len + 1);
1217                     qCDebug(KHTML_LOG) << "found tag id=" << tagID << ": " << tmp.data();
1218 #endif
1219                     currToken.tid = beginTag ? tagID : tagID + ID_CLOSE_TAG;
1220                 }
1221                 dest = buffer;
1222                 tag = SearchAttribute;
1223                 cBufferPos = 0;
1224             }
1225             break;
1226         }
             // SearchAttribute: skips characters <= ' '. '<' or '>' ends the attribute
             // list; a quote that follows skipped whitespace starts a value with an
             // empty attribute name; anything else starts an attribute name.
1227         case SearchAttribute: {
1228 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1229             qDebug("SearchAttribute");
1230 #endif
1231             bool atespace = false;
1232             ushort curchar;
1233             while (!src.isEmpty()) {
1234                 curchar = src->unicode();
1235                 if (curchar > ' ') {
1236                     if (curchar == '<' || curchar == '>') {
1237                         tag = SearchEnd;
1238                     } else if (atespace && (curchar == '\'' || curchar == '"')) {
1239                         tag = SearchValue;
1240                         *dest++ = 0;
1241                         attrName = DOMString("");
1242                     } else {
1243                         tag = AttributeName;
1244                     }
1245
1246                     cBufferPos = 0;
1247                     break;
1248                 }
1249                 atespace = true;
1250                 ++src;
1251             }
1252             break;
1253         }
             // AttributeName: collects the name (A-Z lower-cased) into cBuffer until a
             // character <= ' ', '=' or '>' is met, then resolves it to an id. Ids above
             // ATTR_LAST_ATTR are treated as unknown (0); for those the name text is
             // kept in attrName instead.
1254         case AttributeName: {
1255 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1256             qDebug("AttributeName");
1257 #endif
1258             ushort curchar;
1259             int ll = qMin(src.length(), CBUFLEN - cBufferPos);
1260
1261             while (ll--) {
1262                 curchar = src->unicode();
1263                 if (curchar <= '>') {
1264                     if (curchar <= ' ' || curchar == '=' || curchar == '>') {
1265                         unsigned int a;
1266                         cBuffer[cBufferPos] = '\0';
1267                         a = LocalName::fromString(DOMString(cBuffer), IDS_NormalizeLower).id(); // ### still deep copy?
1268                         if (a > ATTR_LAST_ATTR) {
1269                             a = 0;
1270                         }
1271
1272                         if (!a) {
1273                             // did we just get /> or e.g checked/>
1274                             if (curchar == '>' && cBufferPos >= 1 && cBuffer[cBufferPos - 1] == '/') {
1275                                 currToken.flat = true;
1276                                 cBuffer[cBufferPos - 1] = '\0';
1277                                 if (cBufferPos > 1) {
1278                                     a = LocalName::fromString(DOMString(cBuffer), IDS_NormalizeLower).id();
1279                                 }
1280                                 if (a > ATTR_LAST_ATTR) {
1281                                     a = 0;
1282                                 }
1283                                 cBuffer[cBufferPos - 1] = '/';
1284                             }
1285                             if (!a) {
1286                                 attrName = DOMString(cBuffer, cBufferPos);
1287                             }
1288                         }
1289
                             // The attribute id goes into the first slot of buffer; the
                             // value states below append the value text after it.
1290                         dest = buffer;
1291                         *dest++ = a;
1292 #ifdef TOKEN_DEBUG
1293                         if (!a || (cBufferPos && *cBuffer == '!')) {
1294                             qCDebug(KHTML_LOG) << "Unknown attribute: *" << QByteArray(cBuffer, cBufferPos + 1).data() << "*";
1295                         } else {
1296                             qCDebug(KHTML_LOG) << "Known attribute: " << QByteArray(cBuffer, cBufferPos + 1).data();
1297                         }
1298 #endif
1299
1300                         tag = SearchEqual;
1301                         break;
1302                     }
1303                 }
1304                 cBuffer[cBufferPos++] =
1305                     (curchar >= 'A' && curchar <= 'Z') ? curchar | 0x20 : curchar;
1306                 ++src;
1307             }
                 // cBuffer is full: cut the name off here and treat it as unknown.
1308             if (cBufferPos == CBUFLEN) {
1309                 cBuffer[cBufferPos] = '\0';
1310                 attrName = DOMString(cBuffer, cBufferPos);
1311                 dest = buffer;
1312                 *dest++ = 0;
1313                 tag = SearchEqual;
1314             }
1315             break;
1316         }
             // SearchEqual: skips characters <= ' '. '=' leads to the value; a quote
             // after whitespace starts a value with an empty attribute name; anything
             // else means the attribute had no value, so it is added with "".
1317         case SearchEqual: {
1318 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1319             qDebug("SearchEqual");
1320 #endif
1321             ushort curchar;
1322             bool atespace = false;
1323             while (!src.isEmpty()) {
1324                 curchar = src->unicode();
1325                 if (curchar > ' ') {
1326                     if (curchar == '=') {
1327 #ifdef TOKEN_DEBUG
1328                         qCDebug(KHTML_LOG) << "found equal";
1329 #endif
1330                         tag = SearchValue;
1331                         ++src;
1332                     } else if (atespace && (curchar == '\'' || curchar == '"')) {
1333                         tag = SearchValue;
1334                         *dest++ = 0;
1335                         attrName = DOMString("");
1336                     } else {
1337                         DOMString v("");
1338                         currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1339                         dest = buffer;
1340                         tag = SearchAttribute;
1341                     }
1342                     break;
1343                 }
1344                 atespace = true;
1345                 ++src;
1346             }
1347             break;
1348         }
             // SearchValue: skips characters <= ' ', then picks QuotedValue (recording
             // the quote kind in tquote and consuming the quote) or unquoted Value.
1349         case SearchValue: {
1350             ushort curchar;
1351             while (!src.isEmpty()) {
1352                 curchar = src->unicode();
1353                 if (curchar > ' ') {
1354                     if ((curchar == '\'' || curchar == '\"')) {
1355                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
1356                         tag = QuotedValue;
1357                         ++src;
1358                     } else {
1359                         tag = Value;
1360                     }
1361
1362                     break;
1363                 }
1364                 ++src;
1365             }
1366             break;
1367         }
             // QuotedValue: copies characters to dest up to the quote matching tquote.
             // An unescaped '&' is consumed and handed to parseEntity(). At the closing
             // quote trailing '\n'/'\r' are dropped and the attribute is added with the
             // text that follows the id slot of buffer.
1368         case QuotedValue: {
1369 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1370             qDebug("QuotedValue");
1371 #endif
1372             ushort curchar;
1373             while (!src.isEmpty()) {
1374                 checkBuffer();
1375
1376                 curchar = src->unicode();
1377                 if (curchar <= '\'' && !src.escaped()) {
1378                     // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
1379                     if (curchar == '&') {
1380                         ++src;
1381                         parseEntity(src, dest, true);
1382                         break;
1383                     } else if ((tquote == SingleQuote && curchar == '\'') ||
1384                                (tquote == DoubleQuote && curchar == '\"')) {
1385                         // some <input type=hidden> rely on trailing spaces. argh
1386                         while (dest > buffer + 1 && (*(dest - 1) == '\n' || *(dest - 1) == '\r')) {
1387                             dest--;    // remove trailing newlines
1388                         }
1389                         DOMString v(buffer + 1, dest - buffer - 1);
1390                         currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1391
1392                         dest = buffer;
1393                         tag = SearchAttribute;
1394                         tquote = NoQuote;
1395                         ++src;
1396                         break;
1397                     }
1398                 }
1399                 *dest++ = *src;
1400                 ++src;
1401             }
1402             break;
1403         }
             // Value: unquoted attribute value. Entities are decoded as in QuotedValue;
             // the value ends (without consuming the terminator) at a character <= ' ',
             // '>', a quote, '<' or '`'.
1404         case Value: {
1405 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1406             qDebug("Value");
1407 #endif
1408             ushort curchar;
1409             while (!src.isEmpty()) {
1410                 checkBuffer();
1411                 curchar = src->unicode();
1412                 if (curchar <= '>' && !src.escaped()) {
1413                     // parse Entities
1414                     if (curchar == '&') {
1415                         ++src;
1416                         parseEntity(src, dest, true);
1417                         break;
1418                     }
1419                     // no quotes. Every space means end of value
1420                     // '/' does not delimit in IE!
1421                     // HTML5: must not contain any literal space characters, any U+0022 QUOTATION MARK (") characters,
1422                     // U+0027 APOSTROPHE (') characters, U+003D EQUALS SIGN (=) characters, U+003C LESS-THAN SIGN (<) characters,
1423                     // U+003E GREATER-THAN SIGN (>) characters, or U+0060 GRAVE ACCENT (`) characters, and must not be the empty string.
1424                     // Real life: images.google.com uses URLs including form arguments (foo=bar)
1425                     // in unquoted parameters --- with an html5 <!doctype html> DTD.
1426                     // Real life takes priority, so we accept at least =
1427                     if (curchar <= ' ' || curchar == '>' || curchar == '\'' || curchar == '"' || curchar == '<' || /*curchar == '=' ||*/ curchar == '`') {
1428                         DOMString v(buffer + 1, dest - buffer - 1);
1429                         currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
1430                         dest = buffer;
1431                         tag = SearchAttribute;
1432                         break;
1433                     }
1434                 }
1435
1436                 *dest++ = *src;
1437                 ++src;
1438             }
1439             break;
1440         }
             // SearchEnd: skips ahead to the next '<' or '>' (a '/' on the way marks the
             // token as flat), finishes the tag, passes the token to processToken() and
             // then sets up the tokenizer for whatever follows the element.
1441         case SearchEnd: {
1442 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
1443             qDebug("SearchEnd");
1444 #endif
1445             while (!src.isEmpty()) {
1446                 if (*src == '<' || *src == '>') {
1447                     break;
1448                 }
1449
1450                 if (*src == '/') {
1451                     currToken.flat = true;
1452                 }
1453
1454                 ++src;
1455             }
                 // Input ran out before a '<' or '>' was found: stay in SearchEnd.
                 // NOTE(review): *src is evaluated here although src.isEmpty() is true
                 // -- confirm that TokenizerString makes this dereference safe.
1456             if (src.isEmpty() && *src != '<' && *src != '>') {
1457                 break;
1458             }
1459
1460             searchCount = 0; // Stop looking for '<!--' sequence
1461             tag = NoTag;
1462             tquote = NoQuote;
1463             if (*src == '>') {
1464                 ++src;
1465             }
1466
1467             if (!currToken.tid) { //stop if tag is unknown
1468                 return;
1469             }
1470
1471             uint tagID = currToken.tid;
1472 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
1473             qCDebug(KHTML_LOG) << "appending Tag: " << tagID;
1474 #endif
1475             // When parsing HTML flat tags like <div /> should
1476             // be ignored, the only exception is SCRIPT, and
1477             // tags with forbidden end-tags
1478             if (tagID < ID_CLOSE_TAG && tagID != ID_SCRIPT &&
1479                     DOM::endTagRequirement(tagID) != DOM::FORBIDDEN &&
1480                     parser->doc()->htmlMode() != DocumentImpl::XHtml) {
1481                 currToken.flat = false;
1482             }
1483
1484             bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
1485             HTMLScriptElementImpl *prevScriptElem = nullptr;
1486
                 // From here on tagID is the plain element id for start and end tags
                 // alike; beginTag remembers which of the two this was.
1487             if (tagID >= ID_CLOSE_TAG) {
1488                 tagID -= ID_CLOSE_TAG;
1489             } else if (tagID == ID_SCRIPT) {
1490                 prevScriptElem = parser->currentScriptElement();
1491                 DOMStringImpl *a = nullptr;
1492                 scriptSrc.clear(); scriptSrcCharset.clear();
1493                 if (currToken.attrs &&  /* potentially have a ATTR_SRC ? */
1494                         view &&  /* are we a regular tokenizer or just for innerHTML ? */
1495                         parser->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
1496                    ) {
1497                     if ((a = currToken.attrs->getValue(ATTR_SRC))) {
1498                         scriptSrc = DOMString(a).trimSpaces().string();
1499                     }
1500                     if ((a = currToken.attrs->getValue(ATTR_CHARSET))) {
1501                         scriptSrcCharset = DOMString(a).string().trimmed();
1502                     }
1503                     if (scriptSrcCharset.isEmpty() && view) {
1504                         scriptSrcCharset = parser->doc()->view()->part()->encoding();
1505                     }
1506                 }
1507                 javascript = true;
1508             }
1509
1510             processToken();
1511
                 // Keep the javascript flag only if processing the token produced a new
                 // current script element and that element reports a valid script.
1512             if (javascript) {
1513                 HTMLScriptElementImpl *sc = parser->currentScriptElement();
1514                 javascript = (sc && sc != prevScriptElem) ? sc->isValidScript() : false;
1515             }
1516
1517             if (parser->selectMode() && beginTag) {
1518                 discard = AllDiscard;
1519             }
1520
                 // Per-element follow-up. For the raw-content elements the matching end
                 // marker is installed in searchStopper and parseRawContent() takes
                 // over the rest of the input.
1521             switch (tagID) {
1522             case ID_LISTING:
1523             case ID_PRE:
1524                 pre = beginTag;
1525                 if (beginTag) {
1526                     discard = LFDiscard;
1527                 }
1528                 prePos = 0;
1529                 break;
1530             case ID_BR:
1531                 prePos = 0;
1532                 break;
1533             case ID_SCRIPT:
1534                 if (beginTag) {
1535                     searchStopper = scriptEnd;
1536                     searchStopperLen = 8;
1537                     script = true;
1538                     parseRawContent(src);
                     // NOTE(review): for end tags tagID has already had ID_CLOSE_TAG
                     // subtracted above, so this test alone may not tell a stray
                     // </script> from <script src="foo"/> -- confirm.
1539                 } else if (tagID < ID_CLOSE_TAG) { // Handle <script src="foo"/>
1540                     script = true;
1541                     scriptHandler();
1542                 }
1543                 break;
1544             case ID_STYLE:
1545                 if (beginTag) {
1546                     searchStopper = styleEnd;
1547                     searchStopperLen = 7;
1548                     style = true;
1549                     parseRawContent(src);
1550                 }
1551                 break;
1552             case ID_TEXTAREA:
1553                 if (beginTag) {
1554                     searchStopper = textareaEnd;
1555                     searchStopperLen = 10;
1556                     textarea = true;
1557                     discard = NoneDiscard;
1558                     rawContentSinceLastEntity = 0;
1559                     parseRawContent(src);
1560                 }
1561                 break;
1562             case ID_TITLE:
1563                 if (beginTag) {
1564                     searchStopper = titleEnd;
1565                     searchStopperLen = 7;
1566                     title = true;
1567                     rawContentSinceLastEntity = 0;
1568                     parseRawContent(src);
1569                 }
1570                 break;
1571             case ID_XMP:
1572                 if (beginTag) {
1573                     searchStopper = xmpEnd;
1574                     searchStopperLen = 5;
1575                     xmp = true;
1576                     parseRawContent(src);
1577                 }
1578                 break;
1579             case ID_SELECT:
1580                 select = beginTag;
1581                 break;
1582             case ID_PLAINTEXT:
1583                 plaintext = beginTag;
1584                 break;
1585             }
1586             return; // Finished parsing tag!
1587         }
1588         } // end switch
1589     }
1590     return;
1591 }
1592
1593 void HTMLTokenizer::addPending()
1594 {
         // Writes the whitespace recorded in `pending` to dest and clears it. Inside a
         // <select> (but not within a comment or script) a single space is written,
         // whatever is pending. Otherwise a line feed resets prePos, a space advances it
         // by one and a tab advances it to the next multiple of TAB_SIZE. Reaching the
         // non-select path with nothing pending is a programming error (asserted).
1595     const bool insideSelectText = select && !comment && !script;
1596     if (insideSelectText) {
1597         *dest++ = ' ';
1598     } else if (pending == LFPending) {
1599         *dest++ = QLatin1Char('\n');
1600         prePos = 0;
1601     } else if (pending == SpacePending) {
1602         *dest++ = QLatin1Char(' ');
1603         ++prePos;
1604     } else if (pending == TabPending) {
1605         // columns up to the next tab stop; the tab stays literal in <textarea>/script
1606         const int fill = TAB_SIZE - (prePos % TAB_SIZE);
1607         if (textarea || script) {
1608             *dest++ = QLatin1Char('\t');
1609         } else {
1610             for (int written = 0; written < fill; ++written) {
1611                 *dest++ = QLatin1Char(' ');
1612             }
1613         }
1614         prePos += fill;
1615     } else if (pending == NonePending) {
1616         assert(0);
1617     }
1618
1619     pending = NonePending;
1620 }
1621
1622 inline bool HTMLTokenizer::continueProcessing(int &processedCount)
1623 {
         // Decides whether the tokenizer loop may go on. Reading the clock for every
         // character would be wasteful, so the elapsed time is only looked at once more
         // than sTokenizerChunkSize characters have been counted, and never while a
         // script is executing or a cached script is outstanding. When the time budget
         // is used up and m_documentTokenizer is set, a zero-delay timer is started,
         // the budget switches to the fast yield delay and false is returned.
         // processedCount is reset to 0 at each chunk boundary and incremented
         // whenever true is returned.
1624     const bool chunkDone = !m_executingScript && processedCount > sTokenizerChunkSize && cachedScript.isEmpty();
1625     if (!chunkDone) {
1626         ++processedCount;
1627         return true;
1628     }
1629     processedCount = 0;
1630     if (m_time.elapsed() > m_tokenizerYieldDelay && m_documentTokenizer) {
1631         m_yieldTimer = startTimer(0);
1632         m_tokenizerYieldDelay = sTokenizerFastYieldDelay;
1633         return false;
1634     }
1635     ++processedCount;
1636     return true;
1637 }
1638
1639 #include "khtmlpart_p.h"
// Feeds one chunk of input into the tokenizer.
//
// str        - the data to tokenize.
// appendData - when parsing has to be deferred and pendingQueue already holds
//              data, true appends str to the bottom entry of pendingQueue and
//              false appends it to the top entry.
//
// Behaviour, in order of the checks below:
//  - without an output buffer the call is a no-op;
//  - while a script is executing (with appendData set) or cachedScript is not
//    empty, str is only stored in pendingQueue;
//  - while onHold, or while a yield timer is armed, str is only appended to src;
//  - otherwise src is consumed in a loop that dispatches on the current
//    tokenizer state flags, until src is empty, m_abort is set, or
//    continueProcessing() asks to yield.
// If noMoreData is set and nothing is outstanding afterwards, end() is called.
1640 void HTMLTokenizer::write(const TokenizerString &str, bool appendData)
1641 {
1642 #ifdef TOKEN_DEBUG
1643     qCDebug(KHTML_LOG) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")";
1644 #endif
         // No output buffer (end() sets it to null after releasing it): nothing to do.
1645     if (!buffer) {
1646         return;
1647     }
1648
1649     if ((m_executingScript && appendData) || cachedScript.count()) {
1650         // don't parse; we will do this later
1651         if (pendingQueue.isEmpty()) {
1652             pendingQueue.push(str);
1653         } else if (appendData) {
1654             pendingQueue.bottom().append(str);
1655         } else {
1656             pendingQueue.top().append(str);
1657         }
1658 #if PROSPECTIVE_TOKENIZER_ENABLED
1659         if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData) {
1660             m_prospectiveTokenizer->write(str);
1661         }
1662 #endif
1663         return;
1664     }
1665
1666 #if PROSPECTIVE_TOKENIZER_ENABLED
1667     if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData) {
1668         m_prospectiveTokenizer->end();
1669     }
1670 #endif
1671
         // While on hold the data is only accumulated in src.
1672     if (onHold) {
1673         src.append(str);
1674         return;
1675     }
1676
         // Append to leftover input if there is any; otherwise install str as the
         // new source through setSrc().
1677     if (!src.isEmpty()) {
1678         src.append(str);
1679     } else {
1680         setSrc(str);
1681     }
1682
1683     // Once a timer is set, it has control of when the tokenizer continues.
1684     if (m_yieldTimer > 0) {
1685         return;
1686     }
1687
1688     int processedCount = 0;
1689     m_time.start();
1690
1691     while (!src.isEmpty()) {
1692         if (m_abort || !continueProcessing(processedCount)) {
1693             break;
1694         }
1695         // do we need to enlarge the buffer?
1696         checkBuffer();
1697
1698         ushort cc = src->unicode();
1699
             // skipLF is set after a '\r' (see below); it only applies when the very
             // next character is '\n'.
1700         if (skipLF && (cc != '\n')) {
1701             skipLF = false;
1702         }
1703
             // The first matching state wins; most states hand src to a dedicated
             // parse function.
1704         if (skipLF) {
                 // Swallow the '\n' of a "\r\n" pair.
1705             skipLF = false;
1706             ++src;
1707         } else if (Entity) {
1708             parseEntity(src, dest);
1709         } else if (plaintext) {
1710             parseText(src);
1711         } else if (script) {
1712             parseRawContent(src);
1713         } else if (style) {
1714             parseRawContent(src);
1715         } else if (xmp) {
1716             parseRawContent(src);
1717         } else if (textarea) {
1718             parseRawContent(src);
1719         } else if (title) {
1720             parseRawContent(src);
1721         } else if (comment) {
1722             parseComment(src);
1723         } else if (doctypeComment && doctypeComment != DoctypeCommentEnd && doctypeComment != DoctypeCommentBogus) {
1724             parseDoctypeComment(src);
1725         } else if (doctype) {
1726             parseDoctype(src);
1727         } else if (server) {
1728             parseServer(src);
1729         } else if (processingInstruction) {
1730             parseProcessingInstruction(src);
1731         } else if (tag) {
1732             parseTag(src);
1733         } else if (startTag) {
                 // The previous character was an unescaped '<' (startTag is set further
                 // down); cc decides what kind of markup starts here.
1734             startTag = false;
1735
1736             switch (cc) {
1737             case '/':
                     // No special handling here; continues with the common tag setup
                     // after the switch.
1738                 break;
1739             case '!': {
1740                 // <!-- comment --> or <!DOCTYPE ...>
1741                 searchCount = 1; // Look for '<!--' sequence to start comment...
1742                 doctypeSearchCount = 1; // ... or for '<!DOCTYPE' sequence to start doctype
1743                 break;
1744             }
1745             case '?': {
1746                 // xml processing instruction
1747                 processingInstruction = true;
1748                 tquote = NoQuote;
1749                 parseProcessingInstruction(src);
1750                 continue;
1751             }
1752             case '%':
1753                 if (!brokenServer) {
1754                     // <% server stuff, handle as comment %>
1755                     server = true;
1756                     tquote = NoQuote;
1757                     parseServer(src);
1758                     continue;
1759                 }
1760             // else fall through
1761             default: {
1762                 if (((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
1763                     // Start of a Start-Tag
1764                 } else {
1765                     // Invalid tag
1766                     // Add as is
1767                     if (pending) {
1768                         addPending();
1769                     }
1770                     *dest = '<';
1771                     dest++;
                         // src is not advanced: cc is examined again on the next
                         // iteration, now with startTag cleared.
1772                     continue;
1773                 }
1774             }
1775             }; // end case
1776
1777             // According to SGML any LF immediately after a starttag, or
1778             // immediately before an endtag should be ignored.
1779             // ### Gecko and MSIE though only ignores LF immediately after
1780             // starttags and only for PRE elements -- asj (28/06-2005)
1781             if (pending) {
1782                 if (!select) {
1783                     addPending();
1784                 } else {
1785                     pending = NonePending;
1786                 }
1787             }
1788
1789             // Cancel unused discards
1790             discard = NoneDiscard;
1791             // if (!endTag) discard = LFDiscard;
1792
                 // Hand whatever has been collected so far to processToken() before
                 // the tag itself is parsed.
1793             processToken();
1794
1795             cBufferPos = 0;
1796             tag = TagName;
1797             parseTag(src);
1798         } else if (cc == '&' && !src.escaped()) {
1799             ++src;
1800             if (pending) {
1801                 addPending();
1802             }
1803             discard = NoneDiscard;
1804             parseEntity(src, dest, true);
1805         } else if (cc == '<' && !src.escaped()) {
                 // Remember the line of the '<' and look at the following character
                 // in the startTag branch on the next iteration.
1806             tagStartLineno = lineno + src.lineCount();
1807             ++src;
1808             discard = NoneDiscard;
1809             startTag = true;
1810         } else if ((cc == '\n') || (cc == '\r')) {
1811             if (discard == SpaceDiscard) {
1812                 discard = NoneDiscard;
1813             }
1814
1815             if (discard == LFDiscard) {
1816                 // Ignore one LF
1817                 discard = NoneDiscard;
1818             } else if (discard == AllDiscard) {
1819                 // Ignore
1820             } else {
                     // Inside <select> (and not in script) an earlier pending
                     // whitespace is replaced rather than flushed.
1821                 if (select && !script) {
1822                     pending = LFPending;
1823                 } else {
1824                     if (pending) {
1825                         addPending();
1826                     }
1827                     pending = LFPending;
1828                 }
1829             }
1830
1831             /* Check for MS-DOS CRLF sequence */
1832             if (cc == '\r') {
1833                 skipLF = true;
1834             }
1835             ++src;
1836         } else if ((cc == ' ') || (cc == '\t')) {
1837             if (discard == LFDiscard) {
1838                 discard = NoneDiscard;
1839             }
1840
1841             if (discard == SpaceDiscard) {
1842                 // Ignore one space
1843                 discard = NoneDiscard;
1844             } else if (discard == AllDiscard) {
1845                 // Ignore
1846             } else {
                     // Inside <select> (and not in script) only the first pending
                     // whitespace is recorded, always as a space.
1847                 if (select && !script) {
1848                     if (!pending) {
1849                         pending = SpacePending;
1850                     }
1851                 } else {
1852                     if (pending) {
1853                         addPending();
1854                     }
1855                     if (cc == ' ') {
1856                         pending = SpacePending;
1857                     } else {
1858                         pending = TabPending;
1859                     }
1860                 }
1861             }
1862
1863             ++src;
1864         } else {
                 // Ordinary character: flush pending whitespace, then copy it to the
                 // output buffer.
1865             if (pending) {
1866                 addPending();
1867             }
1868
1869             discard = NoneDiscard;
1870             if (pre) {
1871                 prePos++;
1872             }
1873             *dest = *src;
1874             fixUpChar(*dest);
1875             ++dest;
1876             ++src;
1877         }
1878     }
1879
1880     if (noMoreData && cachedScript.isEmpty() && !m_executingScript && m_yieldTimer <= 0) {
1881         end();    // this actually causes us to be deleted
1882     }
1883 }
1884
1885 void HTMLTokenizer::timerEvent(QTimerEvent *e)
1886 {
1887     if (e->timerId() == m_yieldTimer) {
1888         killTimer(m_yieldTimer);
1889         m_yieldTimer = 0;
1890         write(TokenizerString(), true);
1891     } else if (e->timerId() == m_externalScriptsTimerId) {
1892         if (view && view->hasLayoutPending()) {
1893             // all stylesheets are loaded but the style modifications
1894             // they triggered have yet to be applied, BBIAB
1895             return;
1896         }
1897         killTimer(m_externalScriptsTimerId);
1898         m_externalScriptsTimerId = 0;
1899         notifyFinished(nullptr);
1900     }
1901 }
1902
1903 void HTMLTokenizer::end()
1904 {
1905     if (buffer) {
1906         // parseTag is using the buffer for different matters
1907         if (!tag) {
1908             processToken();
1909         }
1910
1911         if (buffer) {
1912             KHTML_DELETE_QCHAR_VEC(buffer);
1913         }
1914
1915         if (rawContent) {
1916             KHTML_DELETE_QCHAR_VEC(rawContent);
1917         }
1918
1919         rawContent = nullptr;
1920         rawContentSize = rawContentMaxSize = rawContentResync = 0;
1921         buffer = nullptr;
1922     }
1923     emit finishedParsing();
1924 }
1925
1926 void HTMLTokenizer::finish()
1927 {
1928     // The purpose of this iteration is to recover from 'raw content' tokenizing mode.
1929     // In this mode, any error such as the lack of a closing tag (for the considered element) or of a closing comment,
1930     // would result in the entire document being absorbed in one node.
1931     // When it happens, we simply put back in the input buffer what this mode's output has accumulated so far,
1932     // and retokenize after either disabling the 'raw content' mode (by setting the corresponding members to false)
1933     // or after setting a few flags disabling some lax parsing 'features' (brokenComments/brokenServer).
1934     while ((title || comment || server) && rawContent && rawContentSize) {
1935         // we've found an unmatched comment start
1936         if (comment) {
1937             brokenComments = true;
1938         } else if (server) {
1939             brokenServer = true;
1940         }
1941
1942         checkRawContentBuffer();
1943         rawContent[ rawContentSize ] = 0;
1944         rawContent[ rawContentSize + 1 ] = 0;
1945         int pos;
1946         QString food;
1947         if (title || style || script || textarea) {
1948             rawContentSinceLastEntity = 0;
1949             food.setUnicode(rawContent, rawContentSize);
1950         } else if (server) {
1951             food = "<";
1952             food += QString(rawContent, rawContentSize);
1953         } else {
1954             pos = QString::fromRawData(rawContent, rawContentSize).indexOf('>');
1955             food.setUnicode(rawContent + pos + 1, rawContentSize - pos - 1); // deep copy
1956         }
1957         KHTML_DELETE_QCHAR_VEC(rawContent);
1958         rawContent = nullptr;
1959         rawContentSize = rawContentMaxSize = rawContentResync = 0;
1960
1961         comment = server = title = false;
1962         if (!food.isEmpty()) {
1963             write(food, true);
1964         }
1965     }
1966     // this indicates we will not receive any more data... but if we are waiting on
1967     // an external script to load, we can't finish parsing until that is done
1968     noMoreData = true;
1969     if (cachedScript.isEmpty() && !m_executingScript && !onHold && m_yieldTimer <= 0) {
1970         end();    // this actually causes us to be deleted
1971     }
1972 }
1973
1974 void HTMLTokenizer::processToken()
1975 {
1976     KJSProxy *jsProxy = view ? view->part()->jScript() : nullptr;
1977     if (jsProxy) {
1978         jsProxy->setEventHandlerLineno(tagStartLineno);
1979     }
1980     if (dest > buffer) {
1981 #if 0
1982         if (currToken.tid) {
1983             qDebug("unexpected token id: %d, str: *%s*", currToken.tid, QString::fromRawData(buffer, dest - buffer).toLatin1().constData());
1984             assert(0);
1985         }
1986
1987 #endif
1988         currToken.text = new DOMStringImpl(buffer, dest - buffer);
1989         currToken.text->ref();
1990         if (currToken.tid != ID_COMMENT) {
1991             currToken.tid = ID_TEXT;
1992         }
1993     } else if (!currToken.tid) {
1994         currToken.reset();
1995         if (jsProxy) {
1996             jsProxy->setEventHandlerLineno(lineno + src.lineCount());
1997         }
1998         return;
1999     }
2000
2001     dest = buffer;
2002
2003 #ifdef TOKEN_DEBUG
2004     QString text;
2005     bool closing = (currToken.tid > ID_CLOSE_TAG);
2006     int rid = currToken.tid - (closing ? ID_CLOSE_TAG : 0);
2007     if (currToken.text) {
2008         text = QString::fromRawData(currToken.text->s, currToken.text->l);
2009     }
2010     qCDebug(KHTML_LOG) << "Token -->" << LocalName::fromId(localNamePart(rid)).toString()
2011              << "id =" << currToken.tid << "closing =" << closing;
2012     if (currToken.flat) {
2013         qCDebug(KHTML_LOG) << "Token is FLAT!";
2014     }
2015     if (!text.isNull()) {
2016         qCDebug(KHTML_LOG) << "text: \"" << text << "\"";
2017     }
2018     unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
2019
2020     if (l) {
2021         qCDebug(KHTML_LOG) << "Attributes: " << l;
2022         for (unsigned long i = 0; i < l; ++i) {
2023             NodeImpl::Id tid = currToken.attrs->idAt(i);
2024             DOMString value = currToken.attrs->valueAt(i);
2025             qCDebug(KHTML_LOG) << "    " << tid << " " << LocalName::fromId(localNamePart(tid)).toString()
2026                      << "=\"" << value.string() << "\"";
2027         }
2028     }
2029 #endif
2030
2031     // In some cases, parseToken() can cause javascript code to be executed
2032     // (for example, when setting an attribute that causes an event handler
2033     // to be created). So we need to protect against re-entrancy into the parser
2034     m_executingScript++;
2035
2036     // pass the token over to the parser, the parser DOES NOT delete the token
2037     parser->parseToken(&currToken);
2038
2039     m_executingScript--;
2040
2041     if (currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces()) {
2042         discard = NoneDiscard;
2043     }
2044
2045     currToken.reset();
2046     if (jsProxy) {
2047         jsProxy->setEventHandlerLineno(0);
2048     }
2049 }
2050
2051 void HTMLTokenizer::processDoctypeToken()
2052 {
2053     // qCDebug(KHTML_LOG) << "Process DoctypeToken (name: " << doctypeToken.name << ", publicID: " << doctypeToken.publicID << ", systemID: " << doctypeToken.systemID;
2054     doctypeToken.publicID = doctypeToken.publicID.simplified();
2055     doctypeToken.systemID = doctypeToken.systemID.simplified();
2056     parser->parseDoctypeToken(&doctypeToken);
2057 }
2058
// Destructor: calls reset() first, then deletes the prospective tokenizer
// and the parser objects held by this tokenizer.
2059 HTMLTokenizer::~HTMLTokenizer()
2060 {
2061     reset();
2062     delete m_prospectiveTokenizer;
2063     delete parser;
2064 }
2065
2066 void HTMLTokenizer::enlargeBuffer(int len)
2067 {
2068     int newsize = qMax(size * 2, size + len);
2069     int oldoffs = (dest - buffer);
2070
2071     buffer = KHTML_REALLOC_QCHAR_VEC(buffer, newsize);
2072     dest = buffer + oldoffs;
2073     size = newsize;
2074 }
2075
2076 void HTMLTokenizer::enlargeRawContentBuffer(int len)
2077 {
2078     int newsize = qMax(rawContentMaxSize * 2, rawContentMaxSize + len);
2079     rawContent = KHTML_REALLOC_QCHAR_VEC(rawContent, newsize);
2080     rawContentMaxSize = newsize;
2081 }
2082
2083 void HTMLTokenizer::notifyFinished(CachedObject *finishedObj)
2084 {
2085     Q_UNUSED(finishedObj);
2086     assert(!cachedScript.isEmpty());
2087     // Make external scripts wait for external stylesheets.
2088     // FIXME: This needs to be done for inline scripts too.
2089     m_hasScriptsWaitingForStylesheets = !parser->doc()->haveStylesheetsLoaded();
2090     if (m_hasScriptsWaitingForStylesheets) {
2091         // qCDebug(KHTML_LOG) << "Delaying script execution until stylesheets have loaded.";
2092         return;
2093     }
2094     // qCDebug(KHTML_LOG) << (finishedObj ? "Processing an external script"  : "Continuing processing of delayed external scripts");
2095
2096     bool done = false;
2097     m_scriptTime.start();
2098     while (!done && cachedScript.head()->isLoaded()) {
2099         if (!continueProcessingScripts()) {
2100             break;
2101         }
2102
2103         CachedScript *cs = cachedScript.dequeue();
2104         DOMString scriptSource = cs->script();
2105 #ifdef TOKEN_DEBUG
2106         qCDebug(KHTML_LOG) << "External script is:" << endl << scriptSource.string();
2107 #endif
2108         setSrc(TokenizerString());
2109
2110         // make sure we forget about the script before we execute the new one
2111         // infinite recursion might happen otherwise
2112         QString cachedScriptUrl(cs->url().string());
2113         cs->deref(this);
2114
2115         scriptExecution(scriptSource.string(), cachedScriptUrl);
2116
2117         done = cachedScript.isEmpty();
2118         if (done) {
2119             assert(!m_hasScriptsWaitingForStylesheets);
2120         } else if (m_hasScriptsWaitingForStylesheets) {
2121             // flag has changed during the script execution,
2122             // so we need to wait for stylesheets again.
2123             done = true;
2124         }
2125         // 'script' is true when we are called synchronously from
2126         // scriptHandler(). In that case scriptHandler() will take care
2127         // of the pending queue.
2128         if (!script) {
2129             while (pendingQueue.count() > 1) {
2130                 TokenizerString t = pendingQueue.pop();
2131                 pendingQueue.top().prepend(t);
2132             }
2133             if (done) {
2134                 write(pendingQueue.pop(), false);
2135             }
2136             // we might be deleted at this point, do not
2137             // access any members.
2138         }
2139     }
2140 }
2141
2142 bool HTMLTokenizer::continueProcessingScripts()
2143 {
2144     if (m_externalScriptsTimerId) {
2145         return false;
2146     }
2147     if (m_scriptTime.elapsed() > m_tokenizerYieldDelay && m_documentTokenizer) {
2148         if ((m_externalScriptsTimerId = startTimer(0))) {
2149             return false;
2150         }
2151     }
2152     return true;
2153 }
2154
2155 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
2156 {
2157     assert(parser->doc()->haveStylesheetsLoaded());
2158     if (m_hasScriptsWaitingForStylesheets) {
2159         notifyFinished(nullptr);
2160     }
2161 }
2162
2163 bool HTMLTokenizer::isWaitingForScripts() const
2164 {
2165     return cachedScript.count();
2166 }
2167
2168 bool HTMLTokenizer::isExecutingScript() const
2169 {
2170     return (m_executingScript > 0);
2171 }
2172
2173 void HTMLTokenizer::setSrc(const TokenizerString &source)
2174 {
2175     lineno += src.lineCount();
2176     src = source;
2177     src.resetLineCount();
2178 }
2179
2180 void HTMLTokenizer::setOnHold(bool _onHold)
2181 {
2182     if (onHold == _onHold) {
2183         return;
2184     }
2185     onHold = _onHold;
2186 }
2187