File indexing completed on 2024-05-05 16:10:18
0001 /* 0002 This file is part of the KDE libraries 0003 0004 Copyright (C) 1997 Martin Jones (mjones@kde.org) 0005 (C) 1997 Torben Weis (weis@kde.org) 0006 (C) 1998 Waldo Bastian (bastian@kde.org) 0007 (C) 1999 Lars Knoll (knoll@kde.org) 0008 (C) 1999 Antti Koivisto (koivisto@kde.org) 0009 (C) 2001-2003 Dirk Mueller (mueller@kde.org) 0010 (C) 2004-2008 Apple Computer, Inc. 0011 (C) 2006-2008 Germain Garand (germain@ebooksfrance.org) 0012 0013 This library is free software; you can redistribute it and/or 0014 modify it under the terms of the GNU Library General Public 0015 License as published by the Free Software Foundation; either 0016 version 2 of the License, or (at your option) any later version. 0017 0018 This library is distributed in the hope that it will be useful, 0019 but WITHOUT ANY WARRANTY; without even the implied warranty of 0020 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0021 Library General Public License for more details. 0022 0023 You should have received a copy of the GNU Library General Public License 0024 along with this library; see the file COPYING.LIB. If not, write to 0025 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0026 Boston, MA 02110-1301, USA. 
0027 */ 0028 //---------------------------------------------------------------------------- 0029 // 0030 // KDE HTML Widget - Tokenizers 0031 0032 // #define TOKEN_DEBUG 1 0033 // #define TOKEN_DEBUG 2 0034 0035 #include "htmltokenizer.h" 0036 #include "html_documentimpl.h" 0037 #include "htmlparser.h" 0038 #include "dtd.h" 0039 0040 #include <misc/loader.h> 0041 0042 #include <khtmlview.h> 0043 #include <khtml_part.h> 0044 #include <xml/dom_docimpl.h> 0045 #include <ecma/kjs_proxy.h> 0046 #include <kcharsets.h> 0047 #include <ctype.h> 0048 #include <assert.h> 0049 #include <QVariant> 0050 #include "khtml_debug.h" 0051 #include <stdlib.h> 0052 0053 #include "kentities_p.h" 0054 #include "htmlprospectivetokenizer.h" 0055 0056 #define PROSPECTIVE_TOKENIZER_ENABLED 1 0057 0058 using namespace khtml; 0059 0060 static const QChar commentStart [] = { '<', '!', '-', '-', QChar::Null }; 0061 static const char doctypeStart [] = "<!doctype"; 0062 static const char publicStart [] = "public"; 0063 static const char systemStart [] = "system"; 0064 0065 static const char scriptEnd [] = "</script"; 0066 static const char xmpEnd [] = "</xmp"; 0067 static const char styleEnd [] = "</style"; 0068 static const char textareaEnd [] = "</textarea"; 0069 static const char titleEnd [] = "</title"; 0070 0071 #ifndef NDEBUG 0072 static const int sTokenizerChunkSize = 2048; 0073 static const int sTokenizerFastYieldDelay = 220; 0074 static const int sTokenizerYieldDelay = 650; 0075 #else 0076 static const int sTokenizerChunkSize = 4096; 0077 static const int sTokenizerFastYieldDelay = 180; 0078 static const int sTokenizerYieldDelay = 450; 0079 #endif 0080 0081 #define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) ) 0082 #define KHTML_REALLOC_QCHAR_VEC(P, N ) (QChar*) realloc(P, sizeof(QChar)*( N )) 0083 #define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P )) 0084 0085 // Full support for MS Windows extensions to Latin-1. 
0086 // Technically these extensions should only be activated for pages 0087 // marked "windows-1252" or "cp1252", but 0088 // in the standard Microsoft way, these extensions infect hundreds of thousands 0089 // of web pages. Note that people with non-latin-1 Microsoft extensions 0090 // are SOL. 0091 // 0092 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp 0093 // http://www.bbsinc.com/iso8859.html 0094 // http://www.obviously.com/ 0095 // 0096 // There may be better equivalents 0097 #if 0 0098 #define fixUpChar(x) 0099 #else 0100 #define fixUpChar(x) \ 0101 switch ((x).unicode()) \ 0102 { \ 0103 case 0x80: (x) = 0x20ac; break; \ 0104 case 0x82: (x) = 0x201a; break; \ 0105 case 0x83: (x) = 0x0192; break; \ 0106 case 0x84: (x) = 0x201e; break; \ 0107 case 0x85: (x) = 0x2026; break; \ 0108 case 0x86: (x) = 0x2020; break; \ 0109 case 0x87: (x) = 0x2021; break; \ 0110 case 0x88: (x) = 0x02C6; break; \ 0111 case 0x89: (x) = 0x2030; break; \ 0112 case 0x8A: (x) = 0x0160; break; \ 0113 case 0x8b: (x) = 0x2039; break; \ 0114 case 0x8C: (x) = 0x0152; break; \ 0115 case 0x8E: (x) = 0x017D; break; \ 0116 case 0x91: (x) = 0x2018; break; \ 0117 case 0x92: (x) = 0x2019; break; \ 0118 case 0x93: (x) = 0x201C; break; \ 0119 case 0x94: (x) = 0X201D; break; \ 0120 case 0x95: (x) = 0x2022; break; \ 0121 case 0x96: (x) = 0x2013; break; \ 0122 case 0x97: (x) = 0x2014; break; \ 0123 case 0x98: (x) = 0x02DC; break; \ 0124 case 0x99: (x) = 0x2122; break; \ 0125 case 0x9A: (x) = 0x0161; break; \ 0126 case 0x9b: (x) = 0x203A; break; \ 0127 case 0x9C: (x) = 0x0153; break; \ 0128 case 0x9E: (x) = 0x017E; break; \ 0129 case 0x9F: (x) = 0x0178; break; \ 0130 default: break; \ 0131 } 0132 #endif 0133 // ---------------------------------------------------------------------------- 0134 0135 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view) 0136 { 0137 view = _view; 0138 buffer = nullptr; 0139 rawContent = nullptr; 0140 rawContentSize = rawContentMaxSize = 
rawContentResync = rawContentSinceLastEntity = 0; 0141 charsets = KCharsets::charsets(); 0142 parser = new KHTMLParser(_view, _doc); 0143 m_executingScript = 0; 0144 m_externalScriptsTimerId = 0; 0145 m_tokenizerYieldDelay = sTokenizerFastYieldDelay; 0146 m_yieldTimer = 0; 0147 m_prospectiveTokenizer = nullptr; 0148 onHold = false; 0149 m_documentTokenizer = true; 0150 m_hasScriptsWaitingForStylesheets = false; 0151 0152 reset(); 0153 } 0154 0155 HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, DOM::DocumentFragmentImpl *i) 0156 { 0157 view = nullptr; 0158 buffer = nullptr; 0159 rawContent = nullptr; 0160 rawContentSize = rawContentMaxSize = rawContentResync = rawContentSinceLastEntity = 0; 0161 charsets = KCharsets::charsets(); 0162 parser = new KHTMLParser(i, _doc); 0163 m_executingScript = 0; 0164 m_externalScriptsTimerId = 0; 0165 m_tokenizerYieldDelay = sTokenizerFastYieldDelay; 0166 m_yieldTimer = 0; 0167 m_prospectiveTokenizer = nullptr; 0168 onHold = false; 0169 m_documentTokenizer = false; 0170 m_hasScriptsWaitingForStylesheets = false; 0171 0172 reset(); 0173 } 0174 0175 void HTMLTokenizer::setNormalYieldDelay() 0176 { 0177 m_tokenizerYieldDelay = sTokenizerYieldDelay; 0178 } 0179 0180 void HTMLTokenizer::reset() 0181 { 0182 assert(m_executingScript == 0); 0183 Q_ASSERT(onHold == false); 0184 m_abort = false; 0185 0186 while (!cachedScript.isEmpty()) { 0187 cachedScript.dequeue()->deref(this); 0188 } 0189 0190 if (buffer) { 0191 KHTML_DELETE_QCHAR_VEC(buffer); 0192 } 0193 buffer = dest = nullptr; 0194 size = 0; 0195 0196 if (rawContent) { 0197 KHTML_DELETE_QCHAR_VEC(rawContent); 0198 } 0199 rawContent = nullptr; 0200 rawContentSize = rawContentMaxSize = rawContentResync = 0; 0201 0202 if (m_yieldTimer > 0) { 0203 killTimer(m_yieldTimer); 0204 m_yieldTimer = 0; 0205 } 0206 0207 if (m_externalScriptsTimerId > 0) { 0208 killTimer(m_externalScriptsTimerId); 0209 m_externalScriptsTimerId = 0; 0210 } 0211 currToken.reset(); 0212 doctypeToken.reset(); 0213 
javascript = false; 0214 } 0215 0216 void HTMLTokenizer::begin() 0217 { 0218 m_executingScript = 0; 0219 onHold = false; 0220 reset(); 0221 size = 254; 0222 buffer = KHTML_ALLOC_QCHAR_VEC(255); 0223 dest = buffer; 0224 tag = NoTag; 0225 pending = NonePending; 0226 discard = NoneDiscard; 0227 pre = false; 0228 prePos = 0; 0229 plaintext = false; 0230 xmp = false; 0231 processingInstruction = false; 0232 script = false; 0233 escaped = false; 0234 style = false; 0235 skipLF = false; 0236 select = false; 0237 comment = false; 0238 doctype = false; 0239 doctypeComment = NoDoctypeComment; 0240 doctypeAllowComment = false; 0241 server = false; 0242 textarea = false; 0243 title = false; 0244 startTag = false; 0245 tquote = NoQuote; 0246 searchCount = 0; 0247 doctypeSearchCount = 0; 0248 doctypeSecondarySearchCount = 0; 0249 Entity = NoEntity; 0250 noMoreData = false; 0251 brokenComments = false; 0252 brokenServer = false; 0253 lineno = 0; 0254 scriptStartLineno = 0; 0255 tagStartLineno = 0; 0256 } 0257 0258 void HTMLTokenizer::processListing(TokenizerString list) 0259 { 0260 bool old_pre = pre; 0261 0262 // This function adds the listing 'list' as 0263 // preformatted text-tokens to the token-collection 0264 // thereby converting TABs. 0265 if (!style) { 0266 pre = true; 0267 } 0268 prePos = 0; 0269 0270 while (!list.isEmpty()) { 0271 checkBuffer(3 * TAB_SIZE); 0272 0273 if (skipLF && (list->unicode() != '\n')) { 0274 skipLF = false; 0275 } 0276 0277 if (skipLF) { 0278 skipLF = false; 0279 ++list; 0280 } else if ((list->unicode() == '\n') || (list->unicode() == '\r')) { 0281 if (discard == LFDiscard) { 0282 // Ignore this LF 0283 discard = NoneDiscard; // We have discarded 1 LF 0284 } else { 0285 // Process this LF 0286 if (pending) { 0287 addPending(); 0288 } 0289 0290 // we used to do it not at all and we want to have 0291 // it fixed for textarea. 
So here we are 0292 if (textarea) { 0293 prePos++; 0294 *dest++ = *list; 0295 } else { 0296 pending = LFPending; 0297 } 0298 } 0299 /* Check for MS-DOS CRLF sequence */ 0300 if (list->unicode() == '\r') { 0301 skipLF = true; 0302 } 0303 ++list; 0304 } else if ((list->unicode() == ' ') || (list->unicode() == '\t')) { 0305 if (pending) { 0306 addPending(); 0307 } 0308 if (*list == ' ') { 0309 pending = SpacePending; 0310 } else { 0311 pending = TabPending; 0312 } 0313 0314 ++list; 0315 } else { 0316 discard = NoneDiscard; 0317 if (pending) { 0318 addPending(); 0319 } 0320 0321 prePos++; 0322 *dest++ = *list; 0323 ++list; 0324 } 0325 0326 } 0327 0328 if ((pending == SpacePending) || (pending == TabPending)) { 0329 addPending(); 0330 } else { 0331 pending = NonePending; 0332 } 0333 0334 prePos = 0; 0335 pre = old_pre; 0336 } 0337 0338 void HTMLTokenizer::parseRawContent(TokenizerString &src) 0339 { 0340 // The 'raw content' mode is a very lax tokenizing mode 0341 // that will absorb anything but the exact closing tag 0342 // that made us enter this mode, *except* if it inside a comment. 0343 // 0344 // Any other tag or comment will be passed verbatim to the parser as part 0345 // of the content. It is used for script, style, and a few others. 
0346 // 0347 assert(textarea || title || !Entity); 0348 assert(!tag); 0349 assert(xmp + textarea + title + style + script == 1); 0350 if (script) { 0351 scriptStartLineno = lineno + src.lineCount(); 0352 } 0353 0354 if (comment) { 0355 parseComment(src); 0356 } 0357 0358 while (!src.isEmpty()) { 0359 checkRawContentBuffer(); 0360 unsigned char ch = src->toLatin1(); 0361 if (!rawContentResync && !brokenComments && !xmp && ch == '-' && 0362 rawContentSize >= 3 && ((!textarea && !title) || rawContentSinceLastEntity >= 3) && !src.escaped() && 0363 QString::fromRawData(rawContent + rawContentSize - 3, 3) == "<!-") { 0364 comment = true; 0365 rawContent[ rawContentSize++ ] = ch; 0366 ++src; 0367 parseComment(src); 0368 continue; 0369 } 0370 if (rawContentResync && !tquote && (ch == '>')) { 0371 ++src; 0372 rawContentSize = rawContentResync - 1; 0373 rawContentResync = 0; 0374 rawContent[ rawContentSize ] = rawContent[ rawContentSize + 1 ] = 0; 0375 if (script) { 0376 scriptHandler(); 0377 } else { 0378 processListing(TokenizerString(rawContent, rawContentSize)); 0379 processToken(); 0380 if (style) { 0381 currToken.tid = ID_STYLE + ID_CLOSE_TAG; 0382 } else if (textarea) { 0383 currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG; 0384 } else if (title) { 0385 currToken.tid = ID_TITLE + ID_CLOSE_TAG; 0386 } else if (xmp) { 0387 currToken.tid = ID_XMP + ID_CLOSE_TAG; 0388 } 0389 processToken(); 0390 script = style = textarea = title = xmp = false; 0391 tquote = NoQuote; 0392 rawContentSize = rawContentResync = 0; 0393 } 0394 return; 0395 } 0396 // possible end of tagname, lets check. 
0397 if (!rawContentResync && !escaped && !src.escaped() && (ch == '>' || ch == '/' || ch <= ' ') && ch && 0398 rawContentSize >= searchStopperLen && ((!textarea && !title) || rawContentSinceLastEntity >= searchStopperLen) && 0399 QString::compare(QString::fromRawData(rawContent + rawContentSize - searchStopperLen, searchStopperLen), 0400 QLatin1String(searchStopper), Qt::CaseInsensitive) == 0) { 0401 0402 // the purpose of rawContentResync is to look for an end tag that could possibly be of the form: 0403 // </endtag junk="more junk>\"><>" > 0404 // IOW, once the '</endtag' sequence has been found, the rest of the tag must still be validated, 0405 // so this micro-tokenizer switches to rawContentResync state until '>' is finally found. 0406 rawContentResync = rawContentSize - searchStopperLen + 1; 0407 tquote = NoQuote; 0408 continue; 0409 } 0410 if (rawContentResync && !escaped) { 0411 if (ch == '\"') { 0412 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote); 0413 } else if (ch == '\'') { 0414 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? 
DoubleQuote : NoQuote; 0415 } else if (tquote != NoQuote && (ch == '\r' || ch == '\n')) { 0416 tquote = NoQuote; 0417 } 0418 } 0419 escaped = (!escaped && ch == '\\'); 0420 if (!rawContentResync && (textarea || title) && !src.escaped() && ch == '&') { 0421 QChar *rawContentDest = rawContent + rawContentSize; 0422 ++src; 0423 parseEntity(src, rawContentDest, true); 0424 rawContentSize = rawContentDest - rawContent; 0425 } else { 0426 rawContent[ rawContentSize++ ] = *src; 0427 ++src; 0428 ++rawContentSinceLastEntity; 0429 } 0430 } 0431 } 0432 0433 void HTMLTokenizer::scriptHandler() 0434 { 0435 QString currentScriptSrc = scriptSrc; 0436 scriptSrc.clear(); 0437 0438 processListing(TokenizerString(rawContent, rawContentSize)); 0439 QString exScript(buffer, dest - buffer); 0440 0441 processToken(); 0442 currToken.tid = ID_SCRIPT + ID_CLOSE_TAG; 0443 processToken(); 0444 0445 // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts. 0446 bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET); 0447 bool effectiveScript = !parser->skipMode() && !followingFrameset; 0448 bool deferredScript = false; 0449 0450 if (effectiveScript) { 0451 CachedScript *cs = nullptr; 0452 0453 // forget what we just got, load from src url instead 0454 if (!currentScriptSrc.isEmpty() && javascript) { 0455 const QString completeScriptUrl = parser->doc()->completeURL(currentScriptSrc); 0456 cs = parser->doc()->docLoader()->requestScript(completeScriptUrl, scriptSrcCharset); 0457 } 0458 0459 if (cs) { 0460 cachedScript.enqueue(cs); 0461 pendingQueue.push(src); 0462 int scriptCount = cachedScript.count(); 0463 setSrc(TokenizerString()); 0464 rawContentSize = rawContentResync = 0; 0465 cs->ref(this); 0466 if (cachedScript.count() == scriptCount) { 0467 deferredScript = true; 0468 } 0469 } else if (currentScriptSrc.isNull()/*no src attribute*/ && view && javascript) { 0470 pendingQueue.push(src); 0471 
setSrc(TokenizerString()); 0472 rawContentSize = rawContentResync = 0; 0473 scriptExecution(exScript, QString(), tagStartLineno /*scriptStartLineno*/); 0474 } else { 0475 // script was filtered or disallowed 0476 effectiveScript = false; 0477 } 0478 } 0479 0480 script = false; 0481 rawContentSize = rawContentResync = 0; 0482 0483 if (!effectiveScript) { 0484 return; 0485 } 0486 0487 if (!m_executingScript && cachedScript.isEmpty()) { 0488 src.append(pendingQueue.pop()); 0489 } else if (cachedScript.isEmpty()) { 0490 write(pendingQueue.pop(), false); 0491 } else if (!deferredScript && pendingQueue.count() > 1) { 0492 TokenizerString t = pendingQueue.pop(); 0493 pendingQueue.top().prepend(t); 0494 } 0495 #if PROSPECTIVE_TOKENIZER_ENABLED 0496 if (!cachedScript.isEmpty() && !m_executingScript) { 0497 if (!m_prospectiveTokenizer) { 0498 m_prospectiveTokenizer = new ProspectiveTokenizer(parser->docPtr()); 0499 } 0500 if (!m_prospectiveTokenizer->inProgress() && !pendingQueue.isEmpty()) { 0501 m_prospectiveTokenizer->begin(); 0502 m_prospectiveTokenizer->write(pendingQueue.top()); 0503 } 0504 } 0505 #endif 0506 0507 } 0508 0509 void HTMLTokenizer::scriptExecution(const QString &str, const QString &scriptURL, 0510 int baseLine) 0511 { 0512 bool oldscript = script; 0513 m_executingScript++; 0514 script = false; 0515 QString url; 0516 if (scriptURL.isNull() && view) { 0517 url = static_cast<DocumentImpl *>(view->part()->document().handle())->URL().url(); 0518 } else { 0519 url = scriptURL; 0520 } 0521 0522 if (view) { 0523 view->part()->executeScript(url, baseLine, Node(), str); 0524 } 0525 m_executingScript--; 0526 script = oldscript; 0527 } 0528 0529 void HTMLTokenizer::parseComment(TokenizerString &src) 0530 { 0531 checkRawContentBuffer(src.length()); 0532 while (src.length()) { 0533 rawContent[ rawContentSize++ ] = *src; 0534 0535 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 0536 qDebug("comment is now: *%s*", src.toString().left(16).toLatin1().constData()); 0537 #endif 
0538 0539 if (src->unicode() == '>') { 0540 bool handleBrokenComments = brokenComments && !(script || style); 0541 bool scriptEnd = false; 0542 if (rawContentSize > 2 && rawContent[rawContentSize - 3] == '-' && 0543 rawContent[rawContentSize - 2] == '-') { 0544 scriptEnd = true; 0545 } 0546 0547 if (handleBrokenComments || scriptEnd) { 0548 ++src; 0549 if (!(title || script || xmp || textarea || style)) { 0550 checkRawContentBuffer(); 0551 rawContent[ rawContentSize ] = 0; 0552 rawContent[ rawContentSize + 1 ] = 0; 0553 currToken.tid = ID_COMMENT; 0554 int size = scriptEnd ? rawContentSize - 3 : rawContentSize - 1; 0555 processListing(TokenizerString(rawContent, size)); 0556 processToken(); 0557 currToken.tid = ID_COMMENT + ID_CLOSE_TAG; 0558 processToken(); 0559 rawContentSize = 0; 0560 } 0561 comment = false; 0562 return; // Finished parsing comment 0563 } 0564 } 0565 ++src; 0566 } 0567 } 0568 0569 void HTMLTokenizer::parseDoctypeComment(TokenizerString &src) 0570 { 0571 while (!src.isEmpty()) { 0572 QChar c = *src; 0573 switch (doctypeComment) { 0574 case DoctypeCommentHalfBegin: { 0575 if (c != '-') { 0576 // Ooops, it's not comment 0577 doctypeComment = DoctypeCommentBogus; 0578 return; 0579 } else { 0580 // Doctype comment begins 0581 doctypeComment = DoctypeComment; 0582 ++src; 0583 } 0584 break; 0585 } 0586 case DoctypeComment: { 0587 if (c == '-') { 0588 // Perhaps this is end of comment 0589 doctypeComment = DoctypeCommentHalfEnd; 0590 ++src; 0591 } else { 0592 // Keep scanning for '--' 0593 ++src; 0594 } 0595 break; 0596 } 0597 case DoctypeCommentHalfEnd: { 0598 if (c == '-') { 0599 // Doctype comment ends 0600 doctypeComment = DoctypeCommentEnd; 0601 return; 0602 } else { 0603 // It's not '--' 0604 ++src; 0605 doctypeComment = DoctypeComment; 0606 } 0607 break; 0608 } 0609 default: { 0610 assert(!"Undefined doctype comment state"); 0611 break; 0612 } 0613 } 0614 } 0615 } 0616 0617 void HTMLTokenizer::parseDoctype(TokenizerString &src) 0618 { 0619 while 
(!src.isEmpty() && doctype) { 0620 QChar c; 0621 bool isWhitespace = false; 0622 int dontAdvance = 0; 0623 if (doctypeComment == DoctypeCommentEnd) { 0624 doctypeComment = NoDoctypeComment; 0625 isWhitespace = true; 0626 } else if (doctypeComment == DoctypeCommentBogus) { 0627 doctypeComment = NoDoctypeComment; 0628 c = '-'; 0629 dontAdvance++; 0630 } else { 0631 c = *src; 0632 if (doctypeAllowComment) { 0633 if (!doctypeComment && c == '-') { 0634 doctypeComment = DoctypeCommentHalfBegin; 0635 ++src; 0636 } 0637 if (doctypeComment) { 0638 parseDoctypeComment(src); 0639 continue; 0640 } 0641 isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' '; 0642 } 0643 } 0644 0645 switch (doctypeToken.state) { 0646 case DoctypeBegin: { 0647 doctypeToken.state = DoctypeBeforeName; 0648 if (isWhitespace) { 0649 // nothing 0650 } 0651 break; 0652 } 0653 case DoctypeBeforeName: { 0654 if (c == '>') { 0655 // Malformed. Just exit. 0656 doctype = false; 0657 } else if (isWhitespace) { 0658 // nothing 0659 } else { 0660 dontAdvance++; 0661 doctypeToken.state = DoctypeName; 0662 } 0663 break; 0664 } 0665 case DoctypeName: { 0666 if (c == '>') { 0667 // Valid doctype. Emit it. 0668 doctype = false; 0669 processDoctypeToken(); 0670 } else if (isWhitespace) { 0671 doctypeSearchCount = 0; // Used now to scan for PUBLIC 0672 doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM 0673 doctypeToken.state = DoctypeAfterName; 0674 } else { 0675 doctypeToken.name.append(c); 0676 } 0677 break; 0678 } 0679 case DoctypeAfterName: { 0680 if (c == '>') { 0681 // Valid doctype. Emit it. 0682 doctype = false; 0683 processDoctypeToken(); 0684 } else if (c == '[') { 0685 if (doctypeSearchCount > 0 || doctypeSecondarySearchCount > 0) { // is there any public/system indicator before? 
0686 doctypeSearchCount = doctypeSecondarySearchCount = 0; 0687 doctypeToken.state = DoctypeBogus; 0688 } 0689 // Found internal subset 0690 doctypeToken.state = DoctypeInternalSubset; 0691 doctypeAllowComment = false; 0692 } else if (!isWhitespace) { 0693 if (c.toLower() == publicStart[doctypeSearchCount]) { 0694 doctypeSearchCount++; 0695 if (doctypeSearchCount == 6) 0696 // Found 'PUBLIC' sequence 0697 { 0698 doctypeToken.state = DoctypeBeforePublicID; 0699 } 0700 } else if (doctypeSearchCount > 0) { 0701 doctypeSearchCount = 0; 0702 doctypeToken.state = DoctypeBogus; 0703 } else if (c.toLower() == systemStart[doctypeSecondarySearchCount]) { 0704 doctypeSecondarySearchCount++; 0705 if (doctypeSecondarySearchCount == 6) 0706 // Found 'SYSTEM' sequence 0707 { 0708 doctypeToken.state = DoctypeBeforeSystemID; 0709 } 0710 } else { 0711 doctypeSecondarySearchCount = 0; 0712 doctypeToken.state = DoctypeBogus; 0713 } 0714 } else { 0715 // Whitespace keeps us in the after name state 0716 } 0717 break; 0718 } 0719 case DoctypeBeforePublicID: { 0720 if (c == '\"' || c == '\'') { 0721 tquote = c == '\"' ? DoubleQuote : SingleQuote; 0722 doctypeToken.state = DoctypePublicID; 0723 doctypeAllowComment = false; 0724 } else if (c == '>') { 0725 // Considered bogus. Don't process the doctype. 0726 doctype = false; 0727 } else if (isWhitespace) { 0728 // nothing 0729 } else { 0730 doctypeToken.state = DoctypeBogus; 0731 } 0732 break; 0733 } 0734 case DoctypePublicID: { 0735 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) { 0736 doctypeToken.state = DoctypeAfterPublicID; 0737 doctypeAllowComment = true; 0738 } else if (c == '>') { 0739 // Considered bogus. Don't process the doctype. 0740 doctype = false; 0741 } else { 0742 doctypeToken.publicID.append(c); 0743 } 0744 break; 0745 } 0746 case DoctypeAfterPublicID: { 0747 if (c == '\"' || c == '\'') { 0748 tquote = c == '\"' ? 
DoubleQuote : SingleQuote; 0749 doctypeToken.state = DoctypeSystemID; 0750 } else if (c == '>') { 0751 // Valid doctype. Emit it now. 0752 doctype = false; 0753 processDoctypeToken(); 0754 } else if (isWhitespace) { 0755 // nothing 0756 } else if (c == '[') { 0757 // Found internal subset 0758 doctypeToken.state = DoctypeInternalSubset; 0759 doctypeAllowComment = false; 0760 } else { 0761 doctypeToken.state = DoctypeBogus; 0762 } 0763 break; 0764 } 0765 case DoctypeBeforeSystemID: { 0766 if (c == '\"' || c == '\'') { 0767 tquote = c == '\"' ? DoubleQuote : SingleQuote; 0768 doctypeToken.state = DoctypeSystemID; 0769 doctypeAllowComment = false; 0770 } else if (c == '>') { 0771 // Considered bogus. Don't process the doctype. 0772 doctype = false; 0773 } else if (isWhitespace) { 0774 // nothing 0775 } else { 0776 doctypeToken.state = DoctypeBogus; 0777 } 0778 break; 0779 } 0780 case DoctypeSystemID: { 0781 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) { 0782 doctypeToken.state = DoctypeAfterSystemID; 0783 doctypeAllowComment = true; 0784 } else if (c == '>') { 0785 // Considered bogus. Don't process the doctype. 0786 doctype = false; 0787 } else { 0788 doctypeToken.systemID.append(c); 0789 } 0790 break; 0791 } 0792 case DoctypeAfterSystemID: { 0793 if (c == '>') { 0794 // Valid doctype. Emit it now. 0795 doctype = false; 0796 processDoctypeToken(); 0797 } else if (isWhitespace) { 0798 // nothing 0799 } else if (c == '[') { 0800 // Found internal subset 0801 doctypeToken.state = DoctypeInternalSubset; 0802 doctypeAllowComment = false; 0803 } else { 0804 doctypeToken.state = DoctypeBogus; 0805 } 0806 break; 0807 } 0808 case DoctypeInternalSubset: { 0809 if (c == ']') { 0810 // Done 0811 doctypeToken.state = DoctypeAfterInternalSubset; 0812 doctypeAllowComment = true; 0813 } else { 0814 doctypeToken.internalSubset.append(c); 0815 } 0816 break; 0817 } 0818 case DoctypeAfterInternalSubset: { 0819 if (c == '>') { 0820 // Valid doctype. 
Emit it now. 0821 doctype = false; 0822 processDoctypeToken(); 0823 } else if (isWhitespace) { 0824 // nothing 0825 } else { 0826 doctypeToken.state = DoctypeBogus; 0827 } 0828 break; 0829 } 0830 case DoctypeBogus: { 0831 if (c == '>') { 0832 // Done with the bogus doctype. 0833 doctype = false; 0834 } else { 0835 // Just keep scanning for '>' 0836 } 0837 break; 0838 } 0839 default: 0840 break; 0841 } 0842 if (!dontAdvance) { 0843 ++src; 0844 } else if (dontAdvance == 1) { 0845 continue; 0846 } else { // double dontAdvance++, do workaround 0847 doctypeComment = DoctypeCommentBogus; 0848 } 0849 } 0850 } 0851 0852 void HTMLTokenizer::parseServer(TokenizerString &src) 0853 { 0854 checkRawContentBuffer(src.length()); 0855 while (!src.isEmpty()) { 0856 rawContent[ rawContentSize++ ] = *src; 0857 if (src->unicode() == '>' && 0858 rawContentSize > 1 && rawContent[rawContentSize - 2] == '%') { 0859 ++src; 0860 server = false; 0861 rawContentSize = 0; 0862 return; // Finished parsing server include 0863 } 0864 ++src; 0865 } 0866 } 0867 0868 void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src) 0869 { 0870 char oldchar = 0; 0871 while (!src.isEmpty()) { 0872 unsigned char chbegin = src->toLatin1(); 0873 if (chbegin == '\'') { 0874 tquote = tquote == SingleQuote ? NoQuote : SingleQuote; 0875 } else if (chbegin == '\"') { 0876 tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote; 0877 } 0878 // Look for '?>' 0879 // some crappy sites omit the "?" before it, so 0880 // we look for an unquoted '>' instead. (IE compatible) 0881 else if (chbegin == '>' && (!tquote || oldchar == '?')) { 0882 // We got a '?>' sequence 0883 processingInstruction = false; 0884 ++src; 0885 discard = LFDiscard; 0886 return; // Finished parsing comment! 0887 } 0888 ++src; 0889 oldchar = chbegin; 0890 } 0891 } 0892 0893 void HTMLTokenizer::parseText(TokenizerString &src) 0894 { 0895 while (!src.isEmpty()) { 0896 // do we need to enlarge the buffer? 
0897 checkBuffer(); 0898 0899 // ascii is okay because we only do ascii comparisons 0900 unsigned char chbegin = src->toLatin1(); 0901 0902 if (skipLF && (chbegin != '\n')) { 0903 skipLF = false; 0904 } 0905 0906 if (skipLF) { 0907 skipLF = false; 0908 ++src; 0909 } else if ((chbegin == '\n') || (chbegin == '\r')) { 0910 if (chbegin == '\r') { 0911 skipLF = true; 0912 } 0913 0914 *dest++ = '\n'; 0915 ++src; 0916 } else { 0917 *dest++ = *src; 0918 ++src; 0919 } 0920 } 0921 } 0922 0923 void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start) 0924 { 0925 if (start) { 0926 cBufferPos = 0; 0927 entityLen = 0; 0928 Entity = SearchEntity; 0929 } 0930 0931 while (!src.isEmpty()) { 0932 ushort cc = src->unicode(); 0933 switch (Entity) { 0934 case NoEntity: 0935 return; 0936 0937 break; 0938 case SearchEntity: 0939 if (cc == '#') { 0940 cBuffer[cBufferPos++] = cc; 0941 ++src; 0942 Entity = NumericSearch; 0943 } else { 0944 Entity = EntityName; 0945 } 0946 0947 break; 0948 0949 case NumericSearch: 0950 if (cc == 'x' || cc == 'X') { 0951 cBuffer[cBufferPos++] = cc; 0952 ++src; 0953 Entity = Hexadecimal; 0954 } else if (cc >= '0' && cc <= '9') { 0955 Entity = Decimal; 0956 } else { 0957 Entity = SearchSemicolon; 0958 } 0959 0960 break; 0961 0962 case Hexadecimal: { 0963 int uc = EntityChar.unicode(); 0964 int ll = qMin<uint>(src.length(), 8); 0965 while (ll--) { 0966 QChar csrc(src->toLower()); 0967 cc = csrc.cell(); 0968 0969 if (csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) { 0970 break; 0971 } 0972 uc = uc * 16 + (cc - (cc < 'a' ? 
'0' : 'a' - 10)); 0973 cBuffer[cBufferPos++] = cc; 0974 ++src; 0975 } 0976 EntityChar = QChar(uc); 0977 Entity = SearchSemicolon; 0978 break; 0979 } 0980 case Decimal: { 0981 int uc = EntityChar.unicode(); 0982 int ll = qMin(src.length(), 9 - cBufferPos); 0983 while (ll--) { 0984 cc = src->cell(); 0985 0986 if (src->row() || !(cc >= '0' && cc <= '9')) { 0987 Entity = SearchSemicolon; 0988 break; 0989 } 0990 0991 uc = uc * 10 + (cc - '0'); 0992 cBuffer[cBufferPos++] = cc; 0993 ++src; 0994 } 0995 EntityChar = QChar(uc); 0996 if (cBufferPos == 9) { 0997 Entity = SearchSemicolon; 0998 } 0999 break; 1000 } 1001 case EntityName: { 1002 int ll = qMin(src.length(), 9 - cBufferPos); 1003 while (ll--) { 1004 QChar csrc = *src; 1005 cc = csrc.cell(); 1006 1007 if (csrc.row() || !((cc >= 'a' && cc <= 'z') || 1008 (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) { 1009 Entity = SearchSemicolon; 1010 break; 1011 } 1012 1013 cBuffer[cBufferPos++] = cc; 1014 ++src; 1015 1016 // be IE compatible and interpret even unterminated entities 1017 // outside tags. like "foo  stuff bla". 
if (tag == NoTag) {
                    int code;
                    const bool found = kde_findEntity(cBuffer, cBufferPos, &code);
                    if (found && code < 256) {
                        EntityChar = code;
                        entityLen = cBufferPos;
                    }
                }
            }
            if (cBufferPos == 9) {
                Entity = SearchSemicolon;
            }
            if (Entity == SearchSemicolon) {
                if (cBufferPos > 1) {
                    int code;
                    const bool found = kde_findEntity(cBuffer, cBufferPos, &code);
                    // IE only accepts unterminated entities < 256,
                    // Gecko accepts them all, but only outside tags
                    if (found && (tag == NoTag || code < 256 || *src == ';')) {
                        EntityChar = code;
                        entityLen = cBufferPos;
                    }
                }
            }
            break;
        }
        case SearchSemicolon:
#ifdef TOKEN_DEBUG
            qCDebug(KHTML_LOG) << "ENTITY " << EntityChar.unicode();
#endif
            fixUpChar(EntityChar);

            if (*src == ';') {
                ++src;
            }

            if (!EntityChar.isNull()) {
                checkBuffer();
                if (entityLen > 0 && entityLen < cBufferPos) {
                    int rem = cBufferPos - entityLen;
                    src.prepend(TokenizerString(QString::fromLatin1(cBuffer + entityLen, rem)));
                }
                src.push(EntityChar);
                rawContentSinceLastEntity = -1;
            } else {
#ifdef TOKEN_DEBUG
                qCDebug(KHTML_LOG) << "unknown entity!";
#endif
                checkBuffer(11);
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for (unsigned int i = 0; i < cBufferPos; i++) {
                    dest[i] = cBuffer[i];
                }
                dest += cBufferPos;
                rawContentSinceLastEntity += cBufferPos + 1;
                if (pre) {
                    prePos += cBufferPos + 1;
                }
            }

            Entity = NoEntity;
            EntityChar = QChar::Null;
            return;
        };
    }
}

// Tag-level state machine.
//
// Consumes characters from `src` and dispatches on the member `tag`
// (TagName, SearchAttribute, AttributeName, SearchEqual, SearchValue,
// QuotedValue, Value, SearchEnd). All progress is kept in members
// (tag, cBuffer/cBufferPos, dest, attrName, tquote, searchCount,
// doctypeSearchCount, currToken), so when `src` runs dry the function simply
// returns and a later call resumes in the same state.
//
// Returns when: `src` is empty, the state is NoTag, a comment or doctype
// start has been recognised and handed to parseComment()/parseDoctype(), an
// entity inside an attribute value was handed to parseEntity() and left
// `src` empty, or the end of the tag has been processed in SearchEnd.
//
// Must not be entered while an entity is being parsed (asserted).
void HTMLTokenizer::parseTag(TokenizerString &src)
{
    assert(!Entity);
    checkRawContentBuffer(src.length());

    while (!src.isEmpty()) {
        checkBuffer();
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
        uint l = 0;
        while (l < src.length() && (src.toString()[l]).toLatin1() != '>') {
            l++;
        }
        qDebug("src is now: *%s*, tquote: %d", src.toString().left(l).toLatin1().constData(), tquote);
#endif
        switch (tag) {
        case NoTag:
            return;
        case TagName: {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            qDebug("TagName");
#endif
            // searchCount > 0 means we are still matching the "<!--" prefix;
            // it indexes the next expected character in commentStart.
            if (searchCount > 0) {
                if (*src == commentStart[searchCount]) {
                    searchCount++;
                    if (searchCount == 2) {
                        doctypeSearchCount++; // A '!' is also part of doctype, so we are moving through that still as well
                    } else {
                        doctypeSearchCount = 0;
                    }

                    if (searchCount == 4) {
#ifdef TOKEN_DEBUG
                        qCDebug(KHTML_LOG) << "Found comment";
#endif
                        // Found '<!--' sequence
                        ++src;
                        dest = buffer; // ignore the previous part of this tag
                        tag = NoTag;

                        comment = true;
                        parseComment(src);
                        return; // Finished parsing tag!
                    }
                    // cuts off the high part, which is okay
                    cBuffer[cBufferPos++] = src->cell();
                    ++src;
                    break;
                } else {
                    searchCount = 0; // Stop looking for '<!--' sequence
                }
            }

            // doctypeSearchCount > 0 means we are still matching "<!doctype"
            // (case-insensitively); it indexes into doctypeStart.
            if (doctypeSearchCount > 0) {
                if ((*src).toLower() == doctypeStart[doctypeSearchCount]) {
                    doctypeSearchCount++;
                    cBuffer[cBufferPos++] = src->cell();
                    ++src;
                    if (doctypeSearchCount == 9) {
                        // Found '<!DOCTYPE' sequence
                        tag = NoTag;
                        doctypeAllowComment = true;
                        doctypeComment = NoDoctypeComment;
                        doctypeToken.reset();
                        doctype = true;

                        parseDoctype(src);
                        return;
                    }
                    break;
                } else {
                    doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
                }
            }

            // Accumulate the tag name into cBuffer until whitespace/control,
            // '>' or the buffer capacity (CBUFLEN) is reached. `ll` bounds the
            // loop by both the available input and the remaining capacity.
            bool finish = false;
            unsigned int ll = qMin(src.length(), CBUFLEN - cBufferPos);
            while (ll--) {
                ushort curchar = src->unicode();
                if (curchar <= ' ' || curchar == '>') {
                    finish = true;
                    break;
                }
                // this is a nasty performance trick. will work for the A-Z
                // characters, but not for others. if it contains one,
                // we fail anyway
                char cc = curchar;
                cBuffer[cBufferPos++] = cc | 0x20;
                ++src;
            }

            // Disadvantage: we add the possible rest of the tag
            // as attribute names. ### judge if this causes problems
            if (finish || CBUFLEN == cBufferPos) {
                bool beginTag;
                char *ptr = cBuffer;
                unsigned int len = cBufferPos;
                cBuffer[cBufferPos] = '\0';
                if ((cBufferPos > 0) && (*ptr == '/')) {
                    // End Tag
                    beginTag = false;
                    ptr++;
                    len--;
                } else
                    // Start Tag
                {
                    beginTag = true;
                }
                // Accept empty xml tags like <br/>
                if (len > 1 && ptr[len - 1] == '/') {
                    ptr[--len] = '\0';
                    // if it is like <br/> and not like <input/ value=foo>, take it as flat
                    if (*src == '>') {
                        currToken.flat = true;
                    }
                }

                // tagID stays 0 when the name is not a valid qualified name.
                uint tagID = 0;
                if (!tagID) {
                    DOMString tagName(ptr);
                    if (Element::khtmlValidQualifiedName(tagName)) {
                        safeLocalName = LocalName::fromString(tagName, IDS_NormalizeLower);
                        tagID = safeLocalName.id();
                    }
#ifdef TOKEN_DEBUG
                    // NOTE(review): this message is emitted even when tagID
                    // was just resolved above -- confirm whether it should be
                    // limited to the tagID == 0 case.
                    QByteArray tmp(ptr, len + 1);
                    qCDebug(KHTML_LOG) << "Unknown tag: \"" << tmp.data() << "\"";
#endif
                }
                if (tagID) {
#ifdef TOKEN_DEBUG
                    QByteArray tmp(ptr, len + 1);
                    qCDebug(KHTML_LOG) << "found tag id=" << tagID << ": " << tmp.data();
#endif
                    // Closing tags are encoded as tagID + ID_CLOSE_TAG.
                    currToken.tid = beginTag ? tagID : tagID + ID_CLOSE_TAG;
                }
                dest = buffer;
                tag = SearchAttribute;
                cBufferPos = 0;
            }
            break;
        }
        case SearchAttribute: {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            qDebug("SearchAttribute");
#endif
            // Skip whitespace/control characters; the first other character
            // decides the next state and is NOT consumed here.
            bool atespace = false;
            ushort curchar;
            while (!src.isEmpty()) {
                curchar = src->unicode();
                if (curchar > ' ') {
                    if (curchar == '<' || curchar == '>') {
                        tag = SearchEnd;
                    } else if (atespace && (curchar == '\'' || curchar == '"')) {
                        // A quote after whitespace: treated as a value with an
                        // empty attribute name (id 0 stored in buffer[0]).
                        tag = SearchValue;
                        *dest++ = 0;
                        attrName = DOMString("");
                    } else {
                        tag = AttributeName;
                    }

                    cBufferPos = 0;
                    break;
                }
                atespace = true;
                ++src;
            }
            break;
        }
        case AttributeName: {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            qDebug("AttributeName");
#endif
            ushort curchar;
            int ll = qMin(src.length(), CBUFLEN - cBufferPos);

            while (ll--) {
                curchar = src->unicode();
                // Cheap pre-filter: all name terminators are <= '>'.
                if (curchar <= '>') {
                    if (curchar <= ' ' || curchar == '=' || curchar == '>') {
                        unsigned int a;
                        cBuffer[cBufferPos] = '\0';
                        a = LocalName::fromString(DOMString(cBuffer), IDS_NormalizeLower).id(); // ### still deep copy?
                        // Ids above ATTR_LAST_ATTR are treated as unknown (0).
                        if (a > ATTR_LAST_ATTR) {
                            a = 0;
                        }

                        if (!a) {
                            // did we just get /> or e.g checked/>
                            if (curchar == '>' && cBufferPos >= 1 && cBuffer[cBufferPos - 1] == '/') {
                                currToken.flat = true;
                                // Temporarily strip the trailing '/' to retry
                                // the lookup, then restore it.
                                cBuffer[cBufferPos - 1] = '\0';
                                if (cBufferPos > 1) {
                                    a = LocalName::fromString(DOMString(cBuffer), IDS_NormalizeLower).id();
                                }
                                if (a > ATTR_LAST_ATTR) {
                                    a = 0;
                                }
                                cBuffer[cBufferPos - 1] = '/';
                            }
                            if (!a) {
                                // Unknown attribute: keep its textual name.
                                attrName = DOMString(cBuffer, cBufferPos);
                            }
                        }

                        // buffer[0] carries the attribute id (0 = unknown);
                        // the value text, if any, is appended after it.
                        dest = buffer;
                        *dest++ = a;
#ifdef TOKEN_DEBUG
                        if (!a || (cBufferPos && *cBuffer == '!')) {
                            qCDebug(KHTML_LOG) << "Unknown attribute: *" << QByteArray(cBuffer, cBufferPos + 1).data() << "*";
                        } else {
                            qCDebug(KHTML_LOG) << "Known attribute: " << QByteArray(cBuffer, cBufferPos + 1).data();
                        }
#endif

                        tag = SearchEqual;
                        break;
                    }
                }
                // Lower-case ASCII A-Z while copying the name.
                cBuffer[cBufferPos++] =
                    (curchar >= 'A' && curchar <= 'Z') ? curchar | 0x20 : curchar;
                ++src;
            }
            // Name filled the whole buffer: cut it here and treat it as an
            // unknown attribute.
            if (cBufferPos == CBUFLEN) {
                cBuffer[cBufferPos] = '\0';
                attrName = DOMString(cBuffer, cBufferPos);
                dest = buffer;
                *dest++ = 0;
                tag = SearchEqual;
            }
            break;
        }
        case SearchEqual: {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            qDebug("SearchEqual");
#endif
            ushort curchar;
            bool atespace = false;
            while (!src.isEmpty()) {
                curchar = src->unicode();
                if (curchar > ' ') {
                    if (curchar == '=') {
#ifdef TOKEN_DEBUG
                        qCDebug(KHTML_LOG) << "found equal";
#endif
                        tag = SearchValue;
                        ++src;
                    } else if (atespace && (curchar == '\'' || curchar == '"')) {
                        tag = SearchValue;
                        *dest++ = 0;
                        attrName = DOMString("");
                    } else {
                        // No '=' follows: record the attribute with an empty value.
                        DOMString v("");
                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
                        dest = buffer;
                        tag = SearchAttribute;
                    }
                    break;
                }
                atespace = true;
                ++src;
            }
            break;
        }
        case SearchValue: {
            // Skip whitespace, then choose quoted vs. unquoted value parsing.
            ushort curchar;
            while (!src.isEmpty()) {
                curchar = src->unicode();
                if (curchar > ' ') {
                    if ((curchar == '\'' || curchar == '\"')) {
                        tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
                        tag = QuotedValue;
                        ++src;
                    } else {
                        tag = Value;
                    }

                    break;
                }
                ++src;
            }
            break;
        }
        case QuotedValue: {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            qDebug("QuotedValue");
#endif
            ushort curchar;
            while (!src.isEmpty()) {
                checkBuffer();

                curchar = src->unicode();
                // Cheap pre-filter: '"', '&' and '\'' are all <= '\''.
                if (curchar <= '\'' && !src.escaped()) {
                    // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
                    if (curchar == '&') {
                        ++src;
                        parseEntity(src, dest, true);
                        break;
                    } else if ((tquote == SingleQuote && curchar == '\'') ||
                               (tquote == DoubleQuote && curchar == '\"')) {
                        // some <input type=hidden> rely on trailing spaces. argh
                        while (dest > buffer + 1 && (*(dest - 1) == '\n' || *(dest - 1) == '\r')) {
                            dest--; // remove trailing newlines
                        }
                        // buffer[0] holds the attribute id; the value starts at buffer + 1.
                        DOMString v(buffer + 1, dest - buffer - 1);
                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);

                        dest = buffer;
                        tag = SearchAttribute;
                        tquote = NoQuote;
                        ++src;
                        break;
                    }
                }
                *dest++ = *src;
                ++src;
            }
            break;
        }
        case Value: {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            qDebug("Value");
#endif
            ushort curchar;
            while (!src.isEmpty()) {
                checkBuffer();
                curchar = src->unicode();
                if (curchar <= '>' && !src.escaped()) {
                    // parse Entities
                    if (curchar == '&') {
                        ++src;
                        parseEntity(src, dest, true);
                        break;
                    }
                    // no quotes. Every space means end of value
                    // '/' does not delimit in IE!
                    // HTML5: must not contain any literal space characters, any U+0022 QUOTATION MARK (") characters,
                    // U+0027 APOSTROPHE (') characters, U+003D EQUALS SIGN (=) characters, U+003C LESS-THAN SIGN (<) characters,
                    // U+003E GREATER-THAN SIGN (>) characters, or U+0060 GRAVE ACCENT (`) characters, and must not be the empty string.
                    // Real life: images.google.com uses URLs including form arguments (foo=bar)
                    // in unquoted parameters --- with an html5 <!doctype html> DTD.
                    // Real life takes priority, so we accept at least =
                    // NOTE(review): the enclosing pre-filter is `curchar <= '>'`,
                    // so the '`' test below looks unreachable -- confirm.
                    if (curchar <= ' ' || curchar == '>' || curchar == '\'' || curchar == '"' || curchar == '<' || /*curchar == '=' ||*/ curchar == '`') {
                        DOMString v(buffer + 1, dest - buffer - 1);
                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
                        dest = buffer;
                        tag = SearchAttribute;
                        break;
                    }
                }

                *dest++ = *src;
                ++src;
            }
            break;
        }
        case SearchEnd: {
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
            qDebug("SearchEnd");
#endif
            // Skip to the next '<' or '>'; a '/' on the way marks the tag flat.
            while (!src.isEmpty()) {
                if (*src == '<' || *src == '>') {
                    break;
                }

                if (*src == '/') {
                    currToken.flat = true;
                }

                ++src;
            }
            // Input exhausted before the terminator: stay in SearchEnd and
            // wait for more data.
            // NOTE(review): the two `*src` comparisons are only evaluated when
            // src.isEmpty() is true, i.e. they read from an exhausted source --
            // confirm TokenizerString makes that safe.
            if (src.isEmpty() && *src != '<' && *src != '>') {
                break;
            }

            searchCount = 0; // Stop looking for '<!--' sequence
            tag = NoTag;
            tquote = NoQuote;
            // A '<' is left in the source so it starts the next tag.
            if (*src == '>') {
                ++src;
            }

            if (!currToken.tid) { //stop if tag is unknown
                return;
            }

            uint tagID = currToken.tid;
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
            qCDebug(KHTML_LOG) << "appending Tag: " << tagID;
#endif
            // When parsing HTML flat tags like <div /> should
            // be ignored, the only exception is SCRIPT, and
            // tags with forbidden end-tags
            if (tagID < ID_CLOSE_TAG && tagID != ID_SCRIPT &&
                    DOM::endTagRequirement(tagID) != DOM::FORBIDDEN &&
                    parser->doc()->htmlMode() != DocumentImpl::XHtml) {
                currToken.flat = false;
            }

            bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
            HTMLScriptElementImpl *prevScriptElem = nullptr;

            if (tagID >= ID_CLOSE_TAG) {
                // Strip the closing-tag offset so the switch below sees the plain id.
                tagID -= ID_CLOSE_TAG;
            } else if (tagID == ID_SCRIPT) {
                prevScriptElem = parser->currentScriptElement();
                DOMStringImpl *a = nullptr;
                scriptSrc.clear(); scriptSrcCharset.clear();
                if (currToken.attrs && /* potentially have a ATTR_SRC ? */
                        view &&  /* are we a regular tokenizer or just for innerHTML ? */
                        parser->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
                   ) {
                    if ((a = currToken.attrs->getValue(ATTR_SRC))) {
                        scriptSrc = DOMString(a).trimSpaces().string();
                    }
                    if ((a = currToken.attrs->getValue(ATTR_CHARSET))) {
                        scriptSrcCharset = DOMString(a).string().trimmed();
                    }
                    if (scriptSrcCharset.isEmpty() && view) {
                        scriptSrcCharset = parser->doc()->view()->part()->encoding();
                    }
                }
                javascript = true;
            }

            // Hands currToken to the parser and resets it.
            processToken();

            if (javascript) {
                // Only keep the flag if processing the token produced a new
                // script element, and that element reports a valid script.
                HTMLScriptElementImpl *sc = parser->currentScriptElement();
                javascript = (sc && sc != prevScriptElem) ? sc->isValidScript() : false;
            }

            if (parser->selectMode() && beginTag) {
                discard = AllDiscard;
            }

            // Per-element follow-up: whitespace modes and raw-content elements.
            switch (tagID) {
            case ID_LISTING:
            case ID_PRE:
                pre = beginTag;
                if (beginTag) {
                    discard = LFDiscard;
                }
                prePos = 0;
                break;
            case ID_BR:
                prePos = 0;
                break;
            case ID_SCRIPT:
                if (beginTag) {
                    searchStopper = scriptEnd;
                    searchStopperLen = 8;
                    script = true;
                    parseRawContent(src);
                } else if (tagID < ID_CLOSE_TAG) { // Handle <script src="foo"/>
                    script = true;
                    scriptHandler();
                }
                break;
            case ID_STYLE:
                if (beginTag) {
                    searchStopper = styleEnd;
                    searchStopperLen = 7;
                    style = true;
                    parseRawContent(src);
                }
                break;
            case ID_TEXTAREA:
                if (beginTag) {
                    searchStopper = textareaEnd;
                    searchStopperLen = 10;
                    textarea = true;
                    discard = NoneDiscard;
                    rawContentSinceLastEntity = 0;
                    parseRawContent(src);
                }
                break;
            case ID_TITLE:
                if (beginTag) {
                    searchStopper = titleEnd;
                    searchStopperLen = 7;
                    title = true;
                    rawContentSinceLastEntity = 0;
                    parseRawContent(src);
                }
                break;
            case ID_XMP:
                if (beginTag) {
                    searchStopper = xmpEnd;
                    searchStopperLen = 5;
                    xmp = true;
                    parseRawContent(src);
                }
                break;
            case ID_SELECT:
                select = beginTag;
                break;
            case ID_PLAINTEXT:
                plaintext = beginTag;
                break;
            }
            return; // Finished parsing tag!
        }
        } // end switch
    }
    return;
}

// Writes the whitespace recorded in `pending` to the output buffer and clears
// it. Inside <select> (and outside comment/script) any pending whitespace
// becomes a single space. Otherwise LF resets the column counter `prePos`,
// a space advances it by one, and a tab advances it to the next TAB_SIZE stop.
// Must not be called with pending == NonePending outside select mode (asserted).
void HTMLTokenizer::addPending()
{
    if (select && !(comment || script)) {
        *dest++ = ' ';
    } else {
        switch (pending) {
        case LFPending:  *dest++ = QLatin1Char('\n'); prePos = 0; break;
        case SpacePending: *dest++ = QLatin1Char(' '); ++prePos; break;
        case TabPending: {
            // Don't expand tabs inside <textarea> or script
            int p = TAB_SIZE - (prePos % TAB_SIZE);
            if (textarea || script) {
                *dest++ = QLatin1Char('\t');
            } else {
                for (int x = 0; x < p; x++) {
                    *dest++ = QLatin1Char(' ');
                }
            }
            // prePos advances by the tab width in both branches.
            prePos += p;
            break;
        }
        case NonePending:
            assert(0);
        }
    }

    pending = NonePending;
}

// Decides whether the main tokenizing loop may keep going.
// `processedCount` is the caller's running character counter; it is
// incremented on every "continue" answer and reset whenever the chunk
// threshold is crossed. Returns false after starting a zero-delay yield
// timer, true otherwise.
inline bool HTMLTokenizer::continueProcessing(int &processedCount)
{
    // We don't want to be checking elapsed time with every character, so we only check after we've
    // processed a certain number of characters. We also do not do suspension if we're
    // parsing something like innerHTML.
if (!m_executingScript && processedCount > sTokenizerChunkSize && cachedScript.isEmpty()) {
        processedCount = 0;
        if (m_time.elapsed() > m_tokenizerYieldDelay && m_documentTokenizer) {
            // Yield: resume from timerEvent(), and use the shorter delay
            // for subsequent slices.
            m_yieldTimer = startTimer(0);
            m_tokenizerYieldDelay = sTokenizerFastYieldDelay;
            return false;
        }
    }
    processedCount++;
    return true;
}

#include "khtmlpart_p.h"
// Feeds `str` into the tokenizer and tokenizes as much as possible.
//
// - Does nothing once `buffer` has been released.
// - While a script is executing (and `appendData` is set) or external
//   scripts are outstanding, the data is only queued in `pendingQueue`.
// - While on hold, or while a yield timer is armed, the data is appended to
//   `src` but not processed here.
// - Otherwise runs the main dispatch loop until `src` is empty, the
//   tokenizer is aborted, or continueProcessing() asks to yield.
//
// May call end() at the bottom, which (per the comment there) can delete
// this object.
void HTMLTokenizer::write(const TokenizerString &str, bool appendData)
{
#ifdef TOKEN_DEBUG
    qCDebug(KHTML_LOG) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")";
#endif
    if (!buffer) {
        return;
    }

    if ((m_executingScript && appendData) || cachedScript.count()) {
        // don't parse; we will do this later
        if (pendingQueue.isEmpty()) {
            pendingQueue.push(str);
        } else if (appendData) {
            pendingQueue.bottom().append(str);
        } else {
            pendingQueue.top().append(str);
        }
#if PROSPECTIVE_TOKENIZER_ENABLED
        if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData) {
            m_prospectiveTokenizer->write(str);
        }
#endif
        return;
    }

#if PROSPECTIVE_TOKENIZER_ENABLED
    if (m_prospectiveTokenizer && m_prospectiveTokenizer->inProgress() && appendData) {
        m_prospectiveTokenizer->end();
    }
#endif

    if (onHold) {
        src.append(str);
        return;
    }

    if (!src.isEmpty()) {
        src.append(str);
    } else {
        setSrc(str);
    }

    // Once a timer is set, it has control of when the tokenizer continues.
    if (m_yieldTimer > 0) {
        return;
    }

    int processedCount = 0;
    m_time.start();

    while (!src.isEmpty()) {
        if (m_abort || !continueProcessing(processedCount)) {
            break;
        }
        // do we need to enlarge the buffer?
        checkBuffer();

        ushort cc = src->unicode();

        // skipLF is set after a '\r'; it only applies if the very next
        // character is '\n'.
        if (skipLF && (cc != '\n')) {
            skipLF = false;
        }

        // The order of this chain is the priority of the tokenizer modes.
        if (skipLF) {
            skipLF = false;
            ++src;
        } else if (Entity) {
            parseEntity(src, dest);
        } else if (plaintext) {
            parseText(src);
        } else if (script) {
            parseRawContent(src);
        } else if (style) {
            parseRawContent(src);
        } else if (xmp) {
            parseRawContent(src);
        } else if (textarea) {
            parseRawContent(src);
        } else if (title) {
            parseRawContent(src);
        } else if (comment) {
            parseComment(src);
        } else if (doctypeComment && doctypeComment != DoctypeCommentEnd && doctypeComment != DoctypeCommentBogus) {
            parseDoctypeComment(src);
        } else if (doctype) {
            parseDoctype(src);
        } else if (server) {
            parseServer(src);
        } else if (processingInstruction) {
            parseProcessingInstruction(src);
        } else if (tag) {
            parseTag(src);
        } else if (startTag) {
            // The previous character was '<'; `cc` is the character after it
            // and has not been consumed yet.
            startTag = false;

            switch (cc) {
            case '/':
                break;
            case '!': {
                // <!-- comment --> or <!DOCTYPE ...>
                searchCount = 1; // Look for '<!--' sequence to start comment...
                doctypeSearchCount = 1; // ... or for '<!DOCTYPE' sequence to start doctype
                break;
            }
            case '?': {
                // xml processing instruction
                processingInstruction = true;
                tquote = NoQuote;
                parseProcessingInstruction(src);
                continue;
            }
            case '%':
                if (!brokenServer) {
                    // <% server stuff, handle as comment %>
                    server = true;
                    tquote = NoQuote;
                    parseServer(src);
                    continue;
                }
            // else fall through
            default: {
                if (((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
                    // Start of a Start-Tag
                } else {
                    // Invalid tag
                    // Add as is
                    if (pending) {
                        addPending();
                    }
                    *dest = '<';
                    dest++;
                    continue;
                }
            }
            }; // end case

            // According to SGML any LF immediately after a starttag, or
            // immediately before an endtag should be ignored.
            // ### Gecko and MSIE though only ignores LF immediately after
            // starttags and only for PRE elements -- asj (28/06-2005)
            if (pending) {
                if (!select) {
                    addPending();
                } else {
                    pending = NonePending;
                }
            }

            // Cancel unused discards
            discard = NoneDiscard;
            // if (!endTag) discard = LFDiscard;

            // Flush any text collected before this tag as its own token.
            processToken();

            cBufferPos = 0;
            tag = TagName;
            parseTag(src);
        } else if (cc == '&' && !src.escaped()) {
            ++src;
            if (pending) {
                addPending();
            }
            discard = NoneDiscard;
            parseEntity(src, dest, true);
        } else if (cc == '<' && !src.escaped()) {
            tagStartLineno = lineno + src.lineCount();
            ++src;
            discard = NoneDiscard;
            startTag = true;
        } else if ((cc == '\n') || (cc == '\r')) {
            if (discard == SpaceDiscard) {
                discard = NoneDiscard;
            }

            if (discard == LFDiscard) {
                // Ignore one LF
                discard = NoneDiscard;
            } else if (discard == AllDiscard) {
                // Ignore
            } else {
                if (select && !script) {
                    pending = LFPending;
                } else {
                    if (pending) {
                        addPending();
                    }
                    pending = LFPending;
                }
            }

            /* Check for MS-DOS CRLF sequence */
            if (cc == '\r') {
                skipLF = true;
            }
            ++src;
        } else if ((cc == ' ') || (cc == '\t')) {
            if (discard == LFDiscard) {
                discard = NoneDiscard;
            }

            if (discard == SpaceDiscard) {
                // Ignore one space
                discard = NoneDiscard;
            } else if (discard == AllDiscard) {
                // Ignore
            } else {
                if (select && !script) {
                    if (!pending) {
                        pending = SpacePending;
                    }
                } else {
                    if (pending) {
                        addPending();
                    }
                    if (cc == ' ') {
                        pending = SpacePending;
                    } else {
                        pending = TabPending;
                    }
                }
            }

            ++src;
        } else {
            // Ordinary text character.
            if (pending) {
                addPending();
            }

            discard = NoneDiscard;
            if (pre) {
                prePos++;
            }
            *dest = *src;
            fixUpChar(*dest);
            ++dest;
            ++src;
        }
    }

    if (noMoreData && cachedScript.isEmpty() && !m_executingScript && m_yieldTimer <= 0) {
        end(); // this actually causes us to be deleted
    }
}

// Handles the tokenizer's two timers: the yield timer resumes tokenizing of
// the data already held in `src`; the external-scripts timer retries
// notifyFinished() unless the view still has a layout pending.
void HTMLTokenizer::timerEvent(QTimerEvent *e)
{
    if (e->timerId() == m_yieldTimer) {
        killTimer(m_yieldTimer);
        m_yieldTimer = 0;
        // An empty write re-enters the main loop with the existing source.
        write(TokenizerString(), true);
    } else if (e->timerId() == m_externalScriptsTimerId) {
        if (view && view->hasLayoutPending()) {
            // all stylesheets are loaded but the style modifications
            // they triggered have yet to be applied, BBIAB
            return;
        }
        killTimer(m_externalScriptsTimerId);
        m_externalScriptsTimerId = 0;
        notifyFinished(nullptr);
    }
}

// Flushes the last token (unless a tag is half-parsed), releases the output
// and raw-content buffers, and emits finishedParsing(). Callers note that
// this ends up deleting the tokenizer.
void HTMLTokenizer::end()
{
    if (buffer) {
        // parseTag is using the buffer for different matters
        if (!tag) {
            processToken();
        }

        // NOTE(review): `buffer` is re-tested after processToken(); presumably
        // because token processing can release it -- confirm.
        if (buffer) {
            KHTML_DELETE_QCHAR_VEC(buffer);
        }

        if (rawContent) {
            KHTML_DELETE_QCHAR_VEC(rawContent);
        }

        rawContent = nullptr;
        rawContentSize = rawContentMaxSize = rawContentResync = 0;
        buffer = nullptr;
    }
    emit finishedParsing();
}

// Called when no more input will arrive. First recovers from an unterminated
// title/comment/server block (see below), then marks the input as complete
// and calls end() if nothing else is outstanding.
void HTMLTokenizer::finish()
{
    // The purpose of this iteration is to recover from 'raw content' tokenizing mode.
    // In this mode, any error such as the lack of a closing tag (for the considered element) or of a closing comment,
    // would result in the entire document being absorbed in one node.
    // When it happens, we simply put back in the input buffer what this mode's output has accumulated so far,
    // and retokenize after either disabling the 'raw content' mode (by setting the corresponding members to false)
    // or after setting a few flags disabling some lax parsing 'features' (brokenComments/brokenServer).
    while ((title || comment || server) && rawContent && rawContentSize) {
        // we've found an unmatched comment start
        if (comment) {
            brokenComments = true;
        } else if (server) {
            brokenServer = true;
        }

        checkRawContentBuffer();
        rawContent[ rawContentSize ] = 0;
        rawContent[ rawContentSize + 1 ] = 0;
        int pos;
        QString food;
        if (title || style || script || textarea) {
            rawContentSinceLastEntity = 0;
            food.setUnicode(rawContent, rawContentSize);
        } else if (server) {
            // Re-feed with the leading '<' restored; brokenServer now stops
            // "<%" from being treated as a server block again.
            food = "<";
            food += QString(rawContent, rawContentSize);
        } else {
            // Unterminated comment: re-feed everything after the first '>'.
            pos = QString::fromRawData(rawContent, rawContentSize).indexOf('>');
            food.setUnicode(rawContent + pos + 1, rawContentSize - pos - 1); // deep copy
        }
        KHTML_DELETE_QCHAR_VEC(rawContent);
        rawContent = nullptr;
        rawContentSize = rawContentMaxSize = rawContentResync = 0;

        comment = server = title = false;
        if (!food.isEmpty()) {
            write(food, true);
        }
    }
    // this indicates we will not receive any more data... but if we are waiting on
    // an external script to load, we can't finish parsing until that is done
    noMoreData = true;
    if (cachedScript.isEmpty() && !m_executingScript && !onHold && m_yieldTimer <= 0) {
        end(); // this actually causes us to be deleted
    }
}

// Hands the current token to the parser.
//
// If text has accumulated in the output buffer it becomes the token's text
// (token id forced to ID_TEXT unless it is ID_COMMENT). If there is neither
// text nor a token id, the token is just reset. `dest` is rewound to the
// start of the buffer, and currToken is reset after the parser has seen it.
void HTMLTokenizer::processToken()
{
    KJSProxy *jsProxy = view ? view->part()->jScript() : nullptr;
    if (jsProxy) {
        jsProxy->setEventHandlerLineno(tagStartLineno);
    }
    if (dest > buffer) {
#if 0
        if (currToken.tid) {
            qDebug("unexpected token id: %d, str: *%s*", currToken.tid, QString::fromRawData(buffer, dest - buffer).toLatin1().constData());
            assert(0);
        }

#endif
        currToken.text = new DOMStringImpl(buffer, dest - buffer);
        currToken.text->ref();
        if (currToken.tid != ID_COMMENT) {
            currToken.tid = ID_TEXT;
        }
    } else if (!currToken.tid) {
        // Nothing to deliver.
        currToken.reset();
        if (jsProxy) {
            jsProxy->setEventHandlerLineno(lineno + src.lineCount());
        }
        return;
    }

    dest = buffer;

#ifdef TOKEN_DEBUG
    QString text;
    bool closing = (currToken.tid > ID_CLOSE_TAG);
    int rid = currToken.tid - (closing ? ID_CLOSE_TAG : 0);
    if (currToken.text) {
        text = QString::fromRawData(currToken.text->s, currToken.text->l);
    }
    qCDebug(KHTML_LOG) << "Token -->" << LocalName::fromId(localNamePart(rid)).toString()
                       << "id =" << currToken.tid << "closing =" << closing;
    if (currToken.flat) {
        qCDebug(KHTML_LOG) << "Token is FLAT!";
    }
    if (!text.isNull()) {
        qCDebug(KHTML_LOG) << "text: \"" << text << "\"";
    }
    unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;

    if (l) {
        qCDebug(KHTML_LOG) << "Attributes: " << l;
        for (unsigned long i = 0; i < l; ++i) {
            NodeImpl::Id tid = currToken.attrs->idAt(i);
            DOMString value = currToken.attrs->valueAt(i);
            qCDebug(KHTML_LOG) << "    " << tid << " " << LocalName::fromId(localNamePart(tid)).toString()
                               << "=\"" << value.string() << "\"";
        }
    }
#endif

    // In some cases, parseToken() can cause javascript code to be executed
    // (for example, when setting an attribute that causes an event handler
    // to be created). So we need to protect against re-entrancy into the parser
    m_executingScript++;

    // pass the token over to the parser, the parser DOES NOT delete the token
    parser->parseToken(&currToken);

    m_executingScript--;

    if (currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces()) {
        discard = NoneDiscard;
    }

    currToken.reset();
    if (jsProxy) {
        jsProxy->setEventHandlerLineno(0);
    }
}

// Whitespace-normalises the doctype's public and system identifiers and
// passes the doctype token to the parser.
void HTMLTokenizer::processDoctypeToken()
{
    // qCDebug(KHTML_LOG) << "Process DoctypeToken (name: " << doctypeToken.name << ", publicID: " << doctypeToken.publicID << ", systemID: " << doctypeToken.systemID;
    doctypeToken.publicID = doctypeToken.publicID.simplified();
    doctypeToken.systemID = doctypeToken.systemID.simplified();
    parser->parseDoctypeToken(&doctypeToken);
}

// Resets the tokenizer state, then deletes the prospective tokenizer and the
// parser (both are deleted here, so this object owns them).
HTMLTokenizer::~HTMLTokenizer()
{
    reset();
    delete m_prospectiveTokenizer;
    delete parser;
}

// Grows the output buffer to at least `size + len` QChars (doubling when that
// is larger) and re-bases `dest` onto the reallocated storage.
// NOTE(review): the realloc result is not checked for failure.
void HTMLTokenizer::enlargeBuffer(int len)
{
    int newsize = qMax(size * 2, size + len);
    int oldoffs = (dest - buffer);

    buffer = KHTML_REALLOC_QCHAR_VEC(buffer, newsize);
    dest = buffer + oldoffs;
    size = newsize;
}

// Grows the raw-content buffer to at least `rawContentMaxSize + len` QChars
// (doubling when that is larger).
// NOTE(review): the realloc result is not checked for failure.
void HTMLTokenizer::enlargeRawContentBuffer(int len)
{
    int newsize = qMax(rawContentMaxSize * 2, rawContentMaxSize + len);
rawContent = KHTML_REALLOC_QCHAR_VEC(rawContent, newsize);
    rawContentMaxSize = newsize;
}

// Runs queued external scripts whose data has finished loading.
//
// Also invoked with nullptr (from the external-scripts timer and from
// executeScriptsWaitingForStylesheets()); the argument itself is unused.
// Execution is postponed while stylesheets are still loading. Scripts are
// executed in queue order until the queue is empty, the head script is not
// loaded yet, the time slice is used up, or stylesheets become pending again.
// Requires at least one queued script (asserted).
void HTMLTokenizer::notifyFinished(CachedObject *finishedObj)
{
    Q_UNUSED(finishedObj);
    assert(!cachedScript.isEmpty());
    // Make external scripts wait for external stylesheets.
    // FIXME: This needs to be done for inline scripts too.
    m_hasScriptsWaitingForStylesheets = !parser->doc()->haveStylesheetsLoaded();
    if (m_hasScriptsWaitingForStylesheets) {
        // qCDebug(KHTML_LOG) << "Delaying script execution until stylesheets have loaded.";
        return;
    }
    // qCDebug(KHTML_LOG) << (finishedObj ? "Processing an external script" : "Continuing processing of delayed external scripts");

    bool done = false;
    m_scriptTime.start();
    while (!done && cachedScript.head()->isLoaded()) {
        if (!continueProcessingScripts()) {
            break;
        }

        CachedScript *cs = cachedScript.dequeue();
        DOMString scriptSource = cs->script();
#ifdef TOKEN_DEBUG
        qCDebug(KHTML_LOG) << "External script is:" << endl << scriptSource.string();
#endif
        setSrc(TokenizerString());

        // make sure we forget about the script before we execute the new one
        // infinite recursion might happen otherwise
        QString cachedScriptUrl(cs->url().string());
        cs->deref(this);

        scriptExecution(scriptSource.string(), cachedScriptUrl);

        done = cachedScript.isEmpty();
        if (done) {
            assert(!m_hasScriptsWaitingForStylesheets);
        } else if (m_hasScriptsWaitingForStylesheets) {
            // flag has changed during the script execution,
            // so we need to wait for stylesheets again.
            done = true;
        }
        // 'script' is true when we are called synchronously from
        // scriptHandler(). In that case scriptHandler() will take care
        // of the pending queue.
        if (!script) {
            // Collapse the pending queue into a single string, preserving order.
            while (pendingQueue.count() > 1) {
                TokenizerString t = pendingQueue.pop();
                pendingQueue.top().prepend(t);
            }
            if (done) {
                write(pendingQueue.pop(), false);
            }
            // we might be deleted at this point, do not
            // access any members.
            // (The loop condition tests the local `done` first, so when the
            // write above ran, no member is read before the loop exits.)
        }
    }
}

// Returns whether notifyFinished() may execute another script right now.
// Returns false while the external-scripts timer is already armed, or when
// the script time slice is exhausted and a zero-delay timer could be started
// to continue later. If startTimer() returns 0, execution continues.
bool HTMLTokenizer::continueProcessingScripts()
{
    if (m_externalScriptsTimerId) {
        return false;
    }
    if (m_scriptTime.elapsed() > m_tokenizerYieldDelay && m_documentTokenizer) {
        if ((m_externalScriptsTimerId = startTimer(0))) {
            return false;
        }
    }
    return true;
}

// Resumes script execution that notifyFinished() postponed because
// stylesheets were still loading. Stylesheets must be loaded by now (asserted).
void HTMLTokenizer::executeScriptsWaitingForStylesheets()
{
    assert(parser->doc()->haveStylesheetsLoaded());
    if (m_hasScriptsWaitingForStylesheets) {
        notifyFinished(nullptr);
    }
}

// True while at least one external script is still queued.
bool HTMLTokenizer::isWaitingForScripts() const
{
    return cachedScript.count();
}

// True while the script-execution nesting counter is positive.
bool HTMLTokenizer::isExecutingScript() const
{
    return (m_executingScript > 0);
}

// Replaces the current input with `source`. The line count of the old input
// is folded into `lineno` first, and the new input's line counter is reset.
void HTMLTokenizer::setSrc(const TokenizerString &source)
{
    lineno += src.lineCount();
    src = source;
    src.resetLineCount();
}

// Sets the on-hold flag. While it is set, write() only appends incoming data
// to the source and finish() does not call end().
void HTMLTokenizer::setOnHold(bool _onHold)
{
    if (onHold == _onHold) {
        return;
    }
    onHold = _onHold;
}