Warning, file /frameworks/khtml/src/html/htmltokenizer.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 /* 0002 This file is part of the KDE libraries 0003 0004 Copyright (C) 1997 Martin Jones (mjones@kde.org) 0005 (C) 1997 Torben Weis (weis@kde.org) 0006 (C) 1998 Waldo Bastian (bastian@kde.org) 0007 (C) 2001 Dirk Mueller (mueller@kde.org) 0008 0009 This library is free software; you can redistribute it and/or 0010 modify it under the terms of the GNU Library General Public 0011 License as published by the Free Software Foundation; either 0012 version 2 of the License, or (at your option) any later version. 0013 0014 This library is distributed in the hope that it will be useful, 0015 but WITHOUT ANY WARRANTY; without even the implied warranty of 0016 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0017 Library General Public License for more details. 0018 0019 You should have received a copy of the GNU Library General Public License 0020 along with this library; see the file COPYING.LIB. If not, write to 0021 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0022 Boston, MA 02110-1301, USA. 0023 */ 0024 //---------------------------------------------------------------------------- 0025 // 0026 // KDE HTML Widget -- Tokenizers 0027 0028 #ifndef HTMLTOKENIZER_H 0029 #define HTMLTOKENIZER_H 0030 0031 #include <QString> 0032 #include <QObject> 0033 #include <QQueue> 0034 #include <QTime> 0035 0036 #include "misc/loader_client.h" 0037 #include "misc/stringit.h" 0038 #include "xml/dom_stringimpl.h" 0039 #include "xml/xml_tokenizer.h" 0040 #include "xml/dom_elementimpl.h" 0041 #include "xml/dom_docimpl.h" 0042 0043 class KCharsets; 0044 class KHTMLView; 0045 0046 namespace DOM 0047 { 0048 class DocumentImpl; 0049 class DocumentFragmentImpl; 0050 } 0051 0052 namespace khtml 0053 { 0054 class CachedScript; 0055 class KHTMLParser; 0056 class ProspectiveTokenizer; 0057 0058 /** 0059 * @internal 0060 * represents one HTML tag. Consists of a numerical id, and the list 0061 * of attributes. Can also represent text. In this case the id = 0 and 0062 * text contains the text. 0063 */ 0064 class Token 0065 { 0066 public: 0067 Token() 0068 { 0069 tid = 0; 0070 attrs = nullptr; 0071 text = nullptr; 0072 flat = false; 0073 //qDebug("new token, creating %08lx", attrs); 0074 } 0075 ~Token() 0076 { 0077 if (attrs) { 0078 attrs->deref(); 0079 } 0080 if (text) { 0081 text->deref(); 0082 } 0083 } 0084 void addAttribute(DocumentImpl * /*doc*/, QChar *buffer, const DOMString &_attrName, const DOMString &v) 0085 { 0086 DOMStringImpl *value = v.implementation(); 0087 LocalName localname = LocalName::fromId(0); 0088 PrefixName prefixname = PrefixName::fromId(emptyPrefix); 0089 if (buffer->unicode()) { 0090 localname = LocalName::fromId(buffer->unicode()); 0091 } else if (!_attrName.isEmpty() && _attrName != "/") { 0092 splitPrefixLocalName(_attrName, prefixname, localname, true /* htmlCompat*/); 0093 } 0094 0095 if (value && localname.id()) { 0096 if (!attrs) { 0097 attrs = new DOM::NamedAttrMapImpl(nullptr); 0098 attrs->ref(); 0099 } 0100 if (!attrs->getValue(makeId(emptyNamespace, localname.id()), prefixname)) 0101 // place attributes in the empty namespace 0102 { 0103 attrs->setValue(makeId(emptyNamespace, localname.id()), value, prefixname); 0104 } 0105 } 0106 } 0107 void reset() 0108 { 0109 if (attrs) { 0110 attrs->deref(); 0111 attrs = nullptr; 0112 } 0113 tid = 0; 0114 if (text) { 0115 text->deref(); 0116 text = nullptr; 0117 } 0118 flat = false; 0119 } 0120 DOM::NamedAttrMapImpl *attrs; 0121 DOMStringImpl *text; 0122 ushort tid; 0123 bool flat; 0124 }; 0125 0126 enum DoctypeState { 0127 DoctypeBegin, 0128 DoctypeBeforeName, 0129 DoctypeName, 0130 DoctypeAfterName, 0131 DoctypeBeforePublicID, 0132 DoctypePublicID, 0133 DoctypeAfterPublicID, 0134 DoctypeBeforeSystemID, 0135 DoctypeSystemID, 0136 DoctypeAfterSystemID, 0137 DoctypeInternalSubset, 0138 DoctypeAfterInternalSubset, 0139 DoctypeBogus 0140 }; 0141 0142 class DoctypeToken 0143 { 0144 public: 0145 DoctypeToken() {} 0146 0147 void reset() 0148 { 0149 name.clear(); 0150 publicID.clear(); 0151 systemID.clear(); 0152 internalSubset.clear(); 0153 state = DoctypeBegin; 0154 } 0155 0156 DoctypeState state; 0157 QString name; 0158 QString publicID; 0159 QString systemID; 0160 QString internalSubset; 0161 }; 0162 0163 // The count of spaces used for each tab. 0164 #define TAB_SIZE 8 0165 0166 //----------------------------------------------------------------------------- 0167 0168 class HTMLTokenizer : public Tokenizer, public CachedObjectClient 0169 { 0170 friend class KHTMLParser; 0171 public: 0172 HTMLTokenizer(DOM::DocumentImpl *, KHTMLView * = nullptr); 0173 HTMLTokenizer(DOM::DocumentImpl *, DOM::DocumentFragmentImpl *frag); 0174 virtual ~HTMLTokenizer(); 0175 0176 void begin() override; 0177 void write(const khtml::TokenizerString &str, bool appendData) override; 0178 void end() override; 0179 void finish() override; 0180 void timerEvent(QTimerEvent *e) override; 0181 bool continueProcessing(int &); 0182 void setNormalYieldDelay() override; 0183 void setOnHold(bool _onHold) override; 0184 void abort() override 0185 { 0186 m_abort = true; 0187 } 0188 bool isWaitingForScripts() const override; 0189 bool isExecutingScript() const override; 0190 0191 void executeScriptsWaitingForStylesheets() override; 0192 0193 protected: 0194 void reset(); 0195 void addPending(); 0196 void processToken(); 0197 void processDoctypeToken(); 0198 void processListing(khtml::TokenizerString list); 0199 0200 void parseComment(khtml::TokenizerString &str); 0201 void parseDoctype(khtml::TokenizerString &str); 0202 void parseDoctypeComment(khtml::TokenizerString &str); 0203 void parseServer(khtml::TokenizerString &str); 0204 void parseText(khtml::TokenizerString &str); 0205 void parseListing(khtml::TokenizerString &str); 0206 void parseRawContent(khtml::TokenizerString &str); 0207 void parseTag(khtml::TokenizerString &str); 0208 void parseEntity(khtml::TokenizerString &str, QChar *&dest, bool start = false); 0209 void parseProcessingInstruction(khtml::TokenizerString &str); 0210 void scriptHandler(); 0211 void scriptExecution(const QString &script, const QString &scriptURL = QString(), int baseLine = 0); 0212 void setSrc(const TokenizerString &source); 0213 0214 // check if we have enough space in the buffer. 0215 // if not enlarge it 0216 inline void checkBuffer(int len = 10) 0217 { 0218 if ((dest - buffer) > size - len) { 0219 enlargeBuffer(len); 0220 } 0221 } 0222 inline void checkRawContentBuffer(int len = 10) 0223 { 0224 if (rawContentSize + len >= rawContentMaxSize) { 0225 enlargeRawContentBuffer(len); 0226 } 0227 } 0228 0229 void enlargeBuffer(int len); 0230 void enlargeRawContentBuffer(int len); 0231 0232 // from CachedObjectClient 0233 void notifyFinished(khtml::CachedObject *finishedObj) override; 0234 0235 bool continueProcessingScripts(); 0236 protected: 0237 // Internal buffers 0238 /////////////////// 0239 QChar *buffer; 0240 QChar *dest; 0241 0242 khtml::Token currToken; 0243 LocalName safeLocalName; 0244 0245 // the size of buffer 0246 int size; 0247 0248 // Tokenizer flags 0249 ////////////////// 0250 // are we in quotes within a html tag 0251 enum { 0252 NoQuote = 0, 0253 SingleQuote, 0254 DoubleQuote 0255 } tquote; 0256 0257 enum { 0258 NonePending = 0, 0259 SpacePending, 0260 LFPending, 0261 TabPending 0262 } pending; 0263 0264 enum { 0265 NoneDiscard = 0, 0266 SpaceDiscard, // Discard spaces after '=' within tags 0267 LFDiscard, // Discard line breaks immediately after start-tags 0268 AllDiscard // discard all spaces, LF's etc until next non white char 0269 } discard; 0270 0271 // Discard the LF part of CRLF sequence 0272 bool skipLF; 0273 0274 // Flag to say that we have the '<' but not the character following it. 0275 bool startTag; 0276 0277 // Flag to say, we are just parsing a tag, meaning, we are in the middle 0278 // of <tag... 0279 enum { 0280 NoTag = 0, 0281 TagName, 0282 SearchAttribute, 0283 AttributeName, 0284 SearchEqual, 0285 SearchValue, 0286 QuotedValue, 0287 Value, 0288 SearchEnd 0289 } tag; 0290 0291 // Are we in a &... character entity description? 0292 enum { 0293 NoEntity = 0, 0294 SearchEntity, 0295 NumericSearch, 0296 Hexadecimal, 0297 Decimal, 0298 EntityName, 0299 SearchSemicolon 0300 } Entity; 0301 0302 // are we in a <script> ... </script> block 0303 bool script; 0304 0305 QChar EntityChar; 0306 0307 // Are we in a <pre> ... </pre> block 0308 bool pre; 0309 0310 // if 'pre == true' we track in which column we are 0311 int prePos; 0312 0313 // Are we in a <style> ... </style> block 0314 bool style; 0315 0316 // Are we in a <select> ... </select> block 0317 bool select; 0318 0319 // Are we in a <xmp> ... </xmp> block 0320 bool xmp; 0321 0322 // Are we in a <title> ... </title> block 0323 bool title; 0324 0325 // Are we in plain textmode ? 0326 bool plaintext; 0327 0328 // XML processing instructions. Ignored at the moment 0329 bool processingInstruction; 0330 0331 // Area we in a <!-- comment --> block 0332 bool comment; 0333 0334 // Are we in a <textarea> ... </textarea> block 0335 bool textarea; 0336 0337 // was the previous character escaped ? 0338 bool escaped; 0339 0340 // are we in a server includes statement? 0341 bool server; 0342 0343 bool brokenServer; 0344 0345 // doctype parsing from WebCore + internal subset checker and comments in doctype 0346 // are we in <!DOCTYPE ...> block? 0347 bool doctype; 0348 DoctypeToken doctypeToken; 0349 int doctypeSearchCount; 0350 int doctypeSecondarySearchCount; 0351 bool doctypeAllowComment; // is comment allowed in current doctype state? 0352 0353 // are we in <!DOCTYPE -- ... --> block? 0354 enum { 0355 NoDoctypeComment = 0, 0356 DoctypeCommentHalfBegin, 0357 DoctypeComment, 0358 DoctypeCommentHalfEnd, 0359 DoctypeCommentEnd, 0360 DoctypeCommentBogus 0361 } doctypeComment; 0362 0363 // name of an unknown attribute 0364 DOMString attrName; 0365 0366 // Used to store the content of 0367 QChar *rawContent; 0368 // Size of the script sequenze stored in rawContent 0369 int rawContentSize; 0370 // Maximal size that can be stored in rawContent 0371 int rawContentMaxSize; 0372 // resync point of script code size 0373 int rawContentResync; 0374 // this tracks the number of advances done in 'raw' tokenizing 0375 // mode since we last decoded an entity. 0376 int rawContentSinceLastEntity; 0377 // Stores characters if we are scanning for a string like "</script>" 0378 QChar searchBuffer[ 10 ]; 0379 // Counts where we are in the string we are scanning for 0380 int searchCount; 0381 // The string we are searching for 0382 const QChar *searchFor; 0383 // the stopper string 0384 const char *searchStopper; 0385 // the stopper len 0386 int searchStopperLen; 0387 // if no more data is coming, just parse what we have (including ext scripts that 0388 // may be still downloading) and finish 0389 bool noMoreData; 0390 // URL to get source code of script from 0391 QString scriptSrc; 0392 QString scriptSrcCharset; 0393 bool javascript; 0394 // the HTML code we will parse after the external script we are waiting for has loaded 0395 TokenizerQueue pendingQueue; 0396 // true if we are executing a script while parsing a document. This causes the parsing of 0397 // the output of the script to be postponed until after the script has finished executing 0398 int m_executingScript; 0399 0400 int m_externalScriptsTimerId; 0401 bool m_hasScriptsWaitingForStylesheets; 0402 0403 QQueue<khtml::CachedScript *> cachedScript; 0404 // you can pause the tokenizer if you need to display a dialog or something 0405 bool onHold; 0406 // you can ask the tokenizer to abort the current write() call, e.g. to redirect somewhere else 0407 bool m_abort; 0408 // if we found one broken comment, there are most likely others as well 0409 // store a flag to get rid of the O(n^2) behavior in such a case. 0410 bool brokenComments; 0411 // current line number 0412 int lineno; 0413 // line number at which the current <script> started 0414 int scriptStartLineno; 0415 int tagStartLineno; 0416 int m_tokenizerYieldDelay; 0417 int m_yieldTimer; 0418 QTime m_time; 0419 QTime m_scriptTime; 0420 0421 // Set true if this tokenizer is used for documents and not fragments 0422 bool m_documentTokenizer; 0423 0424 #define CBUFLEN 1024 0425 char cBuffer[CBUFLEN + 2]; 0426 unsigned int cBufferPos; 0427 unsigned int entityLen; 0428 0429 khtml::TokenizerString src; 0430 0431 KCharsets *charsets; 0432 KHTMLParser *parser; 0433 0434 KHTMLView *view; 0435 0436 khtml::ProspectiveTokenizer *m_prospectiveTokenizer; 0437 }; 0438 0439 } // namespace 0440 0441 #endif // HTMLTOKENIZER 0442