Warning, file /frameworks/khtml/src/html/htmltokenizer.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 /*
0002     This file is part of the KDE libraries
0003 
0004     Copyright (C) 1997 Martin Jones (mjones@kde.org)
0005               (C) 1997 Torben Weis (weis@kde.org)
0006               (C) 1998 Waldo Bastian (bastian@kde.org)
0007               (C) 2001 Dirk Mueller (mueller@kde.org)
0008 
0009     This library is free software; you can redistribute it and/or
0010     modify it under the terms of the GNU Library General Public
0011     License as published by the Free Software Foundation; either
0012     version 2 of the License, or (at your option) any later version.
0013 
0014     This library is distributed in the hope that it will be useful,
0015     but WITHOUT ANY WARRANTY; without even the implied warranty of
0016     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0017     Library General Public License for more details.
0018 
0019     You should have received a copy of the GNU Library General Public License
0020     along with this library; see the file COPYING.LIB.  If not, write to
0021     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0022     Boston, MA 02110-1301, USA.
0023 */
0024 //----------------------------------------------------------------------------
0025 //
0026 // KDE HTML Widget -- Tokenizers
0027 
0028 #ifndef HTMLTOKENIZER_H
0029 #define HTMLTOKENIZER_H
0030 
0031 #include <QString>
0032 #include <QObject>
0033 #include <QQueue>
0034 #include <QTime>
0035 
0036 #include "misc/loader_client.h"
0037 #include "misc/stringit.h"
0038 #include "xml/dom_stringimpl.h"
0039 #include "xml/xml_tokenizer.h"
0040 #include "xml/dom_elementimpl.h"
0041 #include "xml/dom_docimpl.h"
0042 
0043 class KCharsets;
0044 class KHTMLView;
0045 
0046 namespace DOM
0047 {
0048 class DocumentImpl;
0049 class DocumentFragmentImpl;
0050 }
0051 
0052 namespace khtml
0053 {
0054 class CachedScript;
0055 class KHTMLParser;
0056 class ProspectiveTokenizer;
0057 
0058 /**
0059  * @internal
0060  * represents one HTML tag. Consists of a numerical id, and the list
0061  * of attributes. Can also represent text. In this case the id = 0 and
0062  * text contains the text.
0063  */
0064 class Token
0065 {
0066 public:
0067     Token()
0068     {
0069         tid = 0;
0070         attrs = nullptr;
0071         text = nullptr;
0072         flat = false;
0073         //qDebug("new token, creating %08lx", attrs);
0074     }
0075     ~Token()
0076     {
0077         if (attrs) {
0078             attrs->deref();
0079         }
0080         if (text) {
0081             text->deref();
0082         }
0083     }
0084     void addAttribute(DocumentImpl * /*doc*/, QChar *buffer, const DOMString &_attrName, const DOMString &v)
0085     {
0086         DOMStringImpl *value = v.implementation();
0087         LocalName localname = LocalName::fromId(0);
0088         PrefixName prefixname = PrefixName::fromId(emptyPrefix);
0089         if (buffer->unicode()) {
0090             localname = LocalName::fromId(buffer->unicode());
0091         } else if (!_attrName.isEmpty() && _attrName != "/") {
0092             splitPrefixLocalName(_attrName, prefixname, localname, true /* htmlCompat*/);
0093         }
0094 
0095         if (value && localname.id()) {
0096             if (!attrs) {
0097                 attrs = new DOM::NamedAttrMapImpl(nullptr);
0098                 attrs->ref();
0099             }
0100             if (!attrs->getValue(makeId(emptyNamespace, localname.id()), prefixname))
0101                 // place attributes in the empty namespace
0102             {
0103                 attrs->setValue(makeId(emptyNamespace, localname.id()), value, prefixname);
0104             }
0105         }
0106     }
0107     void reset()
0108     {
0109         if (attrs) {
0110             attrs->deref();
0111             attrs = nullptr;
0112         }
0113         tid = 0;
0114         if (text) {
0115             text->deref();
0116             text = nullptr;
0117         }
0118         flat = false;
0119     }
0120     DOM::NamedAttrMapImpl *attrs;
0121     DOMStringImpl *text;
0122     ushort tid;
0123     bool flat;
0124 };
0125 
/**
 * Parsing state stored in DoctypeToken::state.
 *
 * The enumerators are numbered consecutively from 0 in the order listed;
 * DoctypeBegin is the state a DoctypeToken is put into by reset().
 */
enum DoctypeState {
    DoctypeBegin = 0,
    DoctypeBeforeName,
    DoctypeName,
    DoctypeAfterName,
    DoctypeBeforePublicID,
    DoctypePublicID,
    DoctypeAfterPublicID,
    DoctypeBeforeSystemID,
    DoctypeSystemID,
    DoctypeAfterSystemID,
    DoctypeInternalSubset,
    DoctypeAfterInternalSubset,
    DoctypeBogus
};
0141 
0142 class DoctypeToken
0143 {
0144 public:
0145     DoctypeToken() {}
0146 
0147     void reset()
0148     {
0149         name.clear();
0150         publicID.clear();
0151         systemID.clear();
0152         internalSubset.clear();
0153         state = DoctypeBegin;
0154     }
0155 
0156     DoctypeState state;
0157     QString name;
0158     QString publicID;
0159     QString systemID;
0160     QString internalSubset;
0161 };
0162 
// Number of spaces each tab character counts for.
#define TAB_SIZE 8
0165 
0166 //-----------------------------------------------------------------------------
0167 
0168 class HTMLTokenizer : public Tokenizer, public CachedObjectClient
0169 {
0170     friend class KHTMLParser;
0171 public:
0172     HTMLTokenizer(DOM::DocumentImpl *, KHTMLView * = nullptr);
0173     HTMLTokenizer(DOM::DocumentImpl *, DOM::DocumentFragmentImpl *frag);
0174     virtual ~HTMLTokenizer();
0175 
0176     void begin() override;
0177     void write(const khtml::TokenizerString &str, bool appendData) override;
0178     void end() override;
0179     void finish() override;
0180     void timerEvent(QTimerEvent *e) override;
0181     bool continueProcessing(int &);
0182     void setNormalYieldDelay() override;
0183     void setOnHold(bool _onHold) override;
0184     void abort() override
0185     {
0186         m_abort = true;
0187     }
0188     bool isWaitingForScripts() const override;
0189     bool isExecutingScript() const override;
0190 
0191     void executeScriptsWaitingForStylesheets() override;
0192 
0193 protected:
0194     void reset();
0195     void addPending();
0196     void processToken();
0197     void processDoctypeToken();
0198     void processListing(khtml::TokenizerString list);
0199 
0200     void parseComment(khtml::TokenizerString &str);
0201     void parseDoctype(khtml::TokenizerString &str);
0202     void parseDoctypeComment(khtml::TokenizerString &str);
0203     void parseServer(khtml::TokenizerString &str);
0204     void parseText(khtml::TokenizerString &str);
0205     void parseListing(khtml::TokenizerString &str);
0206     void parseRawContent(khtml::TokenizerString &str);
0207     void parseTag(khtml::TokenizerString &str);
0208     void parseEntity(khtml::TokenizerString &str, QChar *&dest, bool start = false);
0209     void parseProcessingInstruction(khtml::TokenizerString &str);
0210     void scriptHandler();
0211     void scriptExecution(const QString &script, const QString &scriptURL = QString(), int baseLine = 0);
0212     void setSrc(const TokenizerString &source);
0213 
0214     // check if we have enough space in the buffer.
0215     // if not enlarge it
0216     inline void checkBuffer(int len = 10)
0217     {
0218         if ((dest - buffer) > size - len) {
0219             enlargeBuffer(len);
0220         }
0221     }
0222     inline void checkRawContentBuffer(int len = 10)
0223     {
0224         if (rawContentSize + len >= rawContentMaxSize) {
0225             enlargeRawContentBuffer(len);
0226         }
0227     }
0228 
0229     void enlargeBuffer(int len);
0230     void enlargeRawContentBuffer(int len);
0231 
0232     // from CachedObjectClient
0233     void notifyFinished(khtml::CachedObject *finishedObj) override;
0234 
0235     bool continueProcessingScripts();
0236 protected:
0237     // Internal buffers
0238     ///////////////////
0239     QChar *buffer;
0240     QChar *dest;
0241 
0242     khtml::Token currToken;
0243     LocalName safeLocalName;
0244 
0245     // the size of buffer
0246     int size;
0247 
0248     // Tokenizer flags
0249     //////////////////
0250     // are we in quotes within a html tag
0251     enum {
0252         NoQuote = 0,
0253         SingleQuote,
0254         DoubleQuote
0255     } tquote;
0256 
0257     enum {
0258         NonePending = 0,
0259         SpacePending,
0260         LFPending,
0261         TabPending
0262     } pending;
0263 
0264     enum {
0265         NoneDiscard = 0,
0266         SpaceDiscard,   // Discard spaces after '=' within tags
0267         LFDiscard,  // Discard line breaks immediately after start-tags
0268         AllDiscard  // discard all spaces, LF's etc until next non white char
0269     } discard;
0270 
0271     // Discard the LF part of CRLF sequence
0272     bool skipLF;
0273 
0274     // Flag to say that we have the '<' but not the character following it.
0275     bool startTag;
0276 
0277     // Flag to say, we are just parsing a tag, meaning, we are in the middle
0278     // of <tag...
0279     enum {
0280         NoTag = 0,
0281         TagName,
0282         SearchAttribute,
0283         AttributeName,
0284         SearchEqual,
0285         SearchValue,
0286         QuotedValue,
0287         Value,
0288         SearchEnd
0289     } tag;
0290 
0291     // Are we in a &... character entity description?
0292     enum {
0293         NoEntity = 0,
0294         SearchEntity,
0295         NumericSearch,
0296         Hexadecimal,
0297         Decimal,
0298         EntityName,
0299         SearchSemicolon
0300     } Entity;
0301 
0302     // are we in a <script> ... </script> block
0303     bool script;
0304 
0305     QChar EntityChar;
0306 
0307     // Are we in a <pre> ... </pre> block
0308     bool pre;
0309 
0310     // if 'pre == true' we track in which column we are
0311     int prePos;
0312 
0313     // Are we in a <style> ... </style> block
0314     bool style;
0315 
0316     // Are we in a <select> ... </select> block
0317     bool select;
0318 
0319     // Are we in a <xmp> ... </xmp> block
0320     bool xmp;
0321 
0322     // Are we in a <title> ... </title> block
0323     bool title;
0324 
0325     // Are we in plain textmode ?
0326     bool plaintext;
0327 
0328     // XML processing instructions. Ignored at the moment
0329     bool processingInstruction;
0330 
0331     // Area we in a <!-- comment --> block
0332     bool comment;
0333 
0334     // Are we in a <textarea> ... </textarea> block
0335     bool textarea;
0336 
0337     // was the previous character escaped ?
0338     bool escaped;
0339 
0340     // are we in a server includes statement?
0341     bool server;
0342 
0343     bool brokenServer;
0344 
0345     // doctype parsing from WebCore + internal subset checker and comments in doctype
0346     // are we in <!DOCTYPE ...> block?
0347     bool doctype;
0348     DoctypeToken doctypeToken;
0349     int doctypeSearchCount;
0350     int doctypeSecondarySearchCount;
0351     bool doctypeAllowComment; // is comment allowed in current doctype state?
0352 
0353     // are we in <!DOCTYPE -- ... --> block?
0354     enum {
0355         NoDoctypeComment = 0,
0356         DoctypeCommentHalfBegin,
0357         DoctypeComment,
0358         DoctypeCommentHalfEnd,
0359         DoctypeCommentEnd,
0360         DoctypeCommentBogus
0361     } doctypeComment;
0362 
0363     // name of an unknown attribute
0364     DOMString attrName;
0365 
0366     // Used to store the content of
0367     QChar *rawContent;
0368     // Size of the script sequenze stored in rawContent
0369     int rawContentSize;
0370     // Maximal size that can be stored in rawContent
0371     int rawContentMaxSize;
0372     // resync point of script code size
0373     int rawContentResync;
0374     // this tracks the number of advances done in 'raw' tokenizing
0375     // mode since we last decoded an entity.
0376     int rawContentSinceLastEntity;
0377     // Stores characters if we are scanning for a string like "</script>"
0378     QChar searchBuffer[ 10 ];
0379     // Counts where we are in the string we are scanning for
0380     int searchCount;
0381     // The string we are searching for
0382     const QChar *searchFor;
0383     // the stopper string
0384     const char *searchStopper;
0385     // the stopper len
0386     int searchStopperLen;
0387     // if no more data is coming, just parse what we have (including ext scripts that
0388     // may be still downloading) and finish
0389     bool noMoreData;
0390     // URL to get source code of script from
0391     QString scriptSrc;
0392     QString scriptSrcCharset;
0393     bool javascript;
0394     // the HTML code we will parse after the external script we are waiting for has loaded
0395     TokenizerQueue pendingQueue;
0396     // true if we are executing a script while parsing a document. This causes the parsing of
0397     // the output of the script to be postponed until after the script has finished executing
0398     int m_executingScript;
0399 
0400     int m_externalScriptsTimerId;
0401     bool m_hasScriptsWaitingForStylesheets;
0402 
0403     QQueue<khtml::CachedScript *> cachedScript;
0404     // you can pause the tokenizer if you need to display a dialog or something
0405     bool onHold;
0406     // you can ask the tokenizer to abort the current write() call, e.g. to redirect somewhere else
0407     bool m_abort;
0408     // if we found one broken comment, there are most likely others as well
0409     // store a flag to get rid of the O(n^2) behavior in such a case.
0410     bool brokenComments;
0411     // current line number
0412     int lineno;
0413     // line number at which the current <script> started
0414     int scriptStartLineno;
0415     int tagStartLineno;
0416     int m_tokenizerYieldDelay;
0417     int m_yieldTimer;
0418     QTime m_time;
0419     QTime m_scriptTime;
0420 
0421     // Set true if this tokenizer is used for documents and not fragments
0422     bool m_documentTokenizer;
0423 
0424 #define CBUFLEN 1024
0425     char cBuffer[CBUFLEN + 2];
0426     unsigned int cBufferPos;
0427     unsigned int entityLen;
0428 
0429     khtml::TokenizerString src;
0430 
0431     KCharsets *charsets;
0432     KHTMLParser *parser;
0433 
0434     KHTMLView *view;
0435 
0436     khtml::ProspectiveTokenizer *m_prospectiveTokenizer;
0437 };
0438 
0439 } // namespace
0440 
0441 #endif // HTMLTOKENIZER_H
0442