src/html/htmlprospectivetokenizer.cpp

0001 /*
0002  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
0003  *           (C) 2008 Germain Garand <germain@ebooksfrance.org>
0004  *
0005  * Redistribution and use in source and binary forms, with or without
0006  * modification, are permitted provided that the following conditions
0007  * are met:
0008  * 1. Redistributions of source code must retain the above copyright
0009  *    notice, this list of conditions and the following disclaimer.
0010  * 2. Redistributions in binary form must reproduce the above copyright
0011  *    notice, this list of conditions and the following disclaimer in the
0012  *    documentation and/or other materials provided with the distribution.
0013  *
0014  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
0015  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0016  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
0017  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
0018  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
0019  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
0020  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
0021  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
0022  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0023  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
0024  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0025  */
0026
0027 #include "htmlprospectivetokenizer.h"
0028
0029 #include <QTime>
0030 #include <QVarLengthArray>
0031
0032 #include "html_headimpl.h"
0033 #include "html_documentimpl.h"
0034 #include "htmlparser.h"
0035 #include "dtd.h"
0036
0037 #include <misc/loader.h>
0038 #include <khtmlview.h>
0039 #include <khtml_part.h>
0040 #include <xml/dom_docimpl.h>
0041 #include <css/csshelper.h>
0042 #include <ecma/kjs_proxy.h>
0043 #include <ctype.h>
0044 #include <assert.h>
0045 #include <QVariant>
0046 #include <stdlib.h>
0047
0048 #include "kentities_p.h"
0049
// Set to 1 to compile in the debug output and the timing code guarded by
// "#if PRELOAD_DEBUG" in this file.
#define PRELOAD_DEBUG 0

// UTF-16 surrogate halves for a supplementary code point (one above 0xFFFF).
// U16_TRAIL keeps the low 10 bits and ORs in 0xdc00. U16_LEAD shifts the code
// point right by 10 and adds 0xd7c0, which equals 0xd800 - (0x10000 >> 10).
#define U16_TRAIL(sup) (ushort)(((sup)&0x3ff)|0xdc00)
#define U16_LEAD(sup) (ushort)(((sup)>>10)+0xd7c0)
0054
0055 using namespace khtml;
0056
0057 ProspectiveTokenizer::ProspectiveTokenizer(DOM::DocumentImpl *doc)
0058     : m_inProgress(false)
0059     , m_tagName(32)
0060     , m_attributeName(32)
0061     , m_attributeValue(255)
0062     , m_cssRule(16)
0063     , m_cssRuleValue(255)
0064     , m_timeUsed(0)
0065     , m_document(doc)
0066 {
0067 #if PRELOAD_DEBUG
0068     qCDebug(KHTML_LOG) << "CREATING PRELOAD SCANNER FOR" << m_document << m_document->URL().toDisplayString();
0069 #endif
0070 }
0071
0072 ProspectiveTokenizer::~ProspectiveTokenizer()
0073 {
0074 #if PRELOAD_DEBUG
0075     fprintf(stderr, "DELETING PRELOAD SCANNER FOR %p\n", m_document);
0076     fprintf(stderr, "TOTAL TIME USED %dms\n", m_timeUsed);
0077 #endif
0078 }
0079
0080 void ProspectiveTokenizer::begin()
0081 {
0082     assert(!m_inProgress);
0083     reset();
0084     m_inProgress = true;
0085 }
0086
0087 void ProspectiveTokenizer::end()
0088 {
0089     assert(m_inProgress);
0090     m_inProgress = false;
0091 }
0092
0093 void ProspectiveTokenizer::reset()
0094 {
0095     m_source.clear();
0096
0097     m_state = Data;
0098     m_escape = false;
0099     m_contentModel = PCDATA;
0100     m_commentPos = 0;
0101
0102     m_closeTag = false;
0103     m_tagName.clear();
0104     m_attributeName.clear();
0105     m_attributeValue.clear();
0106     m_lastStartTag.clear();
0107     m_lastStartTagId = 0;
0108
0109     m_urlToLoad = DOMString();
0110     m_linkIsStyleSheet = false;
0111     m_lastCharacterIndex = 0;
0112     clearLastCharacters();
0113
0114     m_cssState = CSSInitial;
0115     m_cssRule.clear();
0116     m_cssRuleValue.clear();
0117 }
0118
0119 void ProspectiveTokenizer::write(const TokenizerString &source)
0120 {
0121 #if PRELOAD_DEBUG
0122     QTime t;
0123     t.start();
0124 #endif
0125
0126     tokenize(source);
0127
0128 #if PRELOAD_DEBUG
0129     m_timeUsed += t.elapsed();
0130 #endif
0131 }
0132
0133 static inline bool isWhitespace(const QChar &c)
0134 {
0135     unsigned short u = c.unicode();
0136     if (u > 0x20) {
0137         return false;
0138     }
0139     return u == ' ' || u == '\n' || u == '\r' || u == '\t';
0140 }
0141
0142 inline void ProspectiveTokenizer::clearLastCharacters()
0143 {
0144     memset(m_lastCharacters, 0, lastCharactersBufferSize * sizeof(QChar));
0145 }
0146
0147 inline void ProspectiveTokenizer::rememberCharacter(QChar c)
0148 {
0149     m_lastCharacterIndex = (m_lastCharacterIndex + 1) % lastCharactersBufferSize;
0150     m_lastCharacters[m_lastCharacterIndex] = c;
0151 }
0152
0153 inline bool ProspectiveTokenizer::lastCharactersMatch(const char *chars, unsigned count) const
0154 {
0155     unsigned pos = m_lastCharacterIndex;
0156     while (count) {
0157         if (chars[count - 1] != m_lastCharacters[pos]) {
0158             return false;
0159         }
0160         --count;
0161         if (!pos) {
0162             pos = lastCharactersBufferSize;
0163         }
0164         --pos;
0165     }
0166     return true;
0167 }
0168
/**
 * Maps the numeric value of a character reference to the code point that
 * should be produced for it.
 *
 * - 0, values above 0x10FFFF and surrogates (0xD800-0xDFFF) become U+FFFD.
 * - Values 0x80-0x9F are remapped through the replacement table the HTML
 *   specification defines for numeric character references (the Windows-1252
 *   interpretation of those bytes). Entries the table leaves undefined
 *   (0x81, 0x8D, 0x8F, 0x90, 0x9D) pass through unchanged.
 * - Every other value is returned as is.
 */
static inline unsigned legalEntityFor(unsigned value)
{
    if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
        return 0xFFFD;
    }
    if (value >= 0x80 && value <= 0x9F) {
        static const unsigned short c1Replacements[32] = {
            0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 0x80-0x87
            0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 0x88-0x8F
            0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 0x90-0x97
            0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 0x98-0x9F
        };
        return c1Replacements[value - 0x80];
    }
    return value;
}
0177
0178 unsigned ProspectiveTokenizer::consumeEntity(TokenizerString &source, bool &notEnoughCharacters)
0179 {
0180     enum EntityState {
0181         Initial,
0182         NumberType,
0183         MaybeHex,
0184         Hex,
0185         Decimal,
0186         Named
0187     };
0188     EntityState entityState = Initial;
0189     unsigned result = 0;
0190     QVarLengthArray<QChar> seenChars;
0191     QVarLengthArray<char>  entityName;
0192
0193     while (!source.isEmpty()) {
0194         seenChars.append(*source);
0195         ushort cc = source->unicode();
0196         switch (entityState) {
0197         case Initial:
0198             if (isWhitespace(cc) || cc == '<' || cc == '&') {
0199                 return 0;
0200             } else if (cc == '#') {
0201                 entityState = NumberType;
0202             } else if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
0203                 entityName.append(cc);
0204                 entityState = Named;
0205             } else {
0206                 return 0;
0207             }
0208             break;
0209         case NumberType:
0210             if (cc == 'x' || cc == 'X') {
0211                 entityState = MaybeHex;
0212             } else if (cc >= '0' && cc <= '9') {
0213                 entityState = Decimal;
0214                 result = cc - '0';
0215             } else {
0216                 source.push('#');
0217                 return 0;
0218             }
0219             break;
0220         case MaybeHex:
0221             if (cc >= '0' && cc <= '9') {
0222                 result = cc - '0';
0223             } else if (cc >= 'a' && cc <= 'f') {
0224                 result = 10 + cc - 'a';
0225             } else if (cc >= 'A' && cc <= 'F') {
0226                 result = 10 + cc - 'A';
0227             } else {
0228                 source.push(seenChars[1]);
0229                 source.push('#');
0230                 return 0;
0231             }
0232             entityState = Hex;
0233             break;
0234         case Hex:
0235             if (cc >= '0' && cc <= '9') {
0236                 result = result * 16 + cc - '0';
0237             } else if (cc >= 'a' && cc <= 'f') {
0238                 result = result * 16 + 10 + cc - 'a';
0239             } else if (cc >= 'A' && cc <= 'F') {
0240                 result = result * 16 + 10 + cc - 'A';
0241             } else if (cc == ';') {
0242                 source.advance();
0243                 return legalEntityFor(result);
0244             } else {
0245                 return legalEntityFor(result);
0246             }
0247             break;
0248         case Decimal:
0249             if (cc >= '0' && cc <= '9') {
0250                 result = result * 10 + cc - '0';
0251             } else if (cc == ';') {
0252                 source.advance();
0253                 return legalEntityFor(result);
0254             } else {
0255                 return legalEntityFor(result);
0256             }
0257             break;
0258         case Named:
0259             // This is the attribute only version, generic version matches somewhat differently
0260             while (entityName.size() <= 8) {
0261                 if (cc == ';') {
0262                     int code;
0263                     const bool found = kde_findEntity(entityName.data(), entityName.size(), &code);
0264                     if (found) {
0265                         source.advance();
0266                         return code;
0267                     }
0268                     break;
0269                 }
0270                 if (!(cc >= 'a' && cc <= 'z') && !(cc >= 'A' && cc <= 'Z') && !(cc >= '0' && cc <= '9')) {
0271                     int code;
0272                     const bool found = kde_findEntity(entityName.data(), entityName.size(), &code);
0273                     if (found) {
0274                         return code;
0275                     }
0276                     break;
0277                 }
0278                 entityName.append(cc);
0279                 source.advance();
0280                 if (source.isEmpty()) {
0281                     goto outOfCharacters;
0282                 }
0283                 cc = source->unicode();
0284                 seenChars.append(cc);
0285             }
0286             if (seenChars.size() == 2) {
0287                 source.push(seenChars[0]);
0288             } else if (seenChars.size() == 3) {
0289                 source.push(seenChars[1]);
0290                 source.push(seenChars[0]);
0291             } else {
0292                 source.prepend(TokenizerString(QString(seenChars.data(), seenChars.size() - 1)));
0293             }
0294             return 0;
0295         }
0296         source.advance();
0297     }
0298 outOfCharacters:
0299     notEnoughCharacters = true;
0300     source.prepend(TokenizerString(QString(seenChars.data(), seenChars.size())));
0301     return 0;
0302 }
0303
/**
 * Appends @p source to the pending input (m_source) and runs the scanner's
 * state machine over it until the input is used up.
 *
 * The function returns as soon as m_source is empty, or when a state needs
 * more characters than are currently available. m_state and the buffers are
 * members, so a later call carries on from where this one stopped.
 *
 * Within the switch, `break` falls through to the m_source.advance() at the
 * bottom of the loop, i.e. the current character is consumed. `continue`
 * skips that advance so the same character is looked at again in the new
 * state.
 */
void ProspectiveTokenizer::tokenize(const TokenizerString &source)
{
    assert(m_inProgress);

    m_source.append(source);

    // This is a simplified HTML5 Tokenizer
    // https://html.spec.whatwg.org/#tokenization
    while (!m_source.isEmpty()) {
        ushort cc = m_source->unicode();
        switch (m_state) {
        case Data:
            while (1) {
                rememberCharacter(cc);
                if (cc == '&') {
                    if (m_contentModel == PCDATA || m_contentModel == RCDATA) {
                        m_state = EntityData;
                        break;
                    }
                } else if (cc == '-') {
                    // In RCDATA/CDATA content, "<!--" switches on the escape
                    // flag, which suppresses tag detection below.
                    if ((m_contentModel == RCDATA || m_contentModel == CDATA) && !m_escape) {
                        if (lastCharactersMatch("<!--", 4)) {
                            m_escape = true;
                        }
                    }
                } else if (cc == '<') {
                    if (m_contentModel == PCDATA || ((m_contentModel == RCDATA || m_contentModel == CDATA) && !m_escape)) {
                        m_state = TagOpen;
                        break;
                    }
                } else if (cc == '>') {
                    // "-->" switches the escape flag off again.
                    if ((m_contentModel == RCDATA || m_contentModel == CDATA) && m_escape) {
                        if (lastCharactersMatch("-->", 3)) {
                            m_escape = false;
                        }
                    }
                }
                emitCharacter(cc);
                m_source.advance();
                if (m_source.isEmpty()) {
                    return;
                }
                cc = m_source->unicode();
            }
            break;
        case EntityData:
            // should try to consume the entity but we only care about entities in attributes
            m_state = Data;
            break;
        case TagOpen:
            if (m_contentModel == RCDATA || m_contentModel == CDATA) {
                // Only a close tag is of interest inside RCDATA/CDATA content.
                if (cc == '/') {
                    m_state = CloseTagOpen;
                } else {
                    m_state = Data;
                    continue;
                }
            } else if (m_contentModel == PCDATA) {
                if (cc == '!') {
                    m_state = MarkupDeclarationOpen;
                } else if (cc == '/') {
                    m_state = CloseTagOpen;
                } else if (cc >= 'A' && cc <= 'Z') {
                    // Tag names are collected in lower case (+0x20 maps A-Z to a-z).
                    m_tagName.clear();
                    m_tagName.append(cc + 0x20);
                    m_closeTag = false;
                    m_state = TagName;
                } else if (cc >= 'a' && cc <= 'z') {
                    m_tagName.clear();
                    m_tagName.append(cc);
                    m_closeTag = false;
                    m_state = TagName;
                } else if (cc == '>') {
                    m_state = Data;
                } else if (cc == '?') {
                    m_state = BogusComment;
                } else {
                    m_state = Data;
                    continue;
                }
            }
            break;
        case CloseTagOpen:
            if (m_contentModel == RCDATA || m_contentModel == CDATA) {
                if (!m_lastStartTag.size()) {
                    m_state = Data;
                    continue;
                }
                // Need the whole tag name plus the character after it to decide;
                // wait for more input otherwise.
                if ((unsigned)m_source.length() < m_lastStartTag.size() + 1) {
                    return;
                }
                // Read ahead (lower-cased) and compare with the last start tag,
                // then put the characters back so they are tokenized normally.
                QVector<QChar> tmpString;
                QChar tmpChar = 0;
                bool match = true;
                for (unsigned n = 0; n < m_lastStartTag.size() + 1; n++) {
                    tmpChar = m_source->toLower();
                    if (n < m_lastStartTag.size() && tmpChar != m_lastStartTag[n]) {
                        match = false;
                    }
                    tmpString.append(tmpChar);
                    m_source.advance();
                }
                m_source.prepend(TokenizerString(QString(tmpString.data(), tmpString.size())));
                // tmpChar now holds the character following the name; it has to
                // end the tag name for this to count as the matching close tag.
                if (!match || (!isWhitespace(tmpChar) && tmpChar != '>' && tmpChar != '/')) {
                    m_state = Data;
                    continue;
                }
            }
            if (cc >= 'A' && cc <= 'Z') {
                m_tagName.clear();
                m_tagName.append(cc + 0x20);
                m_closeTag = true;
                m_state = TagName;
            } else if (cc >= 'a' && cc <= 'z') {
                m_tagName.clear();
                m_tagName.append(cc);
                m_closeTag = true;
                m_state = TagName;
            } else if (cc == '>') {
                m_state = Data;
            } else {
                m_state = BogusComment;
            }
            break;
        case TagName:
            while (1) {
                if (isWhitespace(cc)) {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '>') {
                    emitTag();
                    m_state = Data;
                    break;
                }
                if (cc == '/') {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc >= 'A' && cc <= 'Z') {
                    m_tagName.append(cc + 0x20);
                } else {
                    m_tagName.append(cc);
                }
                m_source.advance();
                if (m_source.isEmpty()) {
                    return;
                }
                cc = m_source->unicode();
            }
            break;
        case BeforeAttributeName:
            if (isWhitespace(cc))
                ;
            else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else if (cc >= 'A' && cc <= 'Z') {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc + 0x20);
                m_state = AttributeName;
            } else if (cc == '/')
                ;
            else {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc);
                m_state = AttributeName;
            }
            break;
        case AttributeName:
            while (1) {
                if (isWhitespace(cc)) {
                    m_state = AfterAttributeName;
                    break;
                }
                if (cc == '=') {
                    m_state = BeforeAttributeValue;
                    break;
                }
                if (cc == '>') {
                    emitTag();
                    m_state = Data;
                    break;
                }
                if (cc == '/') {
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc >= 'A' && cc <= 'Z') {
                    m_attributeName.append(cc + 0x20);
                } else {
                    m_attributeName.append(cc);
                }
                m_source.advance();
                if (m_source.isEmpty()) {
                    return;
                }
                cc = m_source->unicode();
            }
            break;
        case AfterAttributeName:
            if (isWhitespace(cc))
                ;
            else if (cc == '=') {
                m_state = BeforeAttributeValue;
            } else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else if (cc >= 'A' && cc <= 'Z') {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc + 0x20);
                m_state = AttributeName;
            } else if (cc == '/') {
                m_state = BeforeAttributeName;
            } else {
                m_attributeName.clear();
                m_attributeValue.clear();
                m_attributeName.append(cc);
                m_state = AttributeName;
            }
            break;
        case BeforeAttributeValue:
            if (isWhitespace(cc))
                ;
            else if (cc == '"') {
                m_state = AttributeValueDoubleQuoted;
            } else if (cc == '&') {
                // Re-examine the '&' in the unquoted-value state, which hands it
                // to the entity handling.
                m_state = AttributeValueUnquoted;
                continue;
            } else if (cc == '\'') {
                m_state = AttributeValueSingleQuoted;
            } else if (cc == '>') {
                emitTag();
                m_state = Data;
            } else {
                m_attributeValue.append(cc);
                m_state = AttributeValueUnquoted;
            }
            break;
        case AttributeValueDoubleQuoted:
            while (1) {
                if (cc == '"') {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    // Remember which value state to return to after the entity.
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                }
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty()) {
                    return;
                }
                cc = m_source->unicode();
            }
            break;
        case AttributeValueSingleQuoted:
            while (1) {
                if (cc == '\'') {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                }
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty()) {
                    return;
                }
                cc = m_source->unicode();
            }
            break;
        case AttributeValueUnquoted:
            while (1) {
                if (isWhitespace(cc)) {
                    processAttribute();
                    m_state = BeforeAttributeName;
                    break;
                }
                if (cc == '&') {
                    m_stateBeforeEntityInAttributeValue = m_state;
                    m_state = EntityInAttributeValue;
                    break;
                }
                if (cc == '>') {
                    processAttribute();
                    emitTag();
                    m_state = Data;
                    break;
                }
                m_attributeValue.append(cc);
                m_source.advance();
                if (m_source.isEmpty()) {
                    return;
                }
                cc = m_source->unicode();
            }
            break;
        case EntityInAttributeValue: {
            // The '&' has been consumed already; consumeEntity() reads what
            // follows it.
            bool notEnoughCharacters = false;
            unsigned entity = consumeEntity(m_source, notEnoughCharacters);
            if (notEnoughCharacters) {
                return;
            }
            if (entity > 0xFFFF) {
                // Code points above 0xFFFF are stored as a surrogate pair.
                m_attributeValue.append(U16_LEAD(entity));
                m_attributeValue.append(U16_TRAIL(entity));
            } else if (entity) {
                m_attributeValue.append(entity);
            } else {
                // Not an entity: keep the literal '&'.
                m_attributeValue.append('&');
            }
        }
        // Return to the attribute-value state that saw the '&', without
        // consuming another character.
        m_state = m_stateBeforeEntityInAttributeValue;
        continue;
        case BogusComment:
            while (1) {
                if (cc == '>') {
                    m_state = Data;
                    break;
                }
                m_source.advance();
                if (m_source.isEmpty()) {
                    return;
                }
                cc = m_source->unicode();
            }
            break;
        case MarkupDeclarationOpen: {
            if (cc == '-') {
                // Two characters are needed to recognise "--"; wait for more
                // input if only one is available.
                if (m_source.length() < 2) {
                    return;
                }
                m_source.advance();
                cc = m_source->unicode();
                if (cc == '-') {
                    m_state = CommentStart;
                } else {
                    m_state = BogusComment;
                    continue;
                }
                // If we cared about the DOCTYPE we would test to enter those states here
            } else {
                m_state = BogusComment;
                continue;
            }
            break;
        }
        case CommentStart:
            if (cc == '-') {
                m_state = CommentStartDash;
            } else if (cc == '>') {
                m_state = Data;
            } else {
                m_state = Comment;
            }
            break;
        case CommentStartDash:
            if (cc == '-') {
                m_state = CommentEnd;
            } else if (cc == '>') {
                m_state = Data;
            } else {
                m_state = Comment;
            }
            break;
        case Comment:
            while (1) {
                if (cc == '-') {
                    m_state = CommentEndDash;
                    break;
                }
                m_source.advance();
                if (m_source.isEmpty()) {
                    return;
                }
                cc = m_source->unicode();
            }
            break;
        case CommentEndDash:
            if (cc == '-') {
                m_state = CommentEnd;
            } else {
                m_state = Comment;
            }
            break;
        case CommentEnd:
            if (cc == '>') {
                m_state = Data;
            } else if (cc == '-')
                ;
            else {
                m_state = Comment;
            }
            break;
        }
        m_source.advance();
    }
}
0713
0714 void ProspectiveTokenizer::processAttribute()
0715 {
0716     DOMStringImpl tagNameDS(DOMStringImpl::ShallowCopy, m_tagName.data(), m_tagName.size());
0717     LocalName tagLocal = LocalName::fromString(&tagNameDS, IDS_NormalizeLower);
0718     uint tag = tagLocal.id();
0719
0720     switch (tag) {
0721     case ID_SCRIPT:
0722     case ID_IMAGE:
0723     case ID_IMG: {
0724         DOMStringImpl attrDS(DOMStringImpl::ShallowCopy, m_attributeName.data(), m_attributeName.size());
0725         LocalName attrLocal = LocalName::fromString(&attrDS, IDS_NormalizeLower);
0726         uint attribute = attrLocal.id();
0727         if (attribute == localNamePart(ATTR_SRC) && m_urlToLoad.isEmpty()) {
0728             m_urlToLoad = DOMString(m_attributeValue.data(), m_attributeValue.size()).trimSpaces();
0729         }
0730         break;
0731     }
0732     case ID_LINK: {
0733         DOMStringImpl attrDS(DOMStringImpl::ShallowCopy, m_attributeName.data(), m_attributeName.size());
0734         LocalName attrLocal = LocalName::fromString(&attrDS, IDS_NormalizeLower);
0735         uint attribute = attrLocal.id();
0736         if (attribute == localNamePart(ATTR_HREF) && m_urlToLoad.isEmpty()) {
0737             m_urlToLoad = DOMString(m_attributeValue.data(), m_attributeValue.size()).trimSpaces();
0738         } else if (attribute == localNamePart(ATTR_REL)) {
0739             DOMStringImpl *lowerAttribute = DOMStringImpl(DOMStringImpl::ShallowCopy, m_attributeValue.data(), m_attributeValue.size()).lower();
0740             QString val = lowerAttribute->string();
0741             delete lowerAttribute;
0742             m_linkIsStyleSheet = val.contains("stylesheet") && !val.contains("alternate") && !val.contains("icon");
0743         }
0744     }
0745     default:
0746         break;
0747     }
0748 }
0749
0750 inline void ProspectiveTokenizer::emitCharacter(QChar c)
0751 {
0752     if (m_contentModel == CDATA && m_lastStartTagId == ID_STYLE) {
0753         tokenizeCSS(c);
0754     }
0755 }
0756
0757 inline void ProspectiveTokenizer::tokenizeCSS(QChar c)
0758 {
0759     // We are just interested in @import rules, no need for real tokenization here
0760     // Searching for other types of resources is probably low payoff
0761     switch (m_cssState) {
0762     case CSSInitial:
0763         if (c == '@') {
0764             m_cssState = CSSRuleStart;
0765         } else if (c == '/') {
0766             m_cssState = CSSMaybeComment;
0767         }
0768         break;
0769     case CSSMaybeComment:
0770         if (c == '*') {
0771             m_cssState = CSSComment;
0772         } else {
0773             m_cssState = CSSInitial;
0774         }
0775         break;
0776     case CSSComment:
0777         if (c == '*') {
0778             m_cssState = CSSMaybeCommentEnd;
0779         }
0780         break;
0781     case CSSMaybeCommentEnd:
0782         if (c == '/') {
0783             m_cssState = CSSInitial;
0784         } else if (c == '*')
0785             ;
0786         else {
0787             m_cssState = CSSComment;
0788         }
0789         break;
0790     case CSSRuleStart:
0791         if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
0792             m_cssRule.clear();
0793             m_cssRuleValue.clear();
0794             m_cssRule.append(c);
0795             m_cssState = CSSRule;
0796         } else {
0797             m_cssState = CSSInitial;
0798         }
0799         break;
0800     case CSSRule:
0801         if (isWhitespace(c)) {
0802             m_cssState = CSSAfterRule;
0803         } else if (c == ';') {
0804             m_cssState = CSSInitial;
0805         } else {
0806             m_cssRule.append(c);
0807         }
0808         break;
0809     case CSSAfterRule:
0810         if (isWhitespace(c))
0811             ;
0812         else if (c == ';') {
0813             m_cssState = CSSInitial;
0814         } else {
0815             m_cssState = CSSRuleValue;
0816             m_cssRuleValue.append(c);
0817         }
0818         break;
0819     case CSSRuleValue:
0820         if (isWhitespace(c)) {
0821             m_cssState = CSSAferRuleValue;
0822         } else if (c == ';') {
0823             emitCSSRule();
0824             m_cssState = CSSInitial;
0825         } else {
0826             m_cssRuleValue.append(c);
0827         }
0828         break;
0829     case CSSAferRuleValue:
0830         if (isWhitespace(c))
0831             ;
0832         else if (c == ';') {
0833             emitCSSRule();
0834             m_cssState = CSSInitial;
0835         } else {
0836             // FIXME media rules
0837             m_cssState = CSSInitial;
0838         }
0839         break;
0840     }
0841 }
0842
0843 void ProspectiveTokenizer::emitTag()
0844 {
0845     if (m_closeTag) {
0846         m_contentModel = PCDATA;
0847         m_cssState = CSSInitial;
0848         clearLastCharacters();
0849         return;
0850     }
0851
0852     DOMStringImpl tagNameDS(DOMStringImpl::ShallowCopy, m_tagName.data(), m_tagName.size());
0853     LocalName tagLocal = LocalName::fromString(&tagNameDS, IDS_NormalizeLower);
0854     uint tag = tagLocal.id();
0855     m_lastStartTagId = tag;
0856     m_lastStartTag = m_tagName;
0857
0858     switch (tag) {
0859     case ID_TEXTAREA:
0860     case ID_TITLE:
0861         m_contentModel = RCDATA;
0862         break;
0863     case ID_STYLE:
0864     case ID_XMP:
0865     case ID_SCRIPT:
0866     case ID_IFRAME:
0867     case ID_NOEMBED:
0868     case ID_NOFRAMES:
0869         m_contentModel = CDATA;
0870         break;
0871     case ID_NOSCRIPT:
0872         // we wouldn't be here if scripts were disabled
0873         m_contentModel = CDATA;
0874         break;
0875     case ID_PLAINTEXT:
0876         m_contentModel = PLAINTEXT;
0877         break;
0878     default:
0879         m_contentModel = PCDATA;
0880     }
0881
0882     if (m_urlToLoad.isEmpty()) {
0883         m_linkIsStyleSheet = false;
0884         return;
0885     }
0886
0887     CachedObject *o = nullptr;
0888     if (tag == ID_SCRIPT) {
0889         o = m_document->docLoader()->requestScript(m_urlToLoad, m_document->part()->encoding());
0890     } else if (tag == ID_IMAGE || tag == ID_IMG) {
0891         o = m_document->docLoader()->requestImage(m_urlToLoad);
0892     } else if (tag == ID_LINK && m_linkIsStyleSheet) {
0893         o = m_document->docLoader()->requestStyleSheet(m_urlToLoad, m_document->part()->encoding());
0894     }
0895
0896     if (o) {
0897         m_document->docLoader()->registerPreload(o);
0898     }
0899
0900     m_urlToLoad = DOMString();
0901     m_linkIsStyleSheet = false;
0902 }
0903
0904 void ProspectiveTokenizer::emitCSSRule()
0905 {
0906     QString rule(m_cssRule.data(), m_cssRule.size());
0907     if (rule.toLower() == "import" && !m_cssRuleValue.isEmpty()) {
0908         DOMString value = DOMString(m_cssRuleValue.data(), m_cssRuleValue.size());
0909         DOMString url = parseURL(value);
0910         if (!url.isEmpty()) {
0911             m_document->docLoader()->registerPreload(m_document->docLoader()->requestStyleSheet(url, m_document->part()->encoding()));    // #### charset
0912         }
0913     }
0914     m_cssRule.clear();
0915     m_cssRuleValue.clear();
0916 }
0917