File indexing completed on 2024-05-05 12:16:48
0001 /** 0002 * This file is part of the DOM implementation for KDE. 0003 * 0004 * Copyright (C) 2000 Peter Kelly (pmk@post.com) 0005 * Copyright (C) 2003 Apple Computer, Inc. 0006 * 0007 * This library is free software; you can redistribute it and/or 0008 * modify it under the terms of the GNU Library General Public 0009 * License as published by the Free Software Foundation; either 0010 * version 2 of the License, or (at your option) any later version. 0011 * 0012 * This library is distributed in the hope that it will be useful, 0013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 0014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0015 * Library General Public License for more details. 0016 * 0017 * You should have received a copy of the GNU Library General Public License 0018 * along with this library; see the file COPYING.LIB. If not, write to 0019 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 0020 * Boston, MA 02110-1301, USA. 0021 */ 0022 0023 #include "xml_tokenizer.h" 0024 #include "xml/dom_docimpl.h" 0025 #include "xml/dom_textimpl.h" 0026 #include "xml/dom_xmlimpl.h" 0027 #include "html/html_tableimpl.h" 0028 #include "html/html_headimpl.h" 0029 #include "rendering/render_object.h" 0030 #include "misc/loader.h" 0031 0032 #include "khtmlview.h" 0033 #include "khtml_part.h" 0034 #include <QVariant> 0035 #include <klocalizedstring.h> 0036 #include <kencodingdetector.h> 0037 0038 // SVG includes 0039 #include "svg/SVGScriptElement.h" 0040 #include "svg/XLinkNames.h" 0041 0042 using namespace DOM; 0043 using namespace khtml; 0044 0045 XMLIncrementalSource::XMLIncrementalSource() 0046 : QXmlInputSource(), m_pos(0), m_unicode(nullptr), 0047 m_finished(false), m_paused(false) 0048 { 0049 } 0050 0051 void XMLIncrementalSource::fetchData() 0052 { 0053 //just a dummy to overwrite default behavior 0054 } 0055 0056 QChar XMLIncrementalSource::next() 0057 { 0058 if (m_finished) { 0059 return QXmlInputSource::EndOfDocument; 0060 } else if (m_paused || m_data.length() <= m_pos) { 0061 return QXmlInputSource::EndOfData; 0062 } else { 0063 return m_unicode[m_pos++]; 0064 } 0065 } 0066 0067 void XMLIncrementalSource::setData(const QString &str) 0068 { 0069 m_data = str; 0070 m_unicode = m_data.unicode(); 0071 m_pos = 0; 0072 if (!str.isEmpty()) { 0073 m_finished = false; 0074 } 0075 } 0076 void XMLIncrementalSource::setData(const QByteArray &data) 0077 { 0078 setData(fromRawData(data, true)); 0079 } 0080 0081 void XMLIncrementalSource::appendXML(const QString &str) 0082 { 0083 m_data += str; 0084 m_unicode = m_data.unicode(); 0085 } 0086 0087 QString XMLIncrementalSource::data() const 0088 { 0089 return m_data; 0090 } 0091 0092 void XMLIncrementalSource::setFinished(bool finished) 0093 { 0094 m_finished = finished; 0095 } 0096 0097 XMLHandler::XMLHandler(DocumentImpl *_doc, KHTMLView *_view) 0098 : errorLine(-1) 0099 { 0100 m_doc = _doc; 0101 m_view = _view; 0102 pushNode(_doc); 0103 } 0104 0105 XMLHandler::~XMLHandler() 0106 { 0107 } 0108 0109 void XMLHandler::pushNode(NodeImpl *node) 0110 { 0111 m_nodes.push(node); 0112 } 0113 0114 NodeImpl *XMLHandler::popNode() 0115 { 0116 return m_nodes.pop(); 0117 } 0118 0119 NodeImpl *XMLHandler::currentNode() const 0120 { 0121 if (m_nodes.isEmpty()) { 0122 return nullptr; 0123 } else { 0124 return m_nodes.top(); 0125 } 0126 } 0127 0128 QString XMLHandler::errorProtocol() 0129 { 0130 return errorProt; 0131 } 0132 0133 bool XMLHandler::startDocument() 0134 { 0135 // at the beginning of parsing: do some initialization 0136 errorProt = ""; 0137 state = StateInit; 0138 0139 return true; 0140 } 0141 0142 bool XMLHandler::startPrefixMapping(const QString &prefix, const QString &uri) 0143 { 0144 namespaceInfo[prefix].push(uri); 0145 return true; 0146 } 0147 0148 bool XMLHandler::endPrefixMapping(const QString &prefix) 0149 { 0150 if (namespaceInfo.contains(prefix)) { 0151 QStack<QString> &stack = namespaceInfo[prefix]; 0152 stack.pop(); 0153 if (stack.isEmpty()) { 0154 namespaceInfo.remove(prefix); 0155 } 0156 return true; 0157 } else { 0158 return false; 0159 } 0160 } 0161 0162 void XMLHandler::fixUpNSURI(QString &uri, const QString &qname) 0163 { 0164 /* QXml does not resolve the namespaces of attributes in the same 0165 tag that preceed the xmlns declaration. This fixes up that case */ 0166 if (uri.isEmpty() && qname.indexOf(':') != -1) { 0167 QXmlNamespaceSupport ns; 0168 QString localName, prefix; 0169 ns.splitName(qname, prefix, localName); 0170 if (namespaceInfo.contains(prefix)) { 0171 uri = namespaceInfo[prefix].top(); 0172 } 0173 } 0174 } 0175 0176 bool XMLHandler::startElement(const QString &namespaceURI, const QString & /*localName*/, 0177 const QString &qName, const QXmlAttributes &atts) 0178 { 0179 if (currentNode()->nodeType() == Node::TEXT_NODE) { 0180 exitText(); 0181 } 0182 0183 DOMString nsURI; 0184 if (!namespaceURI.isNull()) { 0185 nsURI = DOMString(namespaceURI); 0186 } else 0187 // No namespace declared, default to the no namespace 0188 { 0189 nsURI = DOMString(""); 0190 } 0191 ElementImpl *newElement = m_doc->createElementNS(nsURI, qName); 0192 if (!newElement) { 0193 return false; 0194 } 0195 int i; 0196 for (i = 0; i < atts.length(); i++) { 0197 int exceptioncode = 0; 0198 QString uriString = atts.uri(i); 0199 QString qnString = atts.qName(i); 0200 fixUpNSURI(uriString, qnString); 0201 DOMString uri(uriString); 0202 DOMString qn(qnString); 0203 DOMString val(atts.value(i)); 0204 newElement->setAttributeNS(uri, qn, val, exceptioncode); 0205 if (exceptioncode) { // exception setting attributes 0206 return false; 0207 } 0208 } 0209 0210 if (newElement->id() == ID_SCRIPT || newElement->id() == makeId(xhtmlNamespace, ID_SCRIPT)) { 0211 static_cast<HTMLScriptElementImpl *>(newElement)->setCreatedByParser(true); 0212 } 0213 0214 //this is tricky. in general the node doesn't have to attach to the one it's in. as far 0215 //as standards go this is wrong, but there's literally thousands of documents where 0216 //we see <p><ul>...</ul></p>. the following code is there for those cases. 0217 //when we can't attach to the currently holding us node we try to attach to its parent 0218 bool attached = false; 0219 for (NodeImpl *current = currentNode(); current; current = current->parent()) { 0220 attached = current->addChild(newElement); 0221 if (attached) { 0222 break; 0223 } 0224 } 0225 if (attached) { 0226 if (m_view && !newElement->attached() && !m_doc->hasPendingSheets()) { 0227 newElement->attach(); 0228 } 0229 pushNode(newElement); 0230 return true; 0231 } else { 0232 delete newElement; 0233 return false; 0234 } 0235 0236 // ### DOM spec states: "if there is no markup inside an element's content, the text is contained in a 0237 // single object implementing the Text interface that is the only child of the element."... do we 0238 // need to ensure that empty elements always have an empty text child? 0239 } 0240 0241 bool XMLHandler::endElement(const QString & /*namespaceURI*/, const QString & /*localName*/, const QString & /*qName*/) 0242 { 0243 if (currentNode()->nodeType() == Node::TEXT_NODE) { 0244 exitText(); 0245 } 0246 0247 NodeImpl *node = popNode(); 0248 if (node) { 0249 node->close(); 0250 while (currentNode() && currentNode()->implicitNode()) { //for the implicit HTMLTableSectionElementImpl 0251 popNode()->close(); 0252 } 0253 } else { 0254 return false; 0255 } 0256 0257 // if the node is a script element try to execute it immediately 0258 if ((node->id() == ID_SCRIPT) || (node->id() == makeId(xhtmlNamespace, ID_SCRIPT)) || node->id() == WebCore::SVGNames::scriptTag.id()) { 0259 static_cast<XMLTokenizer *>(m_doc->tokenizer())->executeScript(node); 0260 } 0261 0262 return true; 0263 } 0264 0265 bool XMLHandler::startCDATA() 0266 { 0267 if (currentNode()->nodeType() == Node::TEXT_NODE) { 0268 exitText(); 0269 } 0270 0271 int exceptioncode = 0; 0272 NodeImpl *newNode = m_doc->createCDATASection(new DOMStringImpl(""), exceptioncode); 0273 if (!exceptioncode && currentNode()->addChild(newNode)) { 0274 if (m_view && !newNode->attached() && !m_doc->hasPendingSheets()) { 0275 newNode->attach(); 0276 } 0277 pushNode(newNode); 0278 return true; 0279 } else { 0280 delete newNode; 0281 return false; 0282 } 0283 0284 } 0285 0286 bool XMLHandler::endCDATA() 0287 { 0288 popNode(); 0289 Q_ASSERT(currentNode()); 0290 return currentNode(); 0291 } 0292 0293 bool XMLHandler::characters(const QString &ch) 0294 { 0295 if (currentNode()->nodeType() == Node::TEXT_NODE || 0296 currentNode()->nodeType() == Node::CDATA_SECTION_NODE || 0297 enterText()) { 0298 int exceptioncode = 0; 0299 static_cast<TextImpl *>(currentNode())->appendData(ch, exceptioncode); 0300 if (exceptioncode) { 0301 return false; 0302 } 0303 return true; 0304 } else { 0305 // Don't worry about white-space violating DTD 0306 if (ch.trimmed().isEmpty()) { 0307 return true; 0308 } 0309 0310 return false; 0311 } 0312 0313 } 0314 0315 bool XMLHandler::comment(const QString &ch) 0316 { 0317 if (currentNode()->nodeType() == Node::TEXT_NODE) { 0318 exitText(); 0319 } 0320 // ### handle exceptions 0321 currentNode()->addChild(m_doc->createComment(new DOMStringImpl(ch.unicode(), ch.length()))); 0322 return true; 0323 } 0324 0325 bool XMLHandler::processingInstruction(const QString &target, const QString &data) 0326 { 0327 if (currentNode()->nodeType() == Node::TEXT_NODE) { 0328 exitText(); 0329 } 0330 0331 // Ignore XML target -- shouldn't be part of the DOM 0332 if (target == "xml") { 0333 return true; 0334 } 0335 0336 // ### handle exceptions 0337 ProcessingInstructionImpl *pi = 0338 m_doc->createProcessingInstruction(target, new DOMStringImpl(data.unicode(), data.length())); 0339 currentNode()->addChild(pi); 0340 pi->checkStyleSheet(); 0341 return true; 0342 } 0343 0344 QString XMLHandler::errorString() const 0345 { 0346 // ### Make better error-messages 0347 return i18n("the document is not in the correct file format"); 0348 } 0349 0350 bool XMLHandler::fatalError(const QXmlParseException &exception) 0351 { 0352 errorProt += i18n("fatal parsing error: %1 in line %2, column %3", 0353 exception.message(), 0354 exception.lineNumber(), 0355 exception.columnNumber()); 0356 0357 errorLine = exception.lineNumber(); 0358 errorCol = exception.columnNumber(); 0359 0360 return false; 0361 } 0362 0363 bool XMLHandler::enterText() 0364 { 0365 NodeImpl *newNode = m_doc->createTextNode(""); 0366 if (currentNode()->addChild(newNode)) { 0367 pushNode(newNode); 0368 return true; 0369 } else { 0370 delete newNode; 0371 return false; 0372 } 0373 } 0374 0375 void XMLHandler::exitText() 0376 { 0377 if (m_view && !currentNode()->attached() && !m_doc->hasPendingSheets()) { 0378 currentNode()->attach(); 0379 } 0380 popNode(); 0381 } 0382 0383 bool XMLHandler::attributeDecl(const QString &/*eName*/, const QString &/*aName*/, const QString &/*type*/, 0384 const QString &/*valueDefault*/, const QString &/*value*/) 0385 { 0386 // qt's xml parser (as of 2.2.3) does not currently give us values for type, valueDefault and 0387 // value. When it does, we can store these somewhere and have default attributes on elements 0388 return true; 0389 } 0390 0391 bool XMLHandler::externalEntityDecl(const QString &/*name*/, const QString &/*publicId*/, const QString &/*systemId*/) 0392 { 0393 // ### insert these too - is there anything special we have to do here? 0394 return true; 0395 } 0396 0397 bool XMLHandler::internalEntityDecl(const QString &name, const QString &value) 0398 { 0399 EntityImpl *e = new EntityImpl(m_doc, name); 0400 // ### further parse entities inside the value and add them as separate nodes (or entityreferences)? 0401 e->addChild(m_doc->createTextNode(new DOMStringImpl(value.unicode(), value.length()))); 0402 if (m_doc->doctype()) { 0403 static_cast<GenericRONamedNodeMapImpl *>(m_doc->doctype()->entities())->addNode(e); 0404 } 0405 return true; 0406 } 0407 0408 bool XMLHandler::notationDecl(const QString &/*name*/, const QString &/*publicId*/, const QString &/*systemId*/) 0409 { 0410 // ### FIXME 0411 // if (m_doc->document()->doctype()) { 0412 // NotationImpl *n = new NotationImpl(m_doc,name,publicId,systemId); 0413 // static_cast<GenericRONamedNodeMapImpl*>(m_doc->document()->doctype()->notations())->addNode(n); 0414 // } 0415 return true; 0416 } 0417 0418 bool XMLHandler::unparsedEntityDecl(const QString &/*name*/, const QString &/*publicId*/, 0419 const QString &/*systemId*/, const QString &/*notationName*/) 0420 { 0421 // ### 0422 return true; 0423 } 0424 0425 bool XMLHandler::startDTD(const QString &name, const QString &publicId, const QString &systemId) 0426 { 0427 int exceptionCode = 0; 0428 SharedPtr<DocumentTypeImpl> docType = m_doc->implementation()->createDocumentType(name, publicId, systemId, exceptionCode); 0429 0430 if (exceptionCode == 0) { 0431 docType->setDocument(m_doc); 0432 m_doc->appendChild(docType.get(), exceptionCode); 0433 } 0434 0435 return (exceptionCode == 0); 0436 } 0437 0438 bool XMLHandler::endDTD() 0439 { 0440 return true; 0441 } 0442 0443 //------------------------------------------------------------------------------ 0444 0445 XMLTokenizer::XMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view) 0446 : m_handler(_doc, _view) 0447 { 0448 m_doc = _doc; 0449 m_view = _view; 0450 m_cachedScript = nullptr; 0451 m_noErrors = true; 0452 m_executingScript = false; 0453 m_explicitFinishParsingNeeded = false; 0454 m_insideWrite = false; 0455 m_reader.setContentHandler(&m_handler); 0456 m_reader.setLexicalHandler(&m_handler); 0457 m_reader.setErrorHandler(&m_handler); 0458 m_reader.setDeclHandler(&m_handler); 0459 m_reader.setDTDHandler(&m_handler); 0460 m_reader.setFeature("http://xml.org/sax/features/namespace-prefixes", true); 0461 } 0462 0463 XMLTokenizer::~XMLTokenizer() 0464 { 0465 if (m_cachedScript) { 0466 m_cachedScript->deref(this); 0467 } 0468 } 0469 0470 void XMLTokenizer::begin() 0471 { 0472 // parse xml file 0473 m_reader.parse(&m_source, true); 0474 } 0475 0476 void XMLTokenizer::write(const TokenizerString &str, bool appendData) 0477 { 0478 if (!m_noErrors && appendData) { 0479 return; 0480 } 0481 0482 // check if we try to re-enter inside write() 0483 // if so buffer the data 0484 if (m_insideWrite) { 0485 m_bufferedData.append(str.toString()); 0486 return; 0487 } 0488 m_insideWrite = true; 0489 0490 if (appendData) { 0491 m_source.appendXML(str.toString()); 0492 0493 } else { 0494 m_source.setData(str.toString()); 0495 } 0496 m_noErrors = m_reader.parseContinue(); 0497 0498 if (m_doc->decoder() && m_doc->decoder()->decodedInvalidCharacters()) { 0499 // any invalid character spotted by the decoder is fatal, per XML 1.0 spec. Tested by Acid 3 - 70 0500 m_handler.fatalError(QXmlParseException(m_handler.errorString())); // ### FIXME: make that more informative after string freeze : i18n("input stream contains invalid characters") 0501 m_noErrors = false; 0502 finish(); 0503 return; 0504 } 0505 0506 // check if while parsing we tried to re-enter write() method so now we have some buffered data we need to write to document 0507 while (m_noErrors && !m_bufferedData.isEmpty()) { 0508 m_source.appendXML(m_bufferedData); 0509 m_bufferedData.clear(); 0510 m_noErrors = m_reader.parseContinue(); 0511 } 0512 // check if we need to call finish explicitly (see XMLTokenizer::finish() comment for details) 0513 if (m_explicitFinishParsingNeeded) { 0514 finish(); 0515 } 0516 m_insideWrite = false; 0517 } 0518 0519 void XMLTokenizer::end() 0520 { 0521 m_source.setFinished(true); 0522 //if ( m_noErrors ) 0523 //m_noErrors = m_reader.parseContinue(); 0524 emit finishedParsing(); 0525 } 0526 0527 void XMLTokenizer::finish() 0528 { 0529 if (m_executingScript) { 0530 // still executing script, it can happen because of reentrancy, e.g. when we have alert() inside script and we got the rest of the data 0531 m_explicitFinishParsingNeeded = true; 0532 return; 0533 } 0534 m_source.setFinished(true); 0535 if (!m_noErrors) { 0536 // An error occurred during parsing of the code. Display an error page to the user (the DOM 0537 // tree is created manually and includes an excerpt from the code where the error is located) 0538 0539 // ### for multiple error messages, display the code for each (can this happen?) 0540 0541 // Clear the document 0542 int exceptioncode = 0; 0543 while (m_doc->hasChildNodes()) { 0544 static_cast<NodeImpl *>(m_doc)->removeChild(m_doc->firstChild(), exceptioncode); 0545 } 0546 0547 QString line, errorLocPtr; 0548 if (m_handler.errorLine != -1) { 0549 QString xmlCode = m_source.data(); 0550 QTextStream stream(&xmlCode, QIODevice::ReadOnly); 0551 for (int lineno = 0; lineno < m_handler.errorLine - 1; lineno++) { 0552 stream.readLine(); 0553 } 0554 line = stream.readLine(); 0555 0556 for (long colno = 0; colno < m_handler.errorCol - 1; colno++) { 0557 errorLocPtr += ' '; 0558 } 0559 errorLocPtr += '^'; 0560 } 0561 0562 // Create elements for display 0563 DocumentImpl *doc = m_doc; 0564 NodeImpl *html = doc->createElementNS(XHTML_NAMESPACE, "html"); 0565 NodeImpl *body = doc->createElementNS(XHTML_NAMESPACE, "body"); 0566 NodeImpl *h1 = doc->createElementNS(XHTML_NAMESPACE, "h1"); 0567 NodeImpl *headingText = doc->createTextNode(i18n("XML parsing error")); 0568 NodeImpl *errorText = doc->createTextNode(m_handler.errorProtocol()); 0569 NodeImpl *hr = nullptr; 0570 NodeImpl *pre = nullptr; 0571 NodeImpl *lineText = nullptr; 0572 NodeImpl *errorLocText = nullptr; 0573 if (!line.isNull()) { 0574 hr = doc->createElementNS(XHTML_NAMESPACE, "hr"); 0575 pre = doc->createElementNS(XHTML_NAMESPACE, "pre"); 0576 lineText = doc->createTextNode(line + '\n'); 0577 errorLocText = doc->createTextNode(errorLocPtr); 0578 } 0579 0580 // Construct DOM tree. We ignore exceptions as we assume they will not be thrown here (due to the 0581 // fact we are using a known tag set) 0582 doc->appendChild(html, exceptioncode); 0583 html->appendChild(body, exceptioncode); 0584 body->appendChild(h1, exceptioncode); 0585 h1->appendChild(headingText, exceptioncode); 0586 body->appendChild(errorText, exceptioncode); 0587 body->appendChild(hr, exceptioncode); 0588 body->appendChild(pre, exceptioncode); 0589 if (pre) { 0590 pre->appendChild(lineText, exceptioncode); 0591 pre->appendChild(errorLocText, exceptioncode); 0592 } 0593 0594 // Close the renderers so that they update their display correctly 0595 // ### this should not be necessary, but requires changes in the rendering code... 0596 h1->close(); 0597 if (pre) { 0598 pre->close(); 0599 } 0600 body->close(); 0601 0602 m_doc->recalcStyle(NodeImpl::Inherit); 0603 m_doc->updateRendering(); 0604 } else { 0605 // Parsing was successful, all scripts have finished downloading and executing, 0606 // calculating the style for the document and close the last element 0607 m_doc->updateStyleSelector(); 0608 } 0609 0610 // finished parsing, call end() 0611 end(); 0612 } 0613 0614 void XMLTokenizer::notifyFinished(CachedObject *finishedObj) 0615 { 0616 // This is called when a script has finished loading that was requested from executeScript(). We execute 0617 // the script, and then continue parsing of the document 0618 if (finishedObj == m_cachedScript) { 0619 DOMString scriptSource = m_cachedScript->script(); 0620 m_cachedScript->deref(this); 0621 m_cachedScript = nullptr; 0622 if (m_view) { 0623 m_executingScript = true; 0624 m_view->part()->executeScript(DOM::Node(), scriptSource.string()); 0625 m_executingScript = false; 0626 } 0627 // should continue parsing here after we fetched and executed the script 0628 m_source.setPaused(false); 0629 m_reader.parseContinue(); 0630 } 0631 } 0632 0633 bool XMLTokenizer::isWaitingForScripts() const 0634 { 0635 return m_cachedScript != nullptr; 0636 } 0637 0638 void XMLTokenizer::executeScript(NodeImpl *node) 0639 { 0640 ElementImpl *script = static_cast<ElementImpl *>(node); 0641 DOMString scriptSrc; 0642 if (node->id() == WebCore::SVGNames::scriptTag.id()) { 0643 scriptSrc = script->getAttribute(WebCore::XLinkNames::hrefAttr.id()); 0644 } else { 0645 scriptSrc = script->getAttribute(ATTR_SRC); 0646 } 0647 0648 QString charset = script->getAttribute(ATTR_CHARSET).string(); 0649 0650 if (!scriptSrc.isEmpty()) { 0651 // we have a src attribute 0652 m_cachedScript = m_doc->docLoader()->requestScript(scriptSrc, charset); 0653 if (m_cachedScript) { 0654 // pause parsing until we got script 0655 m_source.setPaused(); 0656 m_cachedScript->ref(this); // the parsing will be continued once the script is fetched and executed in notifyFinished() 0657 return; 0658 } 0659 } else { 0660 // no src attribute - execute from contents of tag 0661 QString scriptCode = ""; 0662 NodeImpl *child; 0663 for (child = script->firstChild(); child; child = child->nextSibling()) { 0664 if ((child->nodeType() == Node::TEXT_NODE || child->nodeType() == Node::CDATA_SECTION_NODE) && 0665 static_cast<TextImpl *>(child)->string()) 0666 scriptCode += QString::fromRawData(static_cast<TextImpl *>(child)->string()->s, 0667 static_cast<TextImpl *>(child)->string()->l); 0668 } 0669 // the script cannot do document.write until we support incremental parsing 0670 // ### handle the case where the script deletes the node or redirects to 0671 // another page, etc. (also in notifyFinished()) 0672 // ### the script may add another script node after this one which should be executed 0673 if (m_view) { 0674 m_executingScript = true; 0675 m_view->part()->executeScript(DOM::Node(), scriptCode); 0676 m_executingScript = false; 0677 } 0678 } 0679 } 0680 0681 #include "moc_xml_tokenizer.cpp"