File indexing completed on 2024-06-23 04:03:33
0001 /* 0002 * parser.cpp - parse an XMPP "document" 0003 * Copyright (C) 2003 Justin Karneges 0004 * 0005 * This library is free software; you can redistribute it and/or 0006 * modify it under the terms of the GNU Lesser General Public 0007 * License as published by the Free Software Foundation; either 0008 * either version 2 0009 of the License, or (at your option) any later version.1 of the License, or (at your option) any later version. 0010 * 0011 * This library is distributed in the hope that it will be useful, 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 0014 * Lesser General Public License for more details. 0015 * 0016 * You should have received a copy of the GNU Lesser General Public 0017 * License along with this library; if not, write to the Free Software 0018 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 0019 * 0020 */ 0021 0022 /* 0023 TODO: 0024 0025 For XMPP::Parser to be "perfect", some things must be solved/changed in the 0026 Qt library: 0027 0028 - Fix weird QDomElement::haveAttributeNS() bug (patch submitted to 0029 Trolltech on Aug 31st, 2003). 0030 - Fix weird behavior in QXmlSimpleReader of reporting endElement() when 0031 the '/' character of a self-closing tag is reached, instead of when 0032 the final '>' is reached. 0033 - Fix incremental parsing bugs in QXmlSimpleReader. At the moment, the 0034 only bug I've found is related to attribute parsing, but there might 0035 be more (search for '###' in $QTDIR/src/xml/qxml.cpp). 0036 0037 We have workarounds for all of the above problems in the code below. 0038 0039 - Deal with the <?xml?> processing instruction as an event type, so that we 0040 can feed it back to the application properly. Right now it is completely 0041 untrackable and is simply tacked into the first event's actualString. We 0042 can't easily do this because QXmlSimpleReader eats an extra byte beyond 0043 the processing instruction before reporting it. 0044 0045 - Make QXmlInputSource capable of accepting data incrementally, to ensure 0046 proper text encoding detection and processing over a network. This is 0047 technically not a bug, as we have our own subclass below to do it, but 0048 it would be nice if Qt had this already. 0049 */ 0050 0051 #include "parser.h" 0052 0053 #include <QTextCodec> 0054 #include <string.h> 0055 0056 using namespace XMPP; 0057 0058 static bool qt_bug_check = false; 0059 static bool qt_bug_have; 0060 0061 //---------------------------------------------------------------------------- 0062 // StreamInput 0063 //---------------------------------------------------------------------------- 0064 class StreamInput : public QXmlInputSource 0065 { 0066 public: 0067 StreamInput() 0068 { 0069 dec = 0; 0070 reset(); 0071 } 0072 0073 ~StreamInput() override 0074 { 0075 delete dec; 0076 } 0077 0078 void reset() override 0079 { 0080 delete dec; 0081 dec = 0; 0082 in.resize(0); 0083 out = ""; 0084 at = 0; 0085 paused = false; 0086 mightChangeEncoding = true; 0087 checkBad = true; 0088 last = QChar(); 0089 v_encoding = ""; 0090 resetLastData(); 0091 } 0092 0093 void resetLastData() 0094 { 0095 last_string = ""; 0096 } 0097 0098 QString lastString() const 0099 { 0100 return last_string; 0101 } 0102 0103 void appendData(const QByteArray &a) 0104 { 0105 int oldsize = in.size(); 0106 in.resize(oldsize + a.size()); 0107 memcpy(in.data() + oldsize, a.data(), a.size()); 0108 processBuf(); 0109 } 0110 0111 QChar lastRead() 0112 { 0113 return last; 0114 } 0115 0116 QChar next() override 0117 { 0118 if(paused) 0119 return EndOfData; 0120 else 0121 return readNext(); 0122 } 0123 0124 // NOTE: setting 'peek' to true allows the same char to be read again, 0125 // however this still advances the internal byte processing. 0126 QChar readNext(bool peek=false) 0127 { 0128 QChar c; 0129 if(mightChangeEncoding) 0130 c = EndOfData; 0131 else { 0132 if(out.isEmpty()) { 0133 QString s; 0134 if(!tryExtractPart(&s)) 0135 c = EndOfData; 0136 else { 0137 out = s; 0138 c = out[0]; 0139 } 0140 } 0141 else 0142 c = out[0]; 0143 if(!peek) 0144 out.remove(0, 1); 0145 } 0146 if(c == EndOfData) { 0147 #ifdef XMPP_PARSER_DEBUG 0148 printf("next() = EOD\n"); 0149 #endif 0150 } 0151 else { 0152 #ifdef XMPP_PARSER_DEBUG 0153 printf("next() = [%c]\n", c.latin1()); 0154 #endif 0155 last = c; 0156 } 0157 0158 return c; 0159 } 0160 0161 QByteArray unprocessed() const 0162 { 0163 QByteArray a; 0164 a.resize(in.size() - at); 0165 memcpy(a.data(), in.data() + at, a.size()); 0166 return a; 0167 } 0168 0169 void pause(bool b) 0170 { 0171 paused = b; 0172 } 0173 0174 bool isPaused() 0175 { 0176 return paused; 0177 } 0178 0179 QString encoding() const 0180 { 0181 return v_encoding; 0182 } 0183 0184 private: 0185 QTextDecoder *dec; 0186 QByteArray in; 0187 QString out; 0188 int at; 0189 bool paused; 0190 bool mightChangeEncoding; 0191 QChar last; 0192 QString v_encoding; 0193 QString last_string; 0194 bool checkBad; 0195 0196 void processBuf() 0197 { 0198 #ifdef XMPP_PARSER_DEBUG 0199 printf("processing. size=%d, at=%d\n", in.size(), at); 0200 #endif 0201 if(!dec) { 0202 QTextCodec *codec = 0; 0203 uchar *p = (uchar *)in.data() + at; 0204 int size = in.size() - at; 0205 0206 // do we have enough information to determine the encoding? 0207 if(size == 0) 0208 return; 0209 bool utf16 = false; 0210 if(p[0] == 0xfe || p[0] == 0xff) { 0211 // probably going to be a UTF-16 byte order mark 0212 if(size < 2) 0213 return; 0214 if((p[0] == 0xfe && p[1] == 0xff) || (p[0] == 0xff && p[1] == 0xfe)) { 0215 // ok it is UTF-16 0216 utf16 = true; 0217 } 0218 } 0219 if(utf16) 0220 codec = QTextCodec::codecForMib(1000); // UTF-16 0221 else 0222 codec = QTextCodec::codecForMib(106); // UTF-8 0223 0224 v_encoding = codec->name(); 0225 dec = codec->makeDecoder(); 0226 0227 // for utf16, put in the byte order mark 0228 if(utf16) { 0229 out += dec->toUnicode((const char *)p, 2); 0230 at += 2; 0231 } 0232 } 0233 0234 if(mightChangeEncoding) { 0235 while(1) { 0236 int n = out.indexOf('<'); 0237 if(n != -1) { 0238 // we need a closing bracket 0239 int n2 = out.indexOf('>', n); 0240 if(n2 != -1) { 0241 ++n2; 0242 QString h = out.mid(n, n2-n); 0243 QString enc = processXmlHeader(h); 0244 QTextCodec *codec = 0; 0245 if(!enc.isEmpty()) 0246 codec = QTextCodec::codecForName(enc.toLatin1()); 0247 0248 // changing codecs 0249 if(codec) { 0250 v_encoding = codec->name(); 0251 delete dec; 0252 dec = codec->makeDecoder(); 0253 } 0254 mightChangeEncoding = false; 0255 out.truncate(0); 0256 at = 0; 0257 resetLastData(); 0258 break; 0259 } 0260 } 0261 QString s; 0262 if(!tryExtractPart(&s)) 0263 break; 0264 if(checkBad && checkForBadChars(s)) { 0265 // go to the parser 0266 mightChangeEncoding = false; 0267 out.truncate(0); 0268 at = 0; 0269 resetLastData(); 0270 break; 0271 } 0272 out += s; 0273 } 0274 } 0275 } 0276 0277 QString processXmlHeader(const QString &h) 0278 { 0279 if(h.left(5) != "<?xml") 0280 return ""; 0281 0282 int endPos = h.indexOf(">"); 0283 int startPos = h.indexOf("encoding"); 0284 if(startPos < endPos && startPos != -1) { 0285 QString encoding; 0286 do { 0287 startPos++; 0288 if(startPos > endPos) { 0289 return ""; 0290 } 0291 } while(h[startPos] != '"' && h[startPos] != '\''); 0292 startPos++; 0293 while(h[startPos] != '"' && h[startPos] != '\'') { 0294 encoding += h[startPos]; 0295 startPos++; 0296 if(startPos > endPos) { 0297 return ""; 0298 } 0299 } 0300 return encoding; 0301 } 0302 else 0303 return ""; 0304 } 0305 0306 bool tryExtractPart(QString *s) 0307 { 0308 int size = in.size() - at; 0309 if(size == 0) 0310 return false; 0311 uchar *p = (uchar *)in.data() + at; 0312 QString nextChars; 0313 while(1) { 0314 nextChars = dec->toUnicode((const char *)p, 1); 0315 ++p; 0316 ++at; 0317 if(!nextChars.isEmpty()) 0318 break; 0319 if(at == (int)in.size()) 0320 return false; 0321 } 0322 last_string += nextChars; 0323 *s = nextChars; 0324 0325 // free processed data? 0326 if(at >= 1024) { 0327 char *p = in.data(); 0328 int size = in.size() - at; 0329 memmove(p, p + at, size); 0330 in.resize(size); 0331 at = 0; 0332 } 0333 0334 return true; 0335 } 0336 0337 bool checkForBadChars(const QString &s) 0338 { 0339 int len = s.indexOf('<'); 0340 if(len == -1) 0341 len = s.length(); 0342 else 0343 checkBad = false; 0344 for(int n = 0; n < len; ++n) { 0345 if(!s.at(n).isSpace()) 0346 return true; 0347 } 0348 return false; 0349 } 0350 }; 0351 0352 0353 //---------------------------------------------------------------------------- 0354 // ParserHandler 0355 //---------------------------------------------------------------------------- 0356 namespace XMPP 0357 { 0358 class ParserHandler : public QXmlDefaultHandler 0359 { 0360 public: 0361 ParserHandler(StreamInput *_in, QDomDocument *_doc) 0362 { 0363 in = _in; 0364 doc = _doc; 0365 needMore = false; 0366 } 0367 0368 ~ParserHandler() override 0369 { 0370 while (!eventList.isEmpty()) { 0371 delete eventList.takeFirst(); 0372 } 0373 } 0374 0375 bool startDocument() override 0376 { 0377 depth = 0; 0378 return true; 0379 } 0380 0381 bool endDocument() override 0382 { 0383 return true; 0384 } 0385 0386 bool startPrefixMapping(const QString &prefix, const QString &uri) override 0387 { 0388 if(depth == 0) { 0389 nsnames += prefix; 0390 nsvalues += uri; 0391 } 0392 return true; 0393 } 0394 0395 bool startElement(const QString &namespaceURI, const QString &localName, const QString &qName, const QXmlAttributes &atts) override 0396 { 0397 if(depth == 0) { 0398 Parser::Event *e = new Parser::Event; 0399 QXmlAttributes a; 0400 for(int n = 0; n < atts.length(); ++n) { 0401 QString uri = atts.uri(n); 0402 QString ln = atts.localName(n); 0403 if(a.index(uri, ln) == -1) 0404 a.append(atts.qName(n), uri, ln, atts.value(n)); 0405 } 0406 e->setDocumentOpen(namespaceURI, localName, qName, a, nsnames, nsvalues); 0407 nsnames.clear(); 0408 nsvalues.clear(); 0409 e->setActualString(in->lastString()); 0410 0411 in->resetLastData(); 0412 eventList.append(e); 0413 in->pause(true); 0414 } 0415 else { 0416 QDomElement e = doc->createElementNS(namespaceURI, qName); 0417 for(int n = 0; n < atts.length(); ++n) { 0418 QString uri = atts.uri(n); 0419 QString ln = atts.localName(n); 0420 bool have; 0421 if(!uri.isEmpty()) { 0422 have = e.hasAttributeNS(uri, ln); 0423 if(qt_bug_have) 0424 have = !have; 0425 } 0426 else 0427 have = e.hasAttribute(ln); 0428 if(!have) 0429 e.setAttributeNS(uri, atts.qName(n), atts.value(n)); 0430 } 0431 0432 if(depth == 1) { 0433 elem = e; 0434 current = e; 0435 } 0436 else { 0437 current.appendChild(e); 0438 current = e; 0439 } 0440 } 0441 ++depth; 0442 return true; 0443 } 0444 0445 bool endElement(const QString &namespaceURI, const QString &localName, const QString &qName) override 0446 { 0447 --depth; 0448 if(depth == 0) { 0449 Parser::Event *e = new Parser::Event; 0450 e->setDocumentClose(namespaceURI, localName, qName); 0451 e->setActualString(in->lastString()); 0452 in->resetLastData(); 0453 eventList.append(e); 0454 in->pause(true); 0455 } 0456 else { 0457 // done with a depth 1 element? 0458 if(depth == 1) { 0459 Parser::Event *e = new Parser::Event; 0460 e->setElement(elem); 0461 e->setActualString(in->lastString()); 0462 in->resetLastData(); 0463 eventList.append(e); 0464 in->pause(true); 0465 0466 elem = QDomElement(); 0467 current = QDomElement(); 0468 } 0469 else 0470 current = current.parentNode().toElement(); 0471 } 0472 0473 if(in->lastRead() == '/') 0474 checkNeedMore(); 0475 0476 return true; 0477 } 0478 0479 bool characters(const QString &str) override 0480 { 0481 if(depth >= 1) { 0482 QString content = str; 0483 if(content.isEmpty()) 0484 return true; 0485 0486 if(!current.isNull()) { 0487 QDomText text = doc->createTextNode(content); 0488 current.appendChild(text); 0489 } 0490 } 0491 return true; 0492 } 0493 0494 /*bool processingInstruction(const QString &target, const QString &data) 0495 { 0496 printf("Processing: [%s], [%s]\n", target.latin1(), data.latin1()); 0497 in->resetLastData(); 0498 return true; 0499 }*/ 0500 0501 void checkNeedMore() 0502 { 0503 // Here we will work around QXmlSimpleReader strangeness and self-closing tags. 0504 // The problem is that endElement() is called when the '/' is read, not when 0505 // the final '>' is read. This is a potential problem when obtaining unprocessed 0506 // bytes from StreamInput after this event, as the '>' character will end up 0507 // in the unprocessed chunk. To work around this, we need to advance StreamInput's 0508 // internal byte processing, but not the xml character data. This way, the '>' 0509 // will get processed and will no longer be in the unprocessed return, but 0510 // QXmlSimpleReader can still read it. To do this, we call StreamInput::readNext 0511 // with 'peek' mode. 0512 QChar c = in->readNext(true); // peek 0513 if(c == QXmlInputSource::EndOfData) { 0514 needMore = true; 0515 } 0516 else { 0517 // We'll assume the next char is a '>'. If it isn't, then 0518 // QXmlSimpleReader will deal with that problem on the next 0519 // parse. We don't need to take any action here. 0520 needMore = false; 0521 0522 // there should have been a pending event 0523 if (!eventList.isEmpty()) { 0524 Parser::Event *e = eventList.first(); 0525 e->setActualString(e->actualString() + '>'); 0526 in->resetLastData(); 0527 } 0528 } 0529 } 0530 0531 Parser::Event *takeEvent() 0532 { 0533 if(needMore) 0534 return 0; 0535 if(eventList.isEmpty()) 0536 return 0; 0537 0538 Parser::Event *e = eventList.takeFirst(); 0539 in->pause(false); 0540 return e; 0541 } 0542 0543 StreamInput *in; 0544 QDomDocument *doc; 0545 int depth; 0546 QStringList nsnames, nsvalues; 0547 QDomElement elem, current; 0548 QList<Parser::Event*> eventList; 0549 bool needMore; 0550 }; 0551 } 0552 0553 0554 //---------------------------------------------------------------------------- 0555 // Event 0556 //---------------------------------------------------------------------------- 0557 class Parser::Event::Private 0558 { 0559 public: 0560 int type; 0561 QString ns, ln, qn; 0562 QXmlAttributes a; 0563 QDomElement e; 0564 QString str; 0565 QStringList nsnames, nsvalues; 0566 }; 0567 0568 Parser::Event::Event() 0569 { 0570 d = 0; 0571 } 0572 0573 Parser::Event::Event(const Event &from) 0574 { 0575 d = 0; 0576 *this = from; 0577 } 0578 0579 Parser::Event & Parser::Event::operator=(const Event &from) 0580 { 0581 delete d; 0582 d = 0; 0583 if(from.d) 0584 d = new Private(*from.d); 0585 return *this; 0586 } 0587 0588 Parser::Event::~Event() 0589 { 0590 delete d; 0591 } 0592 0593 bool Parser::Event::isNull() const 0594 { 0595 return (d ? false: true); 0596 } 0597 0598 int Parser::Event::type() const 0599 { 0600 if(isNull()) 0601 return -1; 0602 return d->type; 0603 } 0604 0605 QString Parser::Event::nsprefix(const QString &s) const 0606 { 0607 QStringList::ConstIterator it = d->nsnames.constBegin(); 0608 QStringList::ConstIterator it2 = d->nsvalues.constBegin(); 0609 for(; it != d->nsnames.constEnd(); ++it) { 0610 if((*it) == s) 0611 return (*it2); 0612 ++it2; 0613 } 0614 return QString(); 0615 } 0616 0617 QString Parser::Event::namespaceURI() const 0618 { 0619 return d->ns; 0620 } 0621 0622 QString Parser::Event::localName() const 0623 { 0624 return d->ln; 0625 } 0626 0627 QString Parser::Event::qName() const 0628 { 0629 return d->qn; 0630 } 0631 0632 QXmlAttributes Parser::Event::atts() const 0633 { 0634 return d->a; 0635 } 0636 0637 QString Parser::Event::actualString() const 0638 { 0639 return d->str; 0640 } 0641 0642 QDomElement Parser::Event::element() const 0643 { 0644 return d->e; 0645 } 0646 0647 void Parser::Event::setDocumentOpen(const QString &namespaceURI, const QString &localName, const QString &qName, const QXmlAttributes &atts, const QStringList &nsnames, const QStringList &nsvalues) 0648 { 0649 if(!d) 0650 d = new Private; 0651 d->type = DocumentOpen; 0652 d->ns = namespaceURI; 0653 d->ln = localName; 0654 d->qn = qName; 0655 d->a = atts; 0656 d->nsnames = nsnames; 0657 d->nsvalues = nsvalues; 0658 } 0659 0660 void Parser::Event::setDocumentClose(const QString &namespaceURI, const QString &localName, const QString &qName) 0661 { 0662 if(!d) 0663 d = new Private; 0664 d->type = DocumentClose; 0665 d->ns = namespaceURI; 0666 d->ln = localName; 0667 d->qn = qName; 0668 } 0669 0670 void Parser::Event::setElement(const QDomElement &elem) 0671 { 0672 if(!d) 0673 d = new Private; 0674 d->type = Element; 0675 d->e = elem; 0676 } 0677 0678 void Parser::Event::setError() 0679 { 0680 if(!d) 0681 d = new Private; 0682 d->type = Error; 0683 } 0684 0685 void Parser::Event::setActualString(const QString &str) 0686 { 0687 d->str = str; 0688 } 0689 0690 //---------------------------------------------------------------------------- 0691 // Parser 0692 //---------------------------------------------------------------------------- 0693 class Parser::Private 0694 { 0695 public: 0696 Private() 0697 { 0698 doc = 0; 0699 in = 0; 0700 handler = 0; 0701 reader = 0; 0702 reset(); 0703 } 0704 0705 ~Private() 0706 { 0707 reset(false); 0708 } 0709 0710 void reset(bool create=true) 0711 { 0712 delete reader; 0713 delete handler; 0714 delete in; 0715 delete doc; 0716 0717 if(create) { 0718 doc = new QDomDocument; 0719 in = new StreamInput; 0720 handler = new ParserHandler(in, doc); 0721 reader = new QXmlSimpleReader; 0722 reader->setContentHandler(handler); 0723 0724 // initialize the reader 0725 in->pause(true); 0726 reader->parse(in, true); 0727 in->pause(false); 0728 } 0729 } 0730 0731 QDomDocument *doc; 0732 StreamInput *in; 0733 ParserHandler *handler; 0734 QXmlSimpleReader *reader; 0735 }; 0736 0737 Parser::Parser() 0738 { 0739 d = new Private; 0740 0741 // check for evil bug in Qt <= 3.2.1 0742 if(!qt_bug_check) { 0743 qt_bug_check = true; 0744 QDomElement e = d->doc->createElementNS("someuri", "somename"); 0745 if(e.hasAttributeNS("someuri", "somename")) 0746 qt_bug_have = true; 0747 else 0748 qt_bug_have = false; 0749 } 0750 } 0751 0752 Parser::~Parser() 0753 { 0754 delete d; 0755 } 0756 0757 void Parser::reset() 0758 { 0759 d->reset(); 0760 } 0761 0762 void Parser::appendData(const QByteArray &a) 0763 { 0764 d->in->appendData(a); 0765 0766 // if handler was waiting for more, give it a kick 0767 if(d->handler->needMore) 0768 d->handler->checkNeedMore(); 0769 } 0770 0771 Parser::Event Parser::readNext() 0772 { 0773 Event e; 0774 if(d->handler->needMore) 0775 return e; 0776 Event *ep = d->handler->takeEvent(); 0777 if(!ep) { 0778 if(!d->reader->parseContinue()) { 0779 e.setError(); 0780 return e; 0781 } 0782 ep = d->handler->takeEvent(); 0783 if(!ep) 0784 return e; 0785 } 0786 e = *ep; 0787 delete ep; 0788 return e; 0789 } 0790 0791 QByteArray Parser::unprocessed() const 0792 { 0793 return d->in->unprocessed(); 0794 } 0795 0796 QString Parser::encoding() const 0797 { 0798 return d->in->encoding(); 0799 }