File indexing completed on 2024-06-23 04:03:33

0001 /*
0002  * parser.cpp - parse an XMPP "document"
0003  * Copyright (C) 2003  Justin Karneges
0004  *
0005  * This library is free software; you can redistribute it and/or
0006  * modify it under the terms of the GNU Lesser General Public
0007  * License as published by the Free Software Foundation; either
0008  * version 2.1 of the License, or (at your option) any later
0009  * version.
0010  *
0011  * This library is distributed in the hope that it will be useful,
0012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
0013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0014  * Lesser General Public License for more details.
0015  *
0016  * You should have received a copy of the GNU Lesser General Public
0017  * License along with this library; if not, write to the Free Software
0018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
0019  *
0020  */
0021 
0022 /*
0023   TODO:
0024 
0025   For XMPP::Parser to be "perfect", some things must be solved/changed in the
0026   Qt library:
0027 
0028   - Fix weird QDomElement::hasAttributeNS() bug (patch submitted to
0029     Trolltech on Aug 31st, 2003).
0030   - Fix weird behavior in QXmlSimpleReader of reporting endElement() when
0031     the '/' character of a self-closing tag is reached, instead of when
0032     the final '>' is reached.
0033   - Fix incremental parsing bugs in QXmlSimpleReader.  At the moment, the
0034     only bug I've found is related to attribute parsing, but there might
0035     be more (search for '###' in $QTDIR/src/xml/qxml.cpp).
0036 
0037   We have workarounds for all of the above problems in the code below.
0038 
0039   - Deal with the <?xml?> processing instruction as an event type, so that we
0040     can feed it back to the application properly.  Right now it is completely
0041     untrackable and is simply tacked into the first event's actualString.  We
0042     can't easily do this because QXmlSimpleReader eats an extra byte beyond
0043     the processing instruction before reporting it.
0044 
0045   - Make QXmlInputSource capable of accepting data incrementally, to ensure
0046     proper text encoding detection and processing over a network.  This is
0047     technically not a bug, as we have our own subclass below to do it, but
0048     it would be nice if Qt had this already.
0049 */
0050 
0051 #include "parser.h"
0052 
0053 #include <QTextCodec>
0054 #include <string.h>
0055 
0056 using namespace XMPP;
0057 
// True once a Parser constructor has run the QDomElement::hasAttributeNS()
// probe; the probe is executed at most once per process.
static bool qt_bug_check = false;
// Result of that probe: true when hasAttributeNS() reported an attribute on a
// freshly created element.  Only meaningful after qt_bug_check has been set.
static bool qt_bug_have = false;
0060 
0061 //----------------------------------------------------------------------------
0062 // StreamInput
0063 //----------------------------------------------------------------------------
0064 class StreamInput : public QXmlInputSource
0065 {
0066 public:
0067     StreamInput()
0068     {
0069         dec = 0;
0070         reset();
0071     }
0072 
0073     ~StreamInput() override
0074     {
0075         delete dec;
0076     }
0077 
0078     void reset() override
0079     {
0080         delete dec;
0081         dec = 0;
0082         in.resize(0);
0083         out = "";
0084         at = 0;
0085         paused = false;
0086         mightChangeEncoding = true;
0087         checkBad = true;
0088         last = QChar();
0089         v_encoding = "";
0090         resetLastData();
0091     }
0092 
0093     void resetLastData()
0094     {
0095         last_string = "";
0096     }
0097 
0098     QString lastString() const
0099     {
0100         return last_string;
0101     }
0102 
0103     void appendData(const QByteArray &a)
0104     {
0105         int oldsize = in.size();
0106         in.resize(oldsize + a.size());
0107         memcpy(in.data() + oldsize, a.data(), a.size());
0108         processBuf();
0109     }
0110 
0111     QChar lastRead()
0112     {
0113         return last;
0114     }
0115 
0116     QChar next() override
0117     {
0118         if(paused)
0119             return EndOfData;
0120         else
0121             return readNext();
0122     }
0123 
0124     // NOTE: setting 'peek' to true allows the same char to be read again,
0125     //       however this still advances the internal byte processing.
0126     QChar readNext(bool peek=false)
0127     {
0128         QChar c;
0129         if(mightChangeEncoding)
0130             c = EndOfData;
0131         else {
0132             if(out.isEmpty()) {
0133                 QString s;
0134                 if(!tryExtractPart(&s))
0135                     c = EndOfData;
0136                 else {
0137                     out = s;
0138                     c = out[0];
0139                 }
0140             }
0141             else
0142                 c = out[0];
0143             if(!peek)
0144                 out.remove(0, 1);
0145         }
0146         if(c == EndOfData) {
0147 #ifdef XMPP_PARSER_DEBUG
0148             printf("next() = EOD\n");
0149 #endif
0150         }
0151         else {
0152 #ifdef XMPP_PARSER_DEBUG
0153             printf("next() = [%c]\n", c.latin1());
0154 #endif
0155             last = c;
0156         }
0157 
0158         return c;
0159     }
0160 
0161     QByteArray unprocessed() const
0162     {
0163         QByteArray a;
0164         a.resize(in.size() - at);
0165         memcpy(a.data(), in.data() + at, a.size());
0166         return a;
0167     }
0168 
0169     void pause(bool b)
0170     {
0171         paused = b;
0172     }
0173 
0174     bool isPaused()
0175     {
0176         return paused;
0177     }
0178 
0179     QString encoding() const
0180     {
0181         return v_encoding;
0182     }
0183 
0184 private:
0185     QTextDecoder *dec;
0186     QByteArray in;
0187     QString out;
0188     int at;
0189     bool paused;
0190     bool mightChangeEncoding;
0191     QChar last;
0192     QString v_encoding;
0193     QString last_string;
0194     bool checkBad;
0195 
0196     void processBuf()
0197     {
0198 #ifdef XMPP_PARSER_DEBUG
0199         printf("processing.  size=%d, at=%d\n", in.size(), at);
0200 #endif
0201         if(!dec) {
0202             QTextCodec *codec = 0;
0203             uchar *p = (uchar *)in.data() + at;
0204             int size = in.size() - at;
0205 
0206             // do we have enough information to determine the encoding?
0207             if(size == 0)
0208                 return;
0209             bool utf16 = false;
0210             if(p[0] == 0xfe || p[0] == 0xff) {
0211                 // probably going to be a UTF-16 byte order mark
0212                 if(size < 2)
0213                     return;
0214                 if((p[0] == 0xfe && p[1] == 0xff) || (p[0] == 0xff && p[1] == 0xfe)) {
0215                     // ok it is UTF-16
0216                     utf16 = true;
0217                 }
0218             }
0219             if(utf16)
0220                 codec = QTextCodec::codecForMib(1000); // UTF-16
0221             else
0222                 codec = QTextCodec::codecForMib(106); // UTF-8
0223 
0224             v_encoding = codec->name();
0225             dec = codec->makeDecoder();
0226 
0227             // for utf16, put in the byte order mark
0228             if(utf16) {
0229                 out += dec->toUnicode((const char *)p, 2);
0230                 at += 2;
0231             }
0232         }
0233 
0234         if(mightChangeEncoding) {
0235             while(1) {
0236                 int n = out.indexOf('<');
0237                 if(n != -1) {
0238                     // we need a closing bracket
0239                     int n2 = out.indexOf('>', n);
0240                     if(n2 != -1) {
0241                         ++n2;
0242                         QString h = out.mid(n, n2-n);
0243                         QString enc = processXmlHeader(h);
0244                         QTextCodec *codec = 0;
0245                         if(!enc.isEmpty())
0246                             codec = QTextCodec::codecForName(enc.toLatin1());
0247 
0248                         // changing codecs
0249                         if(codec) {
0250                             v_encoding = codec->name();
0251                             delete dec;
0252                             dec = codec->makeDecoder();
0253                         }
0254                         mightChangeEncoding = false;
0255                         out.truncate(0);
0256                         at = 0;
0257                         resetLastData();
0258                         break;
0259                     }
0260                 }
0261                 QString s;
0262                 if(!tryExtractPart(&s))
0263                     break;
0264                 if(checkBad && checkForBadChars(s)) {
0265                     // go to the parser
0266                     mightChangeEncoding = false;
0267                     out.truncate(0);
0268                     at = 0;
0269                     resetLastData();
0270                     break;
0271                 }
0272                 out += s;
0273             }
0274         }
0275     }
0276 
0277     QString processXmlHeader(const QString &h)
0278     {
0279         if(h.left(5) != "<?xml")
0280             return "";
0281 
0282         int endPos = h.indexOf(">");
0283         int startPos = h.indexOf("encoding");
0284         if(startPos < endPos && startPos != -1) {
0285             QString encoding;
0286             do {
0287                 startPos++;
0288                 if(startPos > endPos) {
0289                     return "";
0290                 }
0291             } while(h[startPos] != '"' && h[startPos] != '\'');
0292             startPos++;
0293             while(h[startPos] != '"' && h[startPos] != '\'') {
0294                 encoding += h[startPos];
0295                 startPos++;
0296                 if(startPos > endPos) {
0297                     return "";
0298                 }
0299             }
0300             return encoding;
0301         }
0302         else
0303             return "";
0304     }
0305 
0306     bool tryExtractPart(QString *s)
0307     {
0308         int size = in.size() - at;
0309         if(size == 0)
0310             return false;
0311         uchar *p = (uchar *)in.data() + at;
0312         QString nextChars;
0313         while(1) {
0314             nextChars = dec->toUnicode((const char *)p, 1);
0315             ++p;
0316             ++at;
0317             if(!nextChars.isEmpty())
0318                 break;
0319             if(at == (int)in.size())
0320                 return false;
0321         }
0322         last_string += nextChars;
0323         *s = nextChars;
0324 
0325         // free processed data?
0326         if(at >= 1024) {
0327             char *p = in.data();
0328             int size = in.size() - at;
0329             memmove(p, p + at, size);
0330             in.resize(size);
0331             at = 0;
0332         }
0333 
0334         return true;
0335     }
0336 
0337     bool checkForBadChars(const QString &s)
0338     {
0339         int len = s.indexOf('<');
0340         if(len == -1)
0341             len = s.length();
0342         else
0343             checkBad = false;
0344         for(int n = 0; n < len; ++n) {
0345             if(!s.at(n).isSpace())
0346                 return true;
0347         }
0348         return false;
0349     }
0350 };
0351 
0352 
0353 //----------------------------------------------------------------------------
0354 // ParserHandler
0355 //----------------------------------------------------------------------------
0356 namespace XMPP
0357 {
0358     class ParserHandler : public QXmlDefaultHandler
0359     {
0360     public:
0361         ParserHandler(StreamInput *_in, QDomDocument *_doc)
0362         {
0363             in = _in;
0364             doc = _doc;
0365             needMore = false;
0366         }
0367 
0368         ~ParserHandler() override
0369         {
0370             while (!eventList.isEmpty()) {
0371                 delete eventList.takeFirst();
0372             }
0373         }
0374 
0375         bool startDocument() override
0376         {
0377             depth = 0;
0378             return true;
0379         }
0380 
0381         bool endDocument() override
0382         {
0383             return true;
0384         }
0385 
0386         bool startPrefixMapping(const QString &prefix, const QString &uri) override
0387         {
0388             if(depth == 0) {
0389                 nsnames += prefix;
0390                 nsvalues += uri;
0391             }
0392             return true;
0393         }
0394 
0395         bool startElement(const QString &namespaceURI, const QString &localName, const QString &qName, const QXmlAttributes &atts) override
0396         {
0397             if(depth == 0) {
0398                 Parser::Event *e = new Parser::Event;
0399                 QXmlAttributes a;
0400                 for(int n = 0; n < atts.length(); ++n) {
0401                     QString uri = atts.uri(n);
0402                     QString ln = atts.localName(n);
0403                     if(a.index(uri, ln) == -1)
0404                         a.append(atts.qName(n), uri, ln, atts.value(n));
0405                 }
0406                 e->setDocumentOpen(namespaceURI, localName, qName, a, nsnames, nsvalues);
0407                 nsnames.clear();
0408                 nsvalues.clear();
0409                 e->setActualString(in->lastString());
0410 
0411                 in->resetLastData();
0412                 eventList.append(e);
0413                 in->pause(true);
0414             }
0415             else {
0416                 QDomElement e = doc->createElementNS(namespaceURI, qName);
0417                 for(int n = 0; n < atts.length(); ++n) {
0418                     QString uri = atts.uri(n);
0419                     QString ln = atts.localName(n);
0420                     bool have;
0421                     if(!uri.isEmpty()) {
0422                         have = e.hasAttributeNS(uri, ln);
0423                         if(qt_bug_have)
0424                             have = !have;
0425                     }
0426                     else
0427                         have = e.hasAttribute(ln);
0428                     if(!have)
0429                         e.setAttributeNS(uri, atts.qName(n), atts.value(n));
0430                 }
0431 
0432                 if(depth == 1) {
0433                     elem = e;
0434                     current = e;
0435                 }
0436                 else {
0437                     current.appendChild(e);
0438                     current = e;
0439                 }
0440             }
0441             ++depth;
0442             return true;
0443         }
0444 
0445         bool endElement(const QString &namespaceURI, const QString &localName, const QString &qName) override
0446         {
0447             --depth;
0448             if(depth == 0) {
0449                 Parser::Event *e = new Parser::Event;
0450                 e->setDocumentClose(namespaceURI, localName, qName);
0451                 e->setActualString(in->lastString());
0452                 in->resetLastData();
0453                 eventList.append(e);
0454                 in->pause(true);
0455             }
0456             else {
0457                 // done with a depth 1 element?
0458                 if(depth == 1) {
0459                     Parser::Event *e = new Parser::Event;
0460                     e->setElement(elem);
0461                     e->setActualString(in->lastString());
0462                     in->resetLastData();
0463                     eventList.append(e);
0464                     in->pause(true);
0465 
0466                     elem = QDomElement();
0467                     current = QDomElement();
0468                 }
0469                 else
0470                     current = current.parentNode().toElement();
0471             }
0472 
0473             if(in->lastRead() == '/')
0474                 checkNeedMore();
0475 
0476             return true;
0477         }
0478 
0479         bool characters(const QString &str) override
0480         {
0481             if(depth >= 1) {
0482                 QString content = str;
0483                 if(content.isEmpty())
0484                     return true;
0485 
0486                 if(!current.isNull()) {
0487                     QDomText text = doc->createTextNode(content);
0488                     current.appendChild(text);
0489                 }
0490             }
0491             return true;
0492         }
0493 
0494         /*bool processingInstruction(const QString &target, const QString &data)
0495         {
0496             printf("Processing: [%s], [%s]\n", target.latin1(), data.latin1());
0497             in->resetLastData();
0498             return true;
0499         }*/
0500 
0501         void checkNeedMore()
0502         {
0503             // Here we will work around QXmlSimpleReader strangeness and self-closing tags.
0504             // The problem is that endElement() is called when the '/' is read, not when
0505             // the final '>' is read.  This is a potential problem when obtaining unprocessed
0506             // bytes from StreamInput after this event, as the '>' character will end up
0507             // in the unprocessed chunk.  To work around this, we need to advance StreamInput's
0508             // internal byte processing, but not the xml character data.  This way, the '>'
0509             // will get processed and will no longer be in the unprocessed return, but
0510             // QXmlSimpleReader can still read it.  To do this, we call StreamInput::readNext
0511             // with 'peek' mode.
0512             QChar c = in->readNext(true); // peek
0513             if(c == QXmlInputSource::EndOfData) {
0514                 needMore = true;
0515             }
0516             else {
0517                 // We'll assume the next char is a '>'.  If it isn't, then
0518                 // QXmlSimpleReader will deal with that problem on the next
0519                 // parse.  We don't need to take any action here.
0520                 needMore = false;
0521 
0522                 // there should have been a pending event
0523                 if (!eventList.isEmpty()) {
0524                     Parser::Event *e = eventList.first();
0525                     e->setActualString(e->actualString() + '>');
0526                     in->resetLastData();
0527                 }
0528             }
0529         }
0530 
0531         Parser::Event *takeEvent()
0532         {
0533             if(needMore)
0534                 return 0;
0535             if(eventList.isEmpty())
0536                 return 0;
0537 
0538             Parser::Event *e = eventList.takeFirst();
0539             in->pause(false);
0540             return e;
0541         }
0542 
0543         StreamInput *in;
0544         QDomDocument *doc;
0545         int depth;
0546         QStringList nsnames, nsvalues;
0547         QDomElement elem, current;
0548         QList<Parser::Event*> eventList;
0549         bool needMore;
0550     };
0551 }
0552 
0553 
0554 //----------------------------------------------------------------------------
0555 // Event
0556 //----------------------------------------------------------------------------
0557 class Parser::Event::Private
0558 {
0559 public:
0560     int type;
0561     QString ns, ln, qn;
0562     QXmlAttributes a;
0563     QDomElement e;
0564     QString str;
0565     QStringList nsnames, nsvalues;
0566 };
0567 
0568 Parser::Event::Event()
0569 {
0570     d = 0;
0571 }
0572 
0573 Parser::Event::Event(const Event &from)
0574 {
0575     d = 0;
0576     *this = from;
0577 }
0578 
0579 Parser::Event & Parser::Event::operator=(const Event &from)
0580 {
0581     delete d;
0582     d = 0;
0583     if(from.d)
0584         d = new Private(*from.d);
0585     return *this;
0586 }
0587 
0588 Parser::Event::~Event()
0589 {
0590     delete d;
0591 }
0592 
0593 bool Parser::Event::isNull() const
0594 {
0595     return (d ? false: true);
0596 }
0597 
0598 int Parser::Event::type() const
0599 {
0600     if(isNull())
0601         return -1;
0602     return d->type;
0603 }
0604 
0605 QString Parser::Event::nsprefix(const QString &s) const
0606 {
0607     QStringList::ConstIterator it = d->nsnames.constBegin();
0608     QStringList::ConstIterator it2 = d->nsvalues.constBegin();
0609     for(; it != d->nsnames.constEnd(); ++it) {
0610         if((*it) == s)
0611             return (*it2);
0612         ++it2;
0613     }
0614     return QString();
0615 }
0616 
0617 QString Parser::Event::namespaceURI() const
0618 {
0619     return d->ns;
0620 }
0621 
0622 QString Parser::Event::localName() const
0623 {
0624     return d->ln;
0625 }
0626 
0627 QString Parser::Event::qName() const
0628 {
0629     return d->qn;
0630 }
0631 
0632 QXmlAttributes Parser::Event::atts() const
0633 {
0634     return d->a;
0635 }
0636 
0637 QString Parser::Event::actualString() const
0638 {
0639     return d->str;
0640 }
0641 
0642 QDomElement Parser::Event::element() const
0643 {
0644     return d->e;
0645 }
0646 
0647 void Parser::Event::setDocumentOpen(const QString &namespaceURI, const QString &localName, const QString &qName, const QXmlAttributes &atts, const QStringList &nsnames, const QStringList &nsvalues)
0648 {
0649     if(!d)
0650         d = new Private;
0651     d->type = DocumentOpen;
0652     d->ns = namespaceURI;
0653     d->ln = localName;
0654     d->qn = qName;
0655     d->a = atts;
0656     d->nsnames = nsnames;
0657     d->nsvalues = nsvalues;
0658 }
0659 
0660 void Parser::Event::setDocumentClose(const QString &namespaceURI, const QString &localName, const QString &qName)
0661 {
0662     if(!d)
0663         d = new Private;
0664     d->type = DocumentClose;
0665     d->ns = namespaceURI;
0666     d->ln = localName;
0667     d->qn = qName;
0668 }
0669 
0670 void Parser::Event::setElement(const QDomElement &elem)
0671 {
0672     if(!d)
0673         d = new Private;
0674     d->type = Element;
0675     d->e = elem;
0676 }
0677 
0678 void Parser::Event::setError()
0679 {
0680     if(!d)
0681         d = new Private;
0682     d->type = Error;
0683 }
0684 
0685 void Parser::Event::setActualString(const QString &str)
0686 {
0687     d->str = str;
0688 }
0689 
0690 //----------------------------------------------------------------------------
0691 // Parser
0692 //----------------------------------------------------------------------------
0693 class Parser::Private
0694 {
0695 public:
0696     Private()
0697     {
0698         doc = 0;
0699         in = 0;
0700         handler = 0;
0701         reader = 0;
0702         reset();
0703     }
0704 
0705     ~Private()
0706     {
0707         reset(false);
0708     }
0709 
0710     void reset(bool create=true)
0711     {
0712         delete reader;
0713         delete handler;
0714         delete in;
0715         delete doc;
0716 
0717         if(create) {
0718             doc = new QDomDocument;
0719             in = new StreamInput;
0720             handler = new ParserHandler(in, doc);
0721             reader = new QXmlSimpleReader;
0722             reader->setContentHandler(handler);
0723 
0724             // initialize the reader
0725             in->pause(true);
0726             reader->parse(in, true);
0727             in->pause(false);
0728         }
0729     }
0730 
0731     QDomDocument *doc;
0732     StreamInput *in;
0733     ParserHandler *handler;
0734     QXmlSimpleReader *reader;
0735 };
0736 
0737 Parser::Parser()
0738 {
0739     d = new Private;
0740 
0741     // check for evil bug in Qt <= 3.2.1
0742     if(!qt_bug_check) {
0743         qt_bug_check = true;
0744         QDomElement e = d->doc->createElementNS("someuri", "somename");
0745         if(e.hasAttributeNS("someuri", "somename"))
0746             qt_bug_have = true;
0747         else
0748             qt_bug_have = false;
0749     }
0750 }
0751 
0752 Parser::~Parser()
0753 {
0754     delete d;
0755 }
0756 
0757 void Parser::reset()
0758 {
0759     d->reset();
0760 }
0761 
0762 void Parser::appendData(const QByteArray &a)
0763 {
0764     d->in->appendData(a);
0765 
0766     // if handler was waiting for more, give it a kick
0767     if(d->handler->needMore)
0768         d->handler->checkNeedMore();
0769 }
0770 
0771 Parser::Event Parser::readNext()
0772 {
0773     Event e;
0774     if(d->handler->needMore)
0775         return e;
0776     Event *ep = d->handler->takeEvent();
0777     if(!ep) {
0778         if(!d->reader->parseContinue()) {
0779             e.setError();
0780             return e;
0781         }
0782         ep = d->handler->takeEvent();
0783         if(!ep)
0784             return e;
0785     }
0786     e = *ep;
0787     delete ep;
0788     return e;
0789 }
0790 
0791 QByteArray Parser::unprocessed() const
0792 {
0793     return d->in->unprocessed();
0794 }
0795 
0796 QString Parser::encoding() const
0797 {
0798     return d->in->encoding();
0799 }