kioworkers/http/parsinghelpers.cpp

0001 /*
0002     This file is part of the KDE libraries
0003     SPDX-FileCopyrightText: 2008 Andreas Hartmetz <ahartmetz@gmail.com>
0004     SPDX-FileCopyrightText: 2010, 2011 Rolf Eike Beer <kde@opensource.sf-tec.de>
0005
0006     SPDX-License-Identifier: LGPL-2.0-or-later
0007 */
0008
0009 #include "parsinghelpers.h"
0010
0011 #include <ctype.h>
0012
0013 #include <QDebug>
0014 #include <QDir>
0015 #include <QTextCodec>
0016
// Move *pos forward over a run of blanks (spaces and tabs).
// The scan never goes past end; *pos is left untouched when it does not
// point at a blank or already lies at/after end.
static void skipSpace(const char input[], int *pos, int end)
{
    int cursor = *pos;
    for (; cursor < end; ++cursor) {
        const char c = input[cursor];
        if (c != ' ' && c != '\t') {
            break;
        }
    }
    *pos = cursor;
}
0027
// Move *pos to the first character of the following line, accepting any mix
// of CR and LF as line terminator.
// Returns false once the end of the header was reached (an empty line was
// consumed or the input is exhausted), true if another line follows.
static bool nextLine(const char input[], int *pos, int end)
{
    int cursor = *pos;

    // run to the end of the current line
    for (; cursor < end; ++cursor) {
        if (input[cursor] == '\r' || input[cursor] == '\n') {
            break;
        }
    }

    // consume line break characters until one kind has been seen twice
    int crSeen = 0;
    int lfSeen = 0;
    while (cursor < end && crSeen < 2 && lfSeen < 2) {
        const char c = input[cursor];
        if (c == '\r') {
            ++crSeen;
        } else if (c == '\n') {
            ++lfSeen;
        } else {
            break;
        }
        ++cursor;
    }

    // One kind was seen twice and the other only once: if the missing
    // counterpart follows, swallow it as well so that an empty line written
    // as \r\n\r\n (or \n\r\n\r) is consumed completely.
    if (cursor < end) {
        const bool crOutstanding = (lfSeen == 2 && crSeen == 1 && input[cursor] == '\r');
        const bool lfOutstanding = (crSeen == 2 && lfSeen == 1 && input[cursor] == '\n');
        if (crOutstanding || lfOutstanding) {
            ++cursor;
        }
    }

    *pos = cursor;
    return cursor < end && crSeen < 2 && lfSeen < 2;
}
0054
0055 // QByteArray::fromPercentEncoding() does not notify us about encoding errors so we need
0056 // to check here if this is valid at all.
0057 static bool isValidPercentEncoding(const QByteArray &data)
0058 {
0059     int i = 0;
0060     const int last = data.length() - 1;
0061     const char *d = data.constData();
0062
0063     while ((i = data.indexOf('%', i)) != -1) {
0064         if (i >= last - 2) {
0065             return false;
0066         }
0067         if (!isxdigit(d[i + 1])) {
0068             return false;
0069         }
0070         if (!isxdigit(d[i + 2])) {
0071             return false;
0072         }
0073         i++;
0074     }
0075
0076     return true;
0077 }
0078
0079 QByteArray TokenIterator::next()
0080 {
0081     const auto [startIdx, endIdx] = m_tokens[m_currentToken++];
0082     // fromRawData brings some speed advantage but also the requirement to keep the text buffer
0083     // around. this together with implicit sharing (you don't know where copies end up)
0084     // is dangerous!
0085     // return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first);
0086     return QByteArray(&m_buffer[startIdx], endIdx - startIdx);
0087 }
0088
0089 QByteArray TokenIterator::current() const
0090 {
0091     const auto [startIdx, endIdx] = m_tokens[m_currentToken - 1];
0092     // return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first);
0093     return QByteArray(&m_buffer[startIdx], endIdx - startIdx);
0094 }
0095
0096 QList<QByteArray> TokenIterator::all() const
0097 {
0098     QList<QByteArray> ret;
0099     ret.reserve(m_tokens.count());
0100     for (int i = 0; i < m_tokens.count(); i++) {
0101         const auto [startIdx, endIdx] = m_tokens[i];
0102         ret.append(QByteArray(&m_buffer[startIdx], endIdx - startIdx));
0103     }
0104     return ret;
0105 }
0106
0107 HeaderTokenizer::HeaderTokenizer(char *buffer)
0108     : m_buffer(buffer)
0109 {
0110     // add information about available headers and whether they have one or multiple,
0111     // comma-separated values.
0112
0113     // The following response header fields are from RFC 2616 unless otherwise specified.
0114     // Hint: search the web for e.g. 'http "accept-ranges header"' to find information about
0115     // a header field.
0116     static const HeaderFieldTemplate headerFieldTemplates[] = {
0117         {"accept-ranges", false},
0118         {"age", false},
0119         {"cache-control", true},
0120         {"connection", true},
0121         {"content-disposition", false}, // is multi-valued in a way, but with ";" separator!
0122         {"content-encoding", true},
0123         {"content-language", true},
0124         {"content-length", false},
0125         {"content-location", false},
0126         {"content-md5", false},
0127         {"content-type", false},
0128         {"date", false},
0129         {"dav", true}, // RFC 2518
0130         {"etag", false},
0131         {"expires", false},
0132         {"keep-alive", true}, // RFC 2068
0133         {"last-modified", false},
0134         {"link", false}, // RFC 2068, multi-valued with ";" separator
0135         {"location", false},
0136         {"p3p", true}, // https://www.w3.org/TR/P3P/
0137         {"pragma", true},
0138         {"proxy-authenticate", false}, // complicated multi-valuedness: quoted commas don't separate
0139         // multiple values. we handle this at a higher level.
0140         {"proxy-connection", true}, // unofficial but well-known; to avoid misunderstandings
0141         // when using "connection" when talking to a proxy.
0142         {"refresh", false}, // not sure, only found some mailing list posts mentioning it
0143         {"set-cookie", false}, // RFC 2109; the multi-valuedness seems to be usually achieved
0144         // by sending several instances of this field as opposed to
0145         // usually comma-separated lists with maybe multiple instances.
0146         {"transfer-encoding", true},
0147         {"upgrade", true},
0148         {"warning", true},
0149         {"www-authenticate", false} // see proxy-authenticate
0150     };
0151
0152     for (const HeaderFieldTemplate &ft : headerFieldTemplates) {
0153         insert(QByteArray(ft.name), HeaderField(ft.isMultiValued));
0154     }
0155 }
0156
0157 int HeaderTokenizer::tokenize(int begin, int end)
0158 {
0159     char *buf = m_buffer; // keep line length in check :/
0160     int idx = begin;
0161     int startIdx = begin; // multi-purpose start of current token
0162     bool multiValuedEndedWithComma = false; // did the last multi-valued line end with a comma?
0163     QByteArray headerKey;
0164     do {
0165         if (buf[idx] == ' ' || buf[idx] == '\t') {
0166             // line continuation; preserve startIdx except (see below)
0167             if (headerKey.isEmpty()) {
0168                 continue;
0169             }
0170             // turn CR/LF into spaces for later parsing convenience
0171             int backIdx = idx - 1;
0172             while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) {
0173                 buf[backIdx--] = ' ';
0174             }
0175
0176             // multiple values, comma-separated: add new value or continue previous?
0177             if (operator[](headerKey).isMultiValued) {
0178                 if (multiValuedEndedWithComma) {
0179                     // start new value; this is almost like no line continuation
0180                     skipSpace(buf, &idx, end);
0181                     startIdx = idx;
0182                 } else {
0183                     // continue previous value; this is tricky. unit tests to the rescue!
0184                     if (operator[](headerKey).beginEnd.last().startIndex == startIdx) {
0185                         // remove entry, it will be re-added because already idx != startIdx
0186                         operator[](headerKey).beginEnd.removeLast();
0187                     } else {
0188                         // no comma, no entry: the prev line was whitespace only - start new value
0189                         skipSpace(buf, &idx, end);
0190                         startIdx = idx;
0191                     }
0192                 }
0193             }
0194
0195         } else {
0196             // new field
0197             startIdx = idx;
0198             // also make sure that there is at least one char after the colon
0199             while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') {
0200                 buf[idx] = tolower(buf[idx]);
0201                 idx++;
0202             }
0203             if (buf[idx] != ':') {
0204                 // malformed line: no colon
0205                 headerKey.clear();
0206                 continue;
0207             }
0208             headerKey = QByteArray(&buf[startIdx], idx - startIdx);
0209             if (!contains(headerKey)) {
0210                 // we don't recognize this header line
0211                 headerKey.clear();
0212                 continue;
0213             }
0214             // skip colon & leading whitespace
0215             idx++;
0216             skipSpace(buf, &idx, end);
0217             startIdx = idx;
0218         }
0219
0220         // we have the name/key of the field, now parse the value
0221         if (!operator[](headerKey).isMultiValued) {
0222             // scan to end of line
0223             while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') {
0224                 idx++;
0225             }
0226             if (!operator[](headerKey).beginEnd.isEmpty()) {
0227                 // there already is an entry; are we just in a line continuation?
0228                 if (operator[](headerKey).beginEnd.last().startIndex == startIdx) {
0229                     // line continuation: delete previous entry and later insert a new, longer one.
0230                     operator[](headerKey).beginEnd.removeLast();
0231                 }
0232             }
0233             operator[](headerKey).beginEnd.append({startIdx, idx});
0234
0235         } else {
0236             // comma-separated list
0237             while (true) {
0238                 // skip one value
0239                 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') {
0240                     idx++;
0241                 }
0242                 if (idx != startIdx) {
0243                     operator[](headerKey).beginEnd.append({startIdx, idx});
0244                 }
0245                 multiValuedEndedWithComma = buf[idx] == ',';
0246                 // skip comma(s) and leading whitespace, if any respectively
0247                 while (idx < end && buf[idx] == ',') {
0248                     idx++;
0249                 }
0250                 skipSpace(buf, &idx, end);
0251                 // next value or end-of-line / end of header?
0252                 if (buf[idx] >= end || buf[idx] == '\r' || buf[idx] == '\n') {
0253                     break;
0254                 }
0255                 // next value
0256                 startIdx = idx;
0257             }
0258         }
0259     } while (nextLine(buf, &idx, end));
0260     return idx;
0261 }
0262
0263 TokenIterator HeaderTokenizer::iterator(const char *key) const
0264 {
0265     QByteArray keyBa = QByteArray::fromRawData(key, strlen(key));
0266     if (contains(keyBa)) {
0267         return TokenIterator(value(keyBa).beginEnd, m_buffer);
0268     } else {
0269         return TokenIterator(m_nullTokens, m_buffer);
0270     }
0271 }
0272
0273 static void skipLWS(const QString &str, int &pos)
0274 {
0275     while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t'))) {
0276         ++pos;
0277     }
0278 }
0279
// Characters that are rejected (via specialChar()) in the different parts of
// a Content-Disposition header.
// keep the common ending, this allows the compiler to join them
// forbidden in the disposition type
static const char typeSpecials[] = "{}*'%()<>@,;:\\\"/[]?=";
// forbidden in parameter names; '*' is not listed, so names may contain it
static const char attrSpecials[] = "'%()<>@,;:\\\"/[]?=";
// forbidden in unquoted parameter values
static const char valueSpecials[] = "()<>@,;:\\\"/[]?=";
0284
0285 static bool specialChar(const QChar &ch, const char *specials)
0286 {
0287     // WORKAROUND: According to RFC 2616, any character other than ascii
0288     // characters should NOT be allowed in unquoted content-disposition file
0289     // names. However, since none of the major browsers follow this rule, we do
0290     // the same thing here and allow all printable unicode characters. See
0291     // https://bugs.kde.org/show_bug.cgi?id=261223 for the details.
0292     if (!ch.isPrint()) {
0293         return true;
0294     }
0295
0296     for (int i = qstrlen(specials) - 1; i >= 0; i--) {
0297         if (ch == QLatin1Char(specials[i])) {
0298             return true;
0299         }
0300     }
0301
0302     return false;
0303 }
0304
0305 /**
0306  * read and parse the input until the given terminator
0307  * @param str input string to parse
0308  * @param term terminator
0309  * @param pos position marker in the input string
0310  * @param specials characters forbidden in this section
0311  * @return the next section or an empty string if it was invalid
0312  *
0313  * Extracts token-like input until terminator char or EOL.
0314  * Also skips over the terminator.
0315  *
0316  * pos is correctly incremented even if this functions returns
0317  * an empty string so this can be used to skip over invalid
0318  * parts and continue.
0319  */
0320 static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials)
0321 {
0322     QString out;
0323     skipLWS(str, pos);
0324     bool valid = true;
0325
0326     while (pos < str.length() && (str[pos] != term)) {
0327         out += str[pos];
0328         valid = (valid && !specialChar(str[pos], specials));
0329         ++pos;
0330     }
0331
0332     if (pos < str.length()) { // Stopped due to finding term
0333         ++pos;
0334     }
0335
0336     if (!valid) {
0337         return QString();
0338     }
0339
0340     // Remove trailing linear whitespace...
0341     while (out.endsWith(QLatin1Char(' ')) || out.endsWith(QLatin1Char('\t'))) {
0342         out.chop(1);
0343     }
0344
0345     if (out.contains(QLatin1Char(' '))) {
0346         out.clear();
0347     }
0348
0349     return out;
0350 }
0351
0352 // As above, but also handles quotes..
0353 // pos is set to -1 on parse error
0354 static QString extractMaybeQuotedUntil(const QString &str, int &pos)
0355 {
0356     const QChar term = QLatin1Char(';');
0357
0358     skipLWS(str, pos);
0359
0360     // Are we quoted?
0361     if (pos < str.length() && str[pos] == QLatin1Char('"')) {
0362         QString out;
0363
0364         // Skip the quote...
0365         ++pos;
0366
0367         // when quoted we also need an end-quote
0368         bool endquote = false;
0369
0370         // Parse until trailing quote...
0371         while (pos < str.length()) {
0372             if (str[pos] == QLatin1Char('\\') && pos + 1 < str.length()) {
0373                 // quoted-pair = "\" CHAR
0374                 out += str[pos + 1];
0375                 pos += 2; // Skip both...
0376             } else if (str[pos] == QLatin1Char('"')) {
0377                 ++pos;
0378                 endquote = true;
0379                 break;
0380             } else if (!str[pos].isPrint()) { // Don't allow CTL's RFC 2616 sec 2.2
0381                 break;
0382             } else {
0383                 out += str[pos];
0384                 ++pos;
0385             }
0386         }
0387
0388         if (!endquote) {
0389             pos = -1;
0390             return QString();
0391         }
0392
0393         // Skip until term..
0394         while (pos < str.length() && (str[pos] != term)) {
0395             if ((str[pos] != QLatin1Char(' ')) && (str[pos] != QLatin1Char('\t'))) {
0396                 pos = -1;
0397                 return QString();
0398             }
0399             ++pos;
0400         }
0401
0402         if (pos < str.length()) { // Stopped due to finding term
0403             ++pos;
0404         }
0405
0406         return out;
0407     } else {
0408         return extractUntil(str, term, pos, valueSpecials);
0409     }
0410 }
0411
0412 static QMap<QString, QString> contentDispositionParserInternal(const QString &disposition)
0413 {
0414     // qDebug() << "disposition: " << disposition;
0415     int pos = 0;
0416     const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower();
0417
0418     QMap<QString, QString> parameters;
0419     QMap<QString, QString> contparams; // all parameters that contain continuations
0420     QMap<QString, QString> encparams; // all parameters that have character encoding
0421
0422     // the type is invalid, the complete header is junk
0423     if (strDisposition.isEmpty()) {
0424         return parameters;
0425     }
0426
0427     parameters.insert(QStringLiteral("type"), strDisposition);
0428
0429     while (pos < disposition.length()) {
0430         QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower();
0431
0432         if (key.isEmpty()) {
0433             // parse error in this key: do not parse more, but add up
0434             // everything we already got
0435             // qDebug() << "parse error in key, abort parsing";
0436             break;
0437         }
0438
0439         QString val;
0440         if (key.endsWith(QLatin1Char('*'))) {
0441             val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials);
0442         } else {
0443             val = extractMaybeQuotedUntil(disposition, pos);
0444         }
0445
0446         if (val.isEmpty()) {
0447             if (pos == -1) {
0448                 // qDebug() << "parse error in value, abort parsing";
0449                 break;
0450             }
0451             continue;
0452         }
0453
0454         const int spos = key.indexOf(QLatin1Char('*'));
0455         if (spos == key.length() - 1) {
0456             key.chop(1);
0457             encparams.insert(key, val);
0458         } else if (spos >= 0) {
0459             contparams.insert(key, val);
0460         } else if (parameters.contains(key)) {
0461             // qDebug() << "duplicate key" << key << "found, ignoring everything more";
0462             parameters.remove(key);
0463             return parameters;
0464         } else {
0465             parameters.insert(key, val);
0466         }
0467     }
0468
0469     QMap<QString, QString>::iterator i = contparams.begin();
0470     while (i != contparams.end()) {
0471         QString key = i.key();
0472         int spos = key.indexOf(QLatin1Char('*'));
0473         bool hasencoding = false;
0474
0475         if (key.at(spos + 1) != QLatin1Char('0')) {
0476             ++i;
0477             continue;
0478         }
0479
0480         // no leading zeros allowed, so delete the junk
0481         int klen = key.length();
0482         if (klen > spos + 2) {
0483             // nothing but continuations and encodings may insert * into parameter name
0484             if ((klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*')))) {
0485                 // qDebug() << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2);
0486                 i = contparams.erase(i);
0487                 continue;
0488             }
0489             hasencoding = true;
0490         }
0491
0492         int seqnum = 1;
0493         QMap<QString, QString>::iterator partsi;
0494         // we do not need to care about encoding specifications: only the first
0495         // part is allowed to have one
0496         QString val = i.value();
0497
0498         key.chop(hasencoding ? 2 : 1);
0499
0500         while ((partsi = contparams.find(key + QString::number(seqnum))) != contparams.end()) {
0501             val += partsi.value();
0502             contparams.erase(partsi);
0503         }
0504
0505         i = contparams.erase(i);
0506
0507         key.chop(1);
0508         if (hasencoding) {
0509             encparams.insert(key, val);
0510         } else {
0511             if (parameters.contains(key)) {
0512                 // qDebug() << "duplicate key" << key << "found, ignoring everything more";
0513                 parameters.remove(key);
0514                 return parameters;
0515             }
0516
0517             parameters.insert(key, val);
0518         }
0519     }
0520
0521     for (QMap<QString, QString>::iterator i = encparams.begin(); i != encparams.end(); ++i) {
0522         QString val = i.value();
0523
0524         // RfC 2231 encoded character set in filename
0525         int spos = val.indexOf(QLatin1Char('\''));
0526         if (spos == -1) {
0527             continue;
0528         }
0529         int npos = val.indexOf(QLatin1Char('\''), spos + 1);
0530         if (npos == -1) {
0531             continue;
0532         }
0533
0534         const QStringView strView(val);
0535
0536         const QByteArray encodedVal = strView.mid(npos + 1).toLatin1();
0537
0538         if (!isValidPercentEncoding(encodedVal)) {
0539             continue;
0540         }
0541
0542         const QByteArray rawval = QByteArray::fromPercentEncoding(encodedVal);
0543
0544         const QStringView charset = strView.left(spos);
0545         if (charset.isEmpty() || (charset == QLatin1String("us-ascii"))) {
0546             bool valid = true;
0547             for (int j = rawval.length() - 1; (j >= 0) && valid; j--) {
0548                 valid = (rawval.at(j) >= 32);
0549             }
0550
0551             if (!valid) {
0552                 continue;
0553             }
0554             val = QString::fromLatin1(rawval.constData());
0555         } else {
0556             QTextCodec *codec = QTextCodec::codecForName(charset.toLatin1());
0557             if (!codec) {
0558                 continue;
0559             }
0560             val = codec->toUnicode(rawval);
0561         }
0562
0563         parameters.insert(i.key(), val);
0564     }
0565
0566     return parameters;
0567 }
0568
0569 static QMap<QString, QString> contentDispositionParser(const QString &disposition)
0570 {
0571     QMap<QString, QString> parameters = contentDispositionParserInternal(disposition);
0572
0573     const QLatin1String fn("filename");
0574     if (parameters.contains(fn)) {
0575         // Content-Disposition is not allowed to dictate directory
0576         // path, thus we extract the filename only.
0577         const QString val = QDir::toNativeSeparators(parameters[fn]);
0578         int slpos = val.lastIndexOf(QDir::separator());
0579
0580         if (slpos > -1) {
0581             parameters.insert(fn, val.mid(slpos + 1));
0582         }
0583     }
0584
0585     return parameters;
0586 }