File indexing completed on 2025-03-16 10:02:23
0001 /* 0002 This file is part of the KDE libraries 0003 SPDX-FileCopyrightText: 2008 Andreas Hartmetz <ahartmetz@gmail.com> 0004 SPDX-FileCopyrightText: 2010, 2011 Rolf Eike Beer <kde@opensource.sf-tec.de> 0005 0006 SPDX-License-Identifier: LGPL-2.0-or-later 0007 */ 0008 0009 #include "parsinghelpers.h" 0010 0011 #include <ctype.h> 0012 0013 #include <QDebug> 0014 #include <QDir> 0015 #include <QTextCodec> 0016 0017 // Advance *pos beyond spaces / tabs 0018 static void skipSpace(const char input[], int *pos, int end) 0019 { 0020 int idx = *pos; 0021 while (idx < end && (input[idx] == ' ' || input[idx] == '\t')) { 0022 idx++; 0023 } 0024 *pos = idx; 0025 return; 0026 } 0027 0028 // Advance *pos to start of next line while being forgiving about line endings. 0029 // Return false if the end of the header has been reached, true otherwise. 0030 static bool nextLine(const char input[], int *pos, int end) 0031 { 0032 int idx = *pos; 0033 while (idx < end && input[idx] != '\r' && input[idx] != '\n') { 0034 idx++; 0035 } 0036 int rCount = 0; 0037 int nCount = 0; 0038 while (idx < end && qMax(rCount, nCount) < 2 && (input[idx] == '\r' || input[idx] == '\n')) { 0039 input[idx] == '\r' ? rCount++ : nCount++; 0040 idx++; 0041 } 0042 if (idx < end && qMax(rCount, nCount) == 2 && qMin(rCount, nCount) == 1) { 0043 // if just one of the others is missing eat it too. 0044 // this ensures that conforming headers using the proper 0045 // \r\n sequence (and also \n\r) will be parsed correctly. 0046 if ((rCount == 1 && input[idx] == '\r') || (nCount == 1 && input[idx] == '\n')) { 0047 idx++; 0048 } 0049 } 0050 0051 *pos = idx; 0052 return idx < end && rCount < 2 && nCount < 2; 0053 } 0054 0055 // QByteArray::fromPercentEncoding() does not notify us about encoding errors so we need 0056 // to check here if this is valid at all. 0057 static bool isValidPercentEncoding(const QByteArray &data) 0058 { 0059 int i = 0; 0060 const int last = data.length() - 1; 0061 const char *d = data.constData(); 0062 0063 while ((i = data.indexOf('%', i)) != -1) { 0064 if (i >= last - 2) { 0065 return false; 0066 } 0067 if (!isxdigit(d[i + 1])) { 0068 return false; 0069 } 0070 if (!isxdigit(d[i + 2])) { 0071 return false; 0072 } 0073 i++; 0074 } 0075 0076 return true; 0077 } 0078 0079 QByteArray TokenIterator::next() 0080 { 0081 const auto [startIdx, endIdx] = m_tokens[m_currentToken++]; 0082 // fromRawData brings some speed advantage but also the requirement to keep the text buffer 0083 // around. this together with implicit sharing (you don't know where copies end up) 0084 // is dangerous! 0085 // return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); 0086 return QByteArray(&m_buffer[startIdx], endIdx - startIdx); 0087 } 0088 0089 QByteArray TokenIterator::current() const 0090 { 0091 const auto [startIdx, endIdx] = m_tokens[m_currentToken - 1]; 0092 // return QByteArray::fromRawData(&m_buffer[token.first], token.second - token.first); 0093 return QByteArray(&m_buffer[startIdx], endIdx - startIdx); 0094 } 0095 0096 QList<QByteArray> TokenIterator::all() const 0097 { 0098 QList<QByteArray> ret; 0099 ret.reserve(m_tokens.count()); 0100 for (int i = 0; i < m_tokens.count(); i++) { 0101 const auto [startIdx, endIdx] = m_tokens[i]; 0102 ret.append(QByteArray(&m_buffer[startIdx], endIdx - startIdx)); 0103 } 0104 return ret; 0105 } 0106 0107 HeaderTokenizer::HeaderTokenizer(char *buffer) 0108 : m_buffer(buffer) 0109 { 0110 // add information about available headers and whether they have one or multiple, 0111 // comma-separated values. 0112 0113 // The following response header fields are from RFC 2616 unless otherwise specified. 0114 // Hint: search the web for e.g. 'http "accept-ranges header"' to find information about 0115 // a header field. 0116 static const HeaderFieldTemplate headerFieldTemplates[] = { 0117 {"accept-ranges", false}, 0118 {"age", false}, 0119 {"cache-control", true}, 0120 {"connection", true}, 0121 {"content-disposition", false}, // is multi-valued in a way, but with ";" separator! 0122 {"content-encoding", true}, 0123 {"content-language", true}, 0124 {"content-length", false}, 0125 {"content-location", false}, 0126 {"content-md5", false}, 0127 {"content-type", false}, 0128 {"date", false}, 0129 {"dav", true}, // RFC 2518 0130 {"etag", false}, 0131 {"expires", false}, 0132 {"keep-alive", true}, // RFC 2068 0133 {"last-modified", false}, 0134 {"link", false}, // RFC 2068, multi-valued with ";" separator 0135 {"location", false}, 0136 {"p3p", true}, // https://www.w3.org/TR/P3P/ 0137 {"pragma", true}, 0138 {"proxy-authenticate", false}, // complicated multi-valuedness: quoted commas don't separate 0139 // multiple values. we handle this at a higher level. 0140 {"proxy-connection", true}, // unofficial but well-known; to avoid misunderstandings 0141 // when using "connection" when talking to a proxy. 0142 {"refresh", false}, // not sure, only found some mailing list posts mentioning it 0143 {"set-cookie", false}, // RFC 2109; the multi-valuedness seems to be usually achieved 0144 // by sending several instances of this field as opposed to 0145 // usually comma-separated lists with maybe multiple instances. 0146 {"transfer-encoding", true}, 0147 {"upgrade", true}, 0148 {"warning", true}, 0149 {"www-authenticate", false} // see proxy-authenticate 0150 }; 0151 0152 for (const HeaderFieldTemplate &ft : headerFieldTemplates) { 0153 insert(QByteArray(ft.name), HeaderField(ft.isMultiValued)); 0154 } 0155 } 0156 0157 int HeaderTokenizer::tokenize(int begin, int end) 0158 { 0159 char *buf = m_buffer; // keep line length in check :/ 0160 int idx = begin; 0161 int startIdx = begin; // multi-purpose start of current token 0162 bool multiValuedEndedWithComma = false; // did the last multi-valued line end with a comma? 0163 QByteArray headerKey; 0164 do { 0165 if (buf[idx] == ' ' || buf[idx] == '\t') { 0166 // line continuation; preserve startIdx except (see below) 0167 if (headerKey.isEmpty()) { 0168 continue; 0169 } 0170 // turn CR/LF into spaces for later parsing convenience 0171 int backIdx = idx - 1; 0172 while (backIdx >= begin && (buf[backIdx] == '\r' || buf[backIdx] == '\n')) { 0173 buf[backIdx--] = ' '; 0174 } 0175 0176 // multiple values, comma-separated: add new value or continue previous? 0177 if (operator[](headerKey).isMultiValued) { 0178 if (multiValuedEndedWithComma) { 0179 // start new value; this is almost like no line continuation 0180 skipSpace(buf, &idx, end); 0181 startIdx = idx; 0182 } else { 0183 // continue previous value; this is tricky. unit tests to the rescue! 0184 if (operator[](headerKey).beginEnd.last().startIndex == startIdx) { 0185 // remove entry, it will be re-added because already idx != startIdx 0186 operator[](headerKey).beginEnd.removeLast(); 0187 } else { 0188 // no comma, no entry: the prev line was whitespace only - start new value 0189 skipSpace(buf, &idx, end); 0190 startIdx = idx; 0191 } 0192 } 0193 } 0194 0195 } else { 0196 // new field 0197 startIdx = idx; 0198 // also make sure that there is at least one char after the colon 0199 while (idx < (end - 1) && buf[idx] != ':' && buf[idx] != '\r' && buf[idx] != '\n') { 0200 buf[idx] = tolower(buf[idx]); 0201 idx++; 0202 } 0203 if (buf[idx] != ':') { 0204 // malformed line: no colon 0205 headerKey.clear(); 0206 continue; 0207 } 0208 headerKey = QByteArray(&buf[startIdx], idx - startIdx); 0209 if (!contains(headerKey)) { 0210 // we don't recognize this header line 0211 headerKey.clear(); 0212 continue; 0213 } 0214 // skip colon & leading whitespace 0215 idx++; 0216 skipSpace(buf, &idx, end); 0217 startIdx = idx; 0218 } 0219 0220 // we have the name/key of the field, now parse the value 0221 if (!operator[](headerKey).isMultiValued) { 0222 // scan to end of line 0223 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n') { 0224 idx++; 0225 } 0226 if (!operator[](headerKey).beginEnd.isEmpty()) { 0227 // there already is an entry; are we just in a line continuation? 0228 if (operator[](headerKey).beginEnd.last().startIndex == startIdx) { 0229 // line continuation: delete previous entry and later insert a new, longer one. 0230 operator[](headerKey).beginEnd.removeLast(); 0231 } 0232 } 0233 operator[](headerKey).beginEnd.append({startIdx, idx}); 0234 0235 } else { 0236 // comma-separated list 0237 while (true) { 0238 // skip one value 0239 while (idx < end && buf[idx] != '\r' && buf[idx] != '\n' && buf[idx] != ',') { 0240 idx++; 0241 } 0242 if (idx != startIdx) { 0243 operator[](headerKey).beginEnd.append({startIdx, idx}); 0244 } 0245 multiValuedEndedWithComma = buf[idx] == ','; 0246 // skip comma(s) and leading whitespace, if any respectively 0247 while (idx < end && buf[idx] == ',') { 0248 idx++; 0249 } 0250 skipSpace(buf, &idx, end); 0251 // next value or end-of-line / end of header? 0252 if (buf[idx] >= end || buf[idx] == '\r' || buf[idx] == '\n') { 0253 break; 0254 } 0255 // next value 0256 startIdx = idx; 0257 } 0258 } 0259 } while (nextLine(buf, &idx, end)); 0260 return idx; 0261 } 0262 0263 TokenIterator HeaderTokenizer::iterator(const char *key) const 0264 { 0265 QByteArray keyBa = QByteArray::fromRawData(key, strlen(key)); 0266 if (contains(keyBa)) { 0267 return TokenIterator(value(keyBa).beginEnd, m_buffer); 0268 } else { 0269 return TokenIterator(m_nullTokens, m_buffer); 0270 } 0271 } 0272 0273 static void skipLWS(const QString &str, int &pos) 0274 { 0275 while (pos < str.length() && (str[pos] == QLatin1Char(' ') || str[pos] == QLatin1Char('\t'))) { 0276 ++pos; 0277 } 0278 } 0279 0280 // keep the common ending, this allows the compiler to join them 0281 static const char typeSpecials[] = "{}*'%()<>@,;:\\\"/[]?="; 0282 static const char attrSpecials[] = "'%()<>@,;:\\\"/[]?="; 0283 static const char valueSpecials[] = "()<>@,;:\\\"/[]?="; 0284 0285 static bool specialChar(const QChar &ch, const char *specials) 0286 { 0287 // WORKAROUND: According to RFC 2616, any character other than ascii 0288 // characters should NOT be allowed in unquoted content-disposition file 0289 // names. However, since none of the major browsers follow this rule, we do 0290 // the same thing here and allow all printable unicode characters. See 0291 // https://bugs.kde.org/show_bug.cgi?id=261223 for the details. 0292 if (!ch.isPrint()) { 0293 return true; 0294 } 0295 0296 for (int i = qstrlen(specials) - 1; i >= 0; i--) { 0297 if (ch == QLatin1Char(specials[i])) { 0298 return true; 0299 } 0300 } 0301 0302 return false; 0303 } 0304 0305 /** 0306 * read and parse the input until the given terminator 0307 * @param str input string to parse 0308 * @param term terminator 0309 * @param pos position marker in the input string 0310 * @param specials characters forbidden in this section 0311 * @return the next section or an empty string if it was invalid 0312 * 0313 * Extracts token-like input until terminator char or EOL. 0314 * Also skips over the terminator. 0315 * 0316 * pos is correctly incremented even if this functions returns 0317 * an empty string so this can be used to skip over invalid 0318 * parts and continue. 0319 */ 0320 static QString extractUntil(const QString &str, QChar term, int &pos, const char *specials) 0321 { 0322 QString out; 0323 skipLWS(str, pos); 0324 bool valid = true; 0325 0326 while (pos < str.length() && (str[pos] != term)) { 0327 out += str[pos]; 0328 valid = (valid && !specialChar(str[pos], specials)); 0329 ++pos; 0330 } 0331 0332 if (pos < str.length()) { // Stopped due to finding term 0333 ++pos; 0334 } 0335 0336 if (!valid) { 0337 return QString(); 0338 } 0339 0340 // Remove trailing linear whitespace... 0341 while (out.endsWith(QLatin1Char(' ')) || out.endsWith(QLatin1Char('\t'))) { 0342 out.chop(1); 0343 } 0344 0345 if (out.contains(QLatin1Char(' '))) { 0346 out.clear(); 0347 } 0348 0349 return out; 0350 } 0351 0352 // As above, but also handles quotes.. 0353 // pos is set to -1 on parse error 0354 static QString extractMaybeQuotedUntil(const QString &str, int &pos) 0355 { 0356 const QChar term = QLatin1Char(';'); 0357 0358 skipLWS(str, pos); 0359 0360 // Are we quoted? 0361 if (pos < str.length() && str[pos] == QLatin1Char('"')) { 0362 QString out; 0363 0364 // Skip the quote... 0365 ++pos; 0366 0367 // when quoted we also need an end-quote 0368 bool endquote = false; 0369 0370 // Parse until trailing quote... 0371 while (pos < str.length()) { 0372 if (str[pos] == QLatin1Char('\\') && pos + 1 < str.length()) { 0373 // quoted-pair = "\" CHAR 0374 out += str[pos + 1]; 0375 pos += 2; // Skip both... 0376 } else if (str[pos] == QLatin1Char('"')) { 0377 ++pos; 0378 endquote = true; 0379 break; 0380 } else if (!str[pos].isPrint()) { // Don't allow CTL's RFC 2616 sec 2.2 0381 break; 0382 } else { 0383 out += str[pos]; 0384 ++pos; 0385 } 0386 } 0387 0388 if (!endquote) { 0389 pos = -1; 0390 return QString(); 0391 } 0392 0393 // Skip until term.. 0394 while (pos < str.length() && (str[pos] != term)) { 0395 if ((str[pos] != QLatin1Char(' ')) && (str[pos] != QLatin1Char('\t'))) { 0396 pos = -1; 0397 return QString(); 0398 } 0399 ++pos; 0400 } 0401 0402 if (pos < str.length()) { // Stopped due to finding term 0403 ++pos; 0404 } 0405 0406 return out; 0407 } else { 0408 return extractUntil(str, term, pos, valueSpecials); 0409 } 0410 } 0411 0412 static QMap<QString, QString> contentDispositionParserInternal(const QString &disposition) 0413 { 0414 // qDebug() << "disposition: " << disposition; 0415 int pos = 0; 0416 const QString strDisposition = extractUntil(disposition, QLatin1Char(';'), pos, typeSpecials).toLower(); 0417 0418 QMap<QString, QString> parameters; 0419 QMap<QString, QString> contparams; // all parameters that contain continuations 0420 QMap<QString, QString> encparams; // all parameters that have character encoding 0421 0422 // the type is invalid, the complete header is junk 0423 if (strDisposition.isEmpty()) { 0424 return parameters; 0425 } 0426 0427 parameters.insert(QStringLiteral("type"), strDisposition); 0428 0429 while (pos < disposition.length()) { 0430 QString key = extractUntil(disposition, QLatin1Char('='), pos, attrSpecials).toLower(); 0431 0432 if (key.isEmpty()) { 0433 // parse error in this key: do not parse more, but add up 0434 // everything we already got 0435 // qDebug() << "parse error in key, abort parsing"; 0436 break; 0437 } 0438 0439 QString val; 0440 if (key.endsWith(QLatin1Char('*'))) { 0441 val = extractUntil(disposition, QLatin1Char(';'), pos, valueSpecials); 0442 } else { 0443 val = extractMaybeQuotedUntil(disposition, pos); 0444 } 0445 0446 if (val.isEmpty()) { 0447 if (pos == -1) { 0448 // qDebug() << "parse error in value, abort parsing"; 0449 break; 0450 } 0451 continue; 0452 } 0453 0454 const int spos = key.indexOf(QLatin1Char('*')); 0455 if (spos == key.length() - 1) { 0456 key.chop(1); 0457 encparams.insert(key, val); 0458 } else if (spos >= 0) { 0459 contparams.insert(key, val); 0460 } else if (parameters.contains(key)) { 0461 // qDebug() << "duplicate key" << key << "found, ignoring everything more"; 0462 parameters.remove(key); 0463 return parameters; 0464 } else { 0465 parameters.insert(key, val); 0466 } 0467 } 0468 0469 QMap<QString, QString>::iterator i = contparams.begin(); 0470 while (i != contparams.end()) { 0471 QString key = i.key(); 0472 int spos = key.indexOf(QLatin1Char('*')); 0473 bool hasencoding = false; 0474 0475 if (key.at(spos + 1) != QLatin1Char('0')) { 0476 ++i; 0477 continue; 0478 } 0479 0480 // no leading zeros allowed, so delete the junk 0481 int klen = key.length(); 0482 if (klen > spos + 2) { 0483 // nothing but continuations and encodings may insert * into parameter name 0484 if ((klen > spos + 3) || ((klen == spos + 3) && (key.at(spos + 2) != QLatin1Char('*')))) { 0485 // qDebug() << "removing invalid key " << key << "with val" << i.value() << key.at(spos + 2); 0486 i = contparams.erase(i); 0487 continue; 0488 } 0489 hasencoding = true; 0490 } 0491 0492 int seqnum = 1; 0493 QMap<QString, QString>::iterator partsi; 0494 // we do not need to care about encoding specifications: only the first 0495 // part is allowed to have one 0496 QString val = i.value(); 0497 0498 key.chop(hasencoding ? 2 : 1); 0499 0500 while ((partsi = contparams.find(key + QString::number(seqnum))) != contparams.end()) { 0501 val += partsi.value(); 0502 contparams.erase(partsi); 0503 } 0504 0505 i = contparams.erase(i); 0506 0507 key.chop(1); 0508 if (hasencoding) { 0509 encparams.insert(key, val); 0510 } else { 0511 if (parameters.contains(key)) { 0512 // qDebug() << "duplicate key" << key << "found, ignoring everything more"; 0513 parameters.remove(key); 0514 return parameters; 0515 } 0516 0517 parameters.insert(key, val); 0518 } 0519 } 0520 0521 for (QMap<QString, QString>::iterator i = encparams.begin(); i != encparams.end(); ++i) { 0522 QString val = i.value(); 0523 0524 // RfC 2231 encoded character set in filename 0525 int spos = val.indexOf(QLatin1Char('\'')); 0526 if (spos == -1) { 0527 continue; 0528 } 0529 int npos = val.indexOf(QLatin1Char('\''), spos + 1); 0530 if (npos == -1) { 0531 continue; 0532 } 0533 0534 const QStringView strView(val); 0535 0536 const QByteArray encodedVal = strView.mid(npos + 1).toLatin1(); 0537 0538 if (!isValidPercentEncoding(encodedVal)) { 0539 continue; 0540 } 0541 0542 const QByteArray rawval = QByteArray::fromPercentEncoding(encodedVal); 0543 0544 const QStringView charset = strView.left(spos); 0545 if (charset.isEmpty() || (charset == QLatin1String("us-ascii"))) { 0546 bool valid = true; 0547 for (int j = rawval.length() - 1; (j >= 0) && valid; j--) { 0548 valid = (rawval.at(j) >= 32); 0549 } 0550 0551 if (!valid) { 0552 continue; 0553 } 0554 val = QString::fromLatin1(rawval.constData()); 0555 } else { 0556 QTextCodec *codec = QTextCodec::codecForName(charset.toLatin1()); 0557 if (!codec) { 0558 continue; 0559 } 0560 val = codec->toUnicode(rawval); 0561 } 0562 0563 parameters.insert(i.key(), val); 0564 } 0565 0566 return parameters; 0567 } 0568 0569 static QMap<QString, QString> contentDispositionParser(const QString &disposition) 0570 { 0571 QMap<QString, QString> parameters = contentDispositionParserInternal(disposition); 0572 0573 const QLatin1String fn("filename"); 0574 if (parameters.contains(fn)) { 0575 // Content-Disposition is not allowed to dictate directory 0576 // path, thus we extract the filename only. 0577 const QString val = QDir::toNativeSeparators(parameters[fn]); 0578 int slpos = val.lastIndexOf(QDir::separator()); 0579 0580 if (slpos > -1) { 0581 parameters.insert(fn, val.mid(slpos + 1)); 0582 } 0583 } 0584 0585 return parameters; 0586 }