src/core/dataprotocol.cpp

0001 /*
0002     Implementation of the data protocol (rfc 2397)
0003
0004     SPDX-FileCopyrightText: 2002, 2003 Leo Savernik <l.savernik@aon.at>
0005
0006     SPDX-License-Identifier: LGPL-2.0-only
0007 */
0008
0009 #include "dataprotocol_p.h"
0010
0011 #include "global.h"
0012
0013 #include <QByteArray>
0014 #include <QTextCodec>
0015
0016 using namespace KIO;
0017
0018 /** structure containing header information */
0019 struct DataHeader {
0020     QString mime_type; // MIME type of content (lowercase)
0021     MetaData attributes; // attribute/value pairs (attribute lowercase,
0022     //  value unchanged)
0023     bool is_base64; // true if data is base64 encoded
0024     QByteArray url; // reference to decoded url
0025     int data_offset; // zero-indexed position within url
0026     // where the real data begins. May point beyond
0027     // the end to indicate that there is no data
0028 };
0029
0030 /** returns the position of the first occurrence of any of the given
0031  * characters @p c1 or comma (',') or semicolon (';') or buf.length()
0032  * if none is contained.
0033  *
0034  * @param buf buffer where to look for c
0035  * @param begin zero-indexed starting position
0036  * @param c1 character to find or '\0' to ignore
0037  */
0038 static int find(const QByteArray &buf, int begin, const char c1)
0039 {
0040     static const char comma = ',';
0041     static const char semicolon = ';';
0042     int pos = begin;
0043     int size = buf.length();
0044     while (pos < size) {
0045         const char ch = buf[pos];
0046         if (ch == comma || ch == semicolon || (c1 != '\0' && ch == c1)) {
0047             break;
0048         }
0049         pos++;
0050     } /*wend*/
0051     return pos;
0052 }
0053
0054 /** extracts the string between the current position @p pos and the first
0055  * occurrence of either @p c1 or comma (',') or semicolon (';') exclusively
0056  * and updates @p pos to point at the found delimiter or at the end of the
0057  * buffer if neither character occurred.
0058  * @param buf buffer where to look for
0059  * @param pos zero-indexed position within buffer
0060  * @param c1 character to find or '\0' to ignore
0061  */
0062 static inline QString extract(const QByteArray &buf, int &pos, const char c1 = '\0')
0063 {
0064     int oldpos = pos;
0065     pos = find(buf, oldpos, c1);
0066     return QString::fromLatin1(buf.mid(oldpos, pos - oldpos));
0067 }
0068
0069 /** ignores all whitespaces
0070  * @param buf buffer to operate on
0071  * @param pos position to shift to first non-whitespace character
0072  *  Upon return @p pos will either point to the first non-whitespace
0073  *  character or to the end of the buffer.
0074  */
0075 static inline void ignoreWS(const QByteArray &buf, int &pos)
0076 {
0077     int size = buf.length();
0078     while (pos < size && (buf[pos] == ' ' || buf[pos] == '\t')) {
0079         ++pos;
0080     }
0081 }
0082
0083 /** parses a quoted string as per rfc 822.
0084  *
0085  * If trailing quote is missing, the whole rest of the buffer is returned.
0086  * @param buf buffer to operate on
0087  * @param pos position pointing to the leading quote
0088  * @return the extracted string. @p pos will be updated to point to the
0089  *  character following the trailing quote.
0090  */
0091 static QString parseQuotedString(const QByteArray &buf, int &pos)
0092 {
0093     int size = buf.length();
0094     QString res;
0095     res.reserve(size); // can't be larger than buf
0096     pos++; // jump over leading quote
0097     bool escaped = false; // if true means next character is literal
0098     bool parsing = true; // true as long as end quote not found
0099     while (parsing && pos < size) {
0100         const QChar ch = QLatin1Char(buf[pos++]);
0101         if (escaped) {
0102             res += ch;
0103             escaped = false;
0104         } else {
0105             switch (ch.unicode()) {
0106             case '"':
0107                 parsing = false;
0108                 break;
0109             case '\\':
0110                 escaped = true;
0111                 break;
0112             default:
0113                 res += ch;
0114                 break;
0115             } /*end switch*/
0116         } /*end if*/
0117     } /*wend*/
0118     res.squeeze();
0119     return res;
0120 }
0121
0122 /** parses the header of a data url
0123  * @param url the data url
0124  * @param mimeOnly if the only interesting information is the MIME type
0125  * @return DataHeader structure with the header information
0126  */
0127 static DataHeader parseDataHeader(const QUrl &url, const bool mimeOnly)
0128 {
0129     DataHeader header_info;
0130
0131     // initialize header info members
0132     header_info.mime_type = QStringLiteral("text/plain");
0133     header_info.attributes.insert(QStringLiteral("charset"), QStringLiteral("us-ascii"));
0134     header_info.is_base64 = false;
0135
0136     // decode url and save it
0137     const QByteArray &raw_url = header_info.url = QByteArray::fromPercentEncoding(url.path(QUrl::FullyEncoded).toLatin1());
0138     const int raw_url_len = raw_url.length();
0139
0140     header_info.data_offset = 0;
0141
0142     // read MIME type
0143     if (raw_url_len == 0) {
0144         return header_info;
0145     }
0146     const QString mime_type = extract(raw_url, header_info.data_offset).trimmed();
0147     if (!mime_type.isEmpty()) {
0148         header_info.mime_type = mime_type;
0149     }
0150     if (mimeOnly) {
0151         return header_info;
0152     }
0153
0154     if (header_info.data_offset >= raw_url_len) {
0155         return header_info;
0156     }
0157     // jump over delimiter token and return if data reached
0158     if (raw_url[header_info.data_offset++] == ',') {
0159         return header_info;
0160     }
0161
0162     // read all attributes and store them
0163     bool data_begin_reached = false;
0164     while (!data_begin_reached && header_info.data_offset < raw_url_len) {
0165         // read attribute
0166         const QString attribute = extract(raw_url, header_info.data_offset, '=').trimmed();
0167         if (header_info.data_offset >= raw_url_len || raw_url[header_info.data_offset] != '=') {
0168             // no assignment, must be base64 option
0169             if (attribute == QLatin1String("base64")) {
0170                 header_info.is_base64 = true;
0171             }
0172         } else {
0173             header_info.data_offset++; // jump over '=' token
0174
0175             // read value
0176             ignoreWS(raw_url, header_info.data_offset);
0177             if (header_info.data_offset >= raw_url_len) {
0178                 return header_info;
0179             }
0180
0181             QString value;
0182             if (raw_url[header_info.data_offset] == '"') {
0183                 value = parseQuotedString(raw_url, header_info.data_offset);
0184                 ignoreWS(raw_url, header_info.data_offset);
0185             } else {
0186                 value = extract(raw_url, header_info.data_offset).trimmed();
0187             }
0188
0189             // add attribute to map
0190             header_info.attributes[attribute.toLower()] = value;
0191
0192         } /*end if*/
0193         if (header_info.data_offset < raw_url_len && raw_url[header_info.data_offset] == ',') {
0194             data_begin_reached = true;
0195         }
0196         header_info.data_offset++; // jump over separator token
0197     } /*wend*/
0198
0199     return header_info;
0200 }
0201
0202 DataProtocol::DataProtocol()
0203 {
0204 }
0205
0206 DataProtocol::~DataProtocol() = default;
0207
0208 void DataProtocol::get(const QUrl &url)
0209 {
0210     ref();
0211     // qDebug() << this;
0212
0213     const DataHeader hdr = parseDataHeader(url, false);
0214
0215     const int size = hdr.url.length();
0216     const int data_ofs = qMin(hdr.data_offset, size);
0217     // FIXME: string is copied, would be nice if we could have a reference only
0218     const QByteArray url_data = hdr.url.mid(data_ofs);
0219     QByteArray outData;
0220
0221     if (hdr.is_base64) {
0222         // base64 stuff is expected to contain the correct charset, so we just
0223         // decode it and pass it to the receiver
0224         outData = QByteArray::fromBase64(url_data);
0225     } else {
0226         QTextCodec *codec = QTextCodec::codecForName(hdr.attributes[QStringLiteral("charset")].toLatin1());
0227         if (codec != nullptr) {
0228             outData = codec->toUnicode(url_data).toUtf8();
0229         } else {
0230             outData = url_data;
0231         } /*end if*/
0232     } /*end if*/
0233
0234     // qDebug() << "emit mimeType@"<<this;
0235     Q_EMIT mimeType(hdr.mime_type);
0236     // qDebug() << "emit totalSize@"<<this;
0237     Q_EMIT totalSize(outData.size());
0238
0239     // qDebug() << "emit setMetaData@"<<this;
0240     setAllMetaData(hdr.attributes);
0241
0242     // qDebug() << "emit sendMetaData@"<<this;
0243     sendMetaData();
0244     // qDebug() << "(1) queue size " << dispatchQueue.size();
0245     // empiric studies have shown that this shouldn't be queued & dispatched
0246     Q_EMIT data(outData);
0247     // qDebug() << "(2) queue size " << dispatchQueue.size();
0248     dispatch_data(QByteArray{});
0249     // qDebug() << "(3) queue size " << dispatchQueue.size();
0250     dispatch_finished();
0251     // qDebug() << "(4) queue size " << dispatchQueue.size();
0252     deref();
0253 }
0254
0255 /* --------------------------------------------------------------------- */
0256
0257 void DataProtocol::mimetype(const QUrl &url)
0258 {
0259     ref();
0260     Q_EMIT mimeType(parseDataHeader(url, true).mime_type);
0261     Q_EMIT finished();
0262     deref();
0263 }
0264
0265 /* --------------------------------------------------------------------- */
0266
0267 #if !defined(TESTKIO)
0268 #include "moc_dataprotocol_p.cpp"
0269 #endif