src/core/dataprotocol.cpp

0001 /*
0002     Implementation of the data protocol (rfc 2397)
0003
0004     SPDX-FileCopyrightText: 2002, 2003 Leo Savernik <l.savernik@aon.at>
0005
0006     SPDX-License-Identifier: LGPL-2.0-only
0007 */
0008
0009 #include "dataprotocol_p.h"
0010
0011 #include "global.h"
0012 #include "metadata.h"
0013
0014 #include <QByteArray>
0015 #include <QStringDecoder>
0016
0017 using namespace KIO;
0018
0019 /** structure containing header information */
0020 struct DataHeader {
0021     QString mime_type; // MIME type of content (lowercase)
0022     MetaData attributes; // attribute/value pairs (attribute lowercase,
0023     //  value unchanged)
0024     bool is_base64; // true if data is base64 encoded
0025     QByteArray url; // reference to decoded url
0026     int data_offset; // zero-indexed position within url
0027     // where the real data begins. May point beyond
0028     // the end to indicate that there is no data
0029 };
0030
0031 /** returns the position of the first occurrence of any of the given
0032  * characters @p c1 or comma (',') or semicolon (';') or buf.length()
0033  * if none is contained.
0034  *
0035  * @param buf buffer where to look for c
0036  * @param begin zero-indexed starting position
0037  * @param c1 character to find or '\0' to ignore
0038  */
0039 static int find(const QByteArray &buf, int begin, const char c1)
0040 {
0041     static const char comma = ',';
0042     static const char semicolon = ';';
0043     int pos = begin;
0044     int size = buf.length();
0045     while (pos < size) {
0046         const char ch = buf[pos];
0047         if (ch == comma || ch == semicolon || (c1 != '\0' && ch == c1)) {
0048             break;
0049         }
0050         pos++;
0051     } /*wend*/
0052     return pos;
0053 }
0054
0055 /** extracts the string between the current position @p pos and the first
0056  * occurrence of either @p c1 or comma (',') or semicolon (';') exclusively
0057  * and updates @p pos to point at the found delimiter or at the end of the
0058  * buffer if neither character occurred.
0059  * @param buf buffer where to look for
0060  * @param pos zero-indexed position within buffer
0061  * @param c1 character to find or '\0' to ignore
0062  */
0063 static inline QString extract(const QByteArray &buf, int &pos, const char c1 = '\0')
0064 {
0065     int oldpos = pos;
0066     pos = find(buf, oldpos, c1);
0067     return QString::fromLatin1(buf.mid(oldpos, pos - oldpos));
0068 }
0069
0070 /** ignores all whitespaces
0071  * @param buf buffer to operate on
0072  * @param pos position to shift to first non-whitespace character
0073  *  Upon return @p pos will either point to the first non-whitespace
0074  *  character or to the end of the buffer.
0075  */
0076 static inline void ignoreWS(const QByteArray &buf, int &pos)
0077 {
0078     int size = buf.length();
0079     while (pos < size && (buf[pos] == ' ' || buf[pos] == '\t')) {
0080         ++pos;
0081     }
0082 }
0083
0084 /** parses a quoted string as per rfc 822.
0085  *
0086  * If trailing quote is missing, the whole rest of the buffer is returned.
0087  * @param buf buffer to operate on
0088  * @param pos position pointing to the leading quote
0089  * @return the extracted string. @p pos will be updated to point to the
0090  *  character following the trailing quote.
0091  */
0092 static QString parseQuotedString(const QByteArray &buf, int &pos)
0093 {
0094     int size = buf.length();
0095     QString res;
0096     res.reserve(size); // can't be larger than buf
0097     pos++; // jump over leading quote
0098     bool escaped = false; // if true means next character is literal
0099     bool parsing = true; // true as long as end quote not found
0100     while (parsing && pos < size) {
0101         const QChar ch = QLatin1Char(buf[pos++]);
0102         if (escaped) {
0103             res += ch;
0104             escaped = false;
0105         } else {
0106             switch (ch.unicode()) {
0107             case '"':
0108                 parsing = false;
0109                 break;
0110             case '\\':
0111                 escaped = true;
0112                 break;
0113             default:
0114                 res += ch;
0115                 break;
0116             } /*end switch*/
0117         } /*end if*/
0118     } /*wend*/
0119     res.squeeze();
0120     return res;
0121 }
0122
0123 /** parses the header of a data url
0124  * @param url the data url
0125  * @param mimeOnly if the only interesting information is the MIME type
0126  * @return DataHeader structure with the header information
0127  */
0128 static DataHeader parseDataHeader(const QUrl &url, const bool mimeOnly)
0129 {
0130     DataHeader header_info;
0131
0132     // initialize header info members
0133     header_info.mime_type = QStringLiteral("text/plain");
0134     header_info.attributes.insert(QStringLiteral("charset"), QStringLiteral("us-ascii"));
0135     header_info.is_base64 = false;
0136
0137     // decode url and save it
0138     const QByteArray &raw_url = header_info.url = QByteArray::fromPercentEncoding(url.path(QUrl::FullyEncoded).toLatin1());
0139     const int raw_url_len = raw_url.length();
0140
0141     header_info.data_offset = 0;
0142
0143     // read MIME type
0144     if (raw_url_len == 0) {
0145         return header_info;
0146     }
0147     const QString mime_type = extract(raw_url, header_info.data_offset).trimmed();
0148     if (!mime_type.isEmpty()) {
0149         header_info.mime_type = mime_type;
0150     }
0151     if (mimeOnly) {
0152         return header_info;
0153     }
0154
0155     if (header_info.data_offset >= raw_url_len) {
0156         return header_info;
0157     }
0158     // jump over delimiter token and return if data reached
0159     if (raw_url[header_info.data_offset++] == ',') {
0160         return header_info;
0161     }
0162
0163     // read all attributes and store them
0164     bool data_begin_reached = false;
0165     while (!data_begin_reached && header_info.data_offset < raw_url_len) {
0166         // read attribute
0167         const QString attribute = extract(raw_url, header_info.data_offset, '=').trimmed();
0168         if (header_info.data_offset >= raw_url_len || raw_url[header_info.data_offset] != '=') {
0169             // no assignment, must be base64 option
0170             if (attribute == QLatin1String("base64")) {
0171                 header_info.is_base64 = true;
0172             }
0173         } else {
0174             header_info.data_offset++; // jump over '=' token
0175
0176             // read value
0177             ignoreWS(raw_url, header_info.data_offset);
0178             if (header_info.data_offset >= raw_url_len) {
0179                 return header_info;
0180             }
0181
0182             QString value;
0183             if (raw_url[header_info.data_offset] == '"') {
0184                 value = parseQuotedString(raw_url, header_info.data_offset);
0185                 ignoreWS(raw_url, header_info.data_offset);
0186             } else {
0187                 value = extract(raw_url, header_info.data_offset).trimmed();
0188             }
0189
0190             // add attribute to map
0191             header_info.attributes[attribute.toLower()] = value;
0192
0193         } /*end if*/
0194         if (header_info.data_offset < raw_url_len && raw_url[header_info.data_offset] == ',') {
0195             data_begin_reached = true;
0196         }
0197         header_info.data_offset++; // jump over separator token
0198     } /*wend*/
0199
0200     return header_info;
0201 }
0202
0203 DataProtocol::DataProtocol()
0204 {
0205 }
0206
0207 DataProtocol::~DataProtocol() = default;
0208
0209 void DataProtocol::get(const QUrl &url)
0210 {
0211     ref();
0212     // qDebug() << this;
0213
0214     const DataHeader hdr = parseDataHeader(url, false);
0215
0216     const int size = hdr.url.length();
0217     const int data_ofs = qMin(hdr.data_offset, size);
0218     // FIXME: string is copied, would be nice if we could have a reference only
0219     const QByteArray url_data = hdr.url.mid(data_ofs);
0220     QByteArray outData;
0221
0222     if (hdr.is_base64) {
0223         // base64 stuff is expected to contain the correct charset, so we just
0224         // decode it and pass it to the receiver
0225         outData = QByteArray::fromBase64(url_data);
0226     } else {
0227         QStringDecoder codec(hdr.attributes[QStringLiteral("charset")].toLatin1().constData());
0228         if (codec.isValid()) {
0229             outData = QString(codec.decode(url_data)).toUtf8();
0230         } else {
0231             outData = url_data;
0232         } /*end if*/
0233     } /*end if*/
0234
0235     // qDebug() << "emit mimeType@"<<this;
0236     Q_EMIT mimeType(hdr.mime_type);
0237     // qDebug() << "emit totalSize@"<<this;
0238     Q_EMIT totalSize(outData.size());
0239
0240     // qDebug() << "emit setMetaData@"<<this;
0241     setAllMetaData(hdr.attributes);
0242
0243     // qDebug() << "emit sendMetaData@"<<this;
0244     sendMetaData();
0245     // qDebug() << "(1) queue size " << dispatchQueue.size();
0246     // empiric studies have shown that this shouldn't be queued & dispatched
0247     Q_EMIT data(outData);
0248     // qDebug() << "(2) queue size " << dispatchQueue.size();
0249     dispatch_data(QByteArray{});
0250     // qDebug() << "(3) queue size " << dispatchQueue.size();
0251     dispatch_finished();
0252     // qDebug() << "(4) queue size " << dispatchQueue.size();
0253     deref();
0254 }
0255
0256 /* --------------------------------------------------------------------- */
0257
0258 void DataProtocol::mimetype(const QUrl &url)
0259 {
0260     ref();
0261     Q_EMIT mimeType(parseDataHeader(url, true).mime_type);
0262     Q_EMIT finished();
0263     deref();
0264 }
0265
0266 /* --------------------------------------------------------------------- */
0267
0268 #if !defined(TESTKIO)
0269 #include "moc_dataprotocol_p.cpp"
0270 #endif