kmime/src/kmime_parsers.cpp

0001 /*
0002     kmime_parsers.cpp
0003
0004     KMime, the KDE Internet mail/usenet news message library.
0005     SPDX-FileCopyrightText: 2001 the KMime authors.
0006     See file AUTHORS for details
0007
0008     SPDX-License-Identifier: LGPL-2.0-or-later
0009 */
0010 #include "kmime_parsers_p.h"
0011
0012 #include <QRegularExpression>
0013
0014 using namespace KMime::Parser;
0015
0016 namespace KMime
0017 {
0018 namespace Parser
0019 {
0020
0021 MultiPart::MultiPart(const QByteArray &src, const QByteArray &boundary)
0022     : m_src(src)
0023     , m_boundary(boundary)
0024 {
0025 }
0026
0027 bool MultiPart::parse()
0028 {
0029     QByteArray b = "--" + m_boundary;
0030     QByteArray part;
0031     int pos1 = 0;
0032     int pos2 = 0;
0033     int blen = b.length();
0034
0035     m_parts.clear();
0036
0037     //find the first valid boundary
0038     while (true) {
0039         if ((pos1 = m_src.indexOf(b, pos1)) == -1 || pos1 == 0 ||
0040                 m_src[pos1 - 1] == '\n') { //valid boundary found or no boundary at all
0041             break;
0042         }
0043         pos1 += blen; //boundary found but not valid => skip it;
0044     }
0045
0046     if (pos1 > -1) {
0047         pos1 += blen;
0048         if (m_src[pos1] == '-' && m_src[pos1 + 1] == '-') {
0049             // the only valid boundary is the end-boundary
0050             // this message is *really* broken
0051             pos1 = -1; //we give up
0052         } else if ((pos1 - blen) > 1) {     //preamble present
0053             m_preamble = m_src.left(pos1 - blen - 1);
0054         }
0055     }
0056
0057     while (pos1 > -1 && pos2 > -1) {
0058
0059         //skip the rest of the line for the first boundary - the message-part starts here
0060         if ((pos1 = m_src.indexOf('\n', pos1)) > -1) {
0061             //now search the next linebreak
0062             //now find the next valid boundary
0063             pos2 = ++pos1; //pos1 and pos2 point now to the beginning of the next line after the boundary
0064             while (true) {
0065                 if ((pos2 = m_src.indexOf(b, pos2)) == -1 ||
0066                         m_src[pos2 - 1] == '\n') { //valid boundary or no more boundaries found
0067                     break;
0068                 }
0069                 pos2 += blen; //boundary is invalid => skip it;
0070             }
0071
0072             if (pos2 == -1) {   // no more boundaries found
0073                 part = m_src.mid(pos1, m_src.length() - pos1);   //take the rest of the string
0074                 m_parts.append(part);
0075                 pos1 = -1;
0076                 pos2 = -1; //break;
0077             } else {
0078                 part = m_src.mid(pos1, pos2 - pos1 - 1);   // pos2 - 1 (\n) is part of the boundary (see RFC 2046, section 5.1.1)
0079                 m_parts.append(part);
0080                 pos2 += blen; //pos2 points now to the first character after the boundary
0081                 if (m_src[pos2] == '-' && m_src[pos2 + 1] == '-') { //end-boundary
0082                     pos1 = pos2 + 2; //pos1 points now to the character directly after the end-boundary
0083
0084                     if ((pos1 = m_src.indexOf('\n', pos1)) > -1) {       //skip the rest of this line
0085                         //everything after the end-boundary is considered as the epilouge
0086                         m_epilouge = m_src.mid(pos1 + 1, m_src.length() - pos1 - 1);
0087                     }
0088                     pos1 = -1;
0089                     pos2 = -1; //break
0090                 } else {
0091                     pos1 = pos2; //the search continues ...
0092                 }
0093             }
0094         }
0095     }
0096
0097     return !m_parts.isEmpty();
0098 }
0099
0100 //=============================================================================
0101
0102 NonMimeParser::NonMimeParser(const QByteArray &src) :
0103     m_src(src), m_partNr(-1), m_totalNr(-1)
0104 {
0105 }
0106
0107 NonMimeParser::~NonMimeParser() = default;
0108
0109 /**
0110  * try to guess the mimetype from the file-extension
0111  */
0112
0113 QByteArray NonMimeParser::guessMimeType(const QByteArray &fileName)
0114 {
0115     QByteArray tmp;
0116     QByteArray mimeType;
0117
0118     if (!fileName.isEmpty()) {
0119         int pos = fileName.lastIndexOf('.');
0120         if (pos++ != -1) {
0121             tmp = fileName.mid(pos, fileName.length() - pos).toUpper();
0122             if (tmp == "JPG" || tmp == "JPEG") {
0123                 mimeType = QByteArrayLiteral("image/jpeg");
0124             } else if (tmp == "GIF") {
0125                 mimeType = QByteArrayLiteral("image/gif");
0126             } else if (tmp == "PNG") {
0127                 mimeType = QByteArrayLiteral("image/png");
0128             } else if (tmp == "TIFF" || tmp == "TIF") {
0129                 mimeType = QByteArrayLiteral("image/tiff");
0130             } else if (tmp == "XPM") {
0131                 mimeType = QByteArrayLiteral("image/x-xpixmap");
0132             } else if (tmp == "XBM") {
0133                 mimeType = QByteArrayLiteral("image/x-xbitmap");
0134             } else if (tmp == "BMP") {
0135                 mimeType = QByteArrayLiteral("image/bmp");
0136             } else if (tmp == "TXT" ||
0137                        tmp == "ASC" ||
0138                        tmp == "H" ||
0139                        tmp == "C" ||
0140                        tmp == "CC" ||
0141                        tmp == "CPP") {
0142                 mimeType = QByteArrayLiteral("text/plain");
0143             } else if (tmp == "HTML" || tmp == "HTM") {
0144                 mimeType = QByteArrayLiteral("text/html");
0145             } else {
0146                 mimeType = QByteArrayLiteral("application/octet-stream");
0147             }
0148         } else {
0149             mimeType = QByteArrayLiteral("application/octet-stream");
0150         }
0151     } else {
0152         mimeType = QByteArrayLiteral("application/octet-stream");
0153     }
0154
0155     return mimeType;
0156 }
0157
0158 //==============================================================================
0159
0160 UUEncoded::UUEncoded(const QByteArray &src, const QByteArray &subject) :
0161     NonMimeParser(src), m_subject(subject)
0162 {}
0163
0164 bool UUEncoded::parse()
0165 {
0166     int currentPos = 0;
0167     bool success = true;
0168     bool firstIteration = true;
0169
0170     const auto srcStr = QString::fromLatin1(m_src);
0171     const QRegularExpression beginRegex(QStringLiteral("begin [0-9][0-9][0-9]"));
0172     const QRegularExpression subjectRegex(QStringLiteral("[0-9]+/[0-9]+"));
0173
0174     while (success) {
0175         int beginPos = currentPos;
0176         int uuStart = currentPos;
0177         int endPos = 0;
0178         int lineCount = 0;
0179         int MCount = 0;
0180         int pos = 0;
0181         int len = 0;
0182         bool containsBegin = false;
0183         bool containsEnd = false;
0184         QByteArray tmp;
0185         QByteArray fileName;
0186
0187         if ((beginPos = srcStr.indexOf(beginRegex, currentPos)) > -1 &&
0188                 (beginPos == 0 || m_src.at(beginPos - 1) == '\n')) {
0189             containsBegin = true;
0190             uuStart = m_src.indexOf('\n', beginPos);
0191             if (uuStart == -1) {  //no more line breaks found, we give up
0192                 success = false;
0193                 break;
0194             } else {
0195                 uuStart++; //points now at the beginning of the next line
0196             }
0197         } else {
0198             beginPos = currentPos;
0199         }
0200
0201         if ((endPos = m_src.indexOf("\nend", (uuStart > 0) ? uuStart - 1 : 0)) == -1) {
0202             endPos = m_src.length(); //no end found
0203         } else {
0204             containsEnd = true;
0205         }
0206
0207         if ((containsBegin && containsEnd) || firstIteration) {
0208
0209             //printf("beginPos=%d , uuStart=%d , endPos=%d\n", beginPos, uuStart, endPos);
0210             //all lines in a uuencoded text start with 'M'
0211             for (int idx = uuStart; idx < endPos; idx++) {
0212                 if (m_src[idx] == '\n') {
0213                     lineCount++;
0214                     if (idx + 1 < endPos && m_src[idx + 1] == 'M') {
0215                         idx++;
0216                         MCount++;
0217                     }
0218                 }
0219             }
0220
0221             //printf("lineCount=%d , MCount=%d\n", lineCount, MCount);
0222             if (MCount == 0 || (lineCount - MCount) > 10 ||
0223                     ((!containsBegin || !containsEnd) && (MCount < 15))) {
0224                 // harder check for split-articles
0225                 success = false;
0226                 break; //too many "non-M-Lines" found, we give up
0227             }
0228
0229             if ((!containsBegin || !containsEnd) && !m_subject.isNull()) {
0230                 // message may be split up => parse subject
0231                 const auto match = subjectRegex.match(QLatin1String(m_subject));
0232                 pos = match.capturedStart(0);
0233                 len = match.capturedLength(0);
0234                 if (pos != -1) {
0235                     tmp = m_subject.mid(pos, len);
0236                     pos = tmp.indexOf('/');
0237                     m_partNr = tmp.left(pos).toInt();
0238                     m_totalNr = tmp.right(tmp.length() - pos - 1).toInt();
0239                 } else {
0240                     success = false;
0241                     break; //no "part-numbers" found in the subject, we give up
0242                 }
0243             }
0244
0245             //everything before "begin" is text
0246             if (beginPos > 0) {
0247                 m_text.append(m_src.mid(currentPos, beginPos - currentPos));
0248             }
0249
0250             if (containsBegin) {
0251                 //everything between "begin ### " and the next LF is considered as the filename
0252                 fileName = m_src.mid(beginPos + 10, uuStart - beginPos - 11);
0253             } else {
0254                 fileName = "";
0255             }
0256             m_filenames.append(fileName);
0257             //everything between "begin" and "end" is uuencoded
0258             m_bins.append(m_src.mid(uuStart, endPos - uuStart + 1));
0259             m_mimeTypes.append(guessMimeType(fileName));
0260             firstIteration = false;
0261
0262             int next = m_src.indexOf('\n', endPos + 1);
0263             if (next == -1) {   //no more line breaks found, we give up
0264                 success = false;
0265                 break;
0266             } else {
0267                 next++; //points now at the beginning of the next line
0268             }
0269             currentPos = next;
0270
0271         } else {
0272             success = false;
0273         }
0274     }
0275
0276     // append trailing text part of the article
0277     m_text.append(m_src.right(m_src.length() - currentPos));
0278
0279     return ((!m_bins.isEmpty()) || isPartial());
0280 }
0281
0282 //==============================================================================
0283
0284 YENCEncoded::YENCEncoded(const QByteArray &src) :
0285     NonMimeParser(src)
0286 {
0287 }
0288
0289 bool YENCEncoded::yencMeta(QByteArray &src, const QByteArray &name, int *value)
0290 {
0291     bool found = false;
0292     QByteArray sought = name + '=';
0293
0294     int iPos = src.indexOf(sought);
0295     if (iPos > -1) {
0296         int pos1 = src.indexOf(' ', iPos);
0297         int pos2 = src.indexOf('\r', iPos);
0298         int pos3 = src.indexOf('\t', iPos);
0299         int pos4 = src.indexOf('\n', iPos);
0300         if (pos2 >= 0 && (pos1 < 0 || pos1 > pos2)) {
0301             pos1 = pos2;
0302         }
0303         if (pos3 >= 0 && (pos1 < 0 || pos1 > pos3)) {
0304             pos1 = pos3;
0305         }
0306         if (pos4 >= 0 && (pos1 < 0 || pos1 > pos4)) {
0307             pos1 = pos4;
0308         }
0309         iPos = src.lastIndexOf('=', pos1) + 1;
0310         if (iPos < pos1) {
0311             char c = src.at(iPos);
0312             if (c >= '0' && c <= '9') {
0313                 found = true;
0314                 *value = src.mid(iPos, pos1 - iPos).toInt();
0315             }
0316         }
0317     }
0318     return found;
0319 }
0320
0321 bool YENCEncoded::parse()
0322 {
0323     int currentPos = 0;
0324     bool success = true;
0325     while (success) {
0326         int beginPos = currentPos;
0327         int yencStart = currentPos;
0328         bool containsPart = false;
0329         QByteArray fileName;
0330
0331         if ((beginPos = m_src.indexOf("=ybegin ", currentPos)) > -1 &&
0332                 (beginPos == 0 || m_src.at(beginPos - 1) == '\n')) {
0333             yencStart = m_src.indexOf('\n', beginPos);
0334             if (yencStart == -1) {   // no more line breaks found, give up
0335                 success = false;
0336                 break;
0337             } else {
0338                 yencStart++;
0339                 if (m_src.indexOf("=ypart", yencStart) == yencStart) {
0340                     containsPart = true;
0341                     yencStart = m_src.indexOf('\n', yencStart);
0342                     if (yencStart == -1) {
0343                         success = false;
0344                         break;
0345                     }
0346                     yencStart++;
0347                 }
0348             }
0349             // Try to identify yenc meta data
0350
0351             // Filenames can contain any embedded chars until end of line
0352             QByteArray meta = m_src.mid(beginPos, yencStart - beginPos);
0353             int namePos = meta.indexOf("name=");
0354             if (namePos == -1) {
0355                 success = false;
0356                 break;
0357             }
0358             int eolPos = meta.indexOf('\r', namePos);
0359             if (eolPos == -1) {
0360                 eolPos = meta.indexOf('\n', namePos);
0361             }
0362             if (eolPos == -1) {
0363                 success = false;
0364                 break;
0365             }
0366             fileName = meta.mid(namePos + 5, eolPos - (namePos + 5));
0367
0368             // Other metadata is integer
0369             int yencLine;
0370             if (!yencMeta(meta, "line", &yencLine)) {
0371                 success = false;
0372                 break;
0373             }
0374             int yencSize;
0375             if (!yencMeta(meta, "size", &yencSize)) {
0376                 success = false;
0377                 break;
0378             }
0379
0380             int partBegin;
0381             int partEnd;
0382             if (containsPart) {
0383                 if (!yencMeta(meta, "part", &m_partNr)) {
0384                     success = false;
0385                     break;
0386                 }
0387                 if (!yencMeta(meta, "begin", &partBegin) ||
0388                         !yencMeta(meta, "end", &partEnd)) {
0389                     success = false;
0390                     break;
0391                 }
0392                 if (!yencMeta(meta, "total", &m_totalNr)) {
0393                     m_totalNr = m_partNr + 1;
0394                 }
0395                 if (yencSize == partEnd - partBegin + 1) {
0396                     m_totalNr = 1;
0397                 } else {
0398                     yencSize = partEnd - partBegin + 1;
0399                 }
0400             }
0401
0402             // We have a valid yenc header; now we extract the binary data
0403             int totalSize = 0;
0404             int pos = yencStart;
0405             int len = m_src.length();
0406             bool lineStart = true;
0407             int lineLength = 0;
0408             bool containsEnd = false;
0409             QByteArray binary;
0410             binary.resize(yencSize);
0411             while (pos < len) {
0412                 int ch = m_src.at(pos);
0413                 if (ch < 0) {
0414                     ch += 256;
0415                 }
0416                 if (ch == '\r') {
0417                     if (lineLength != yencLine && totalSize != yencSize) {
0418                         break;
0419                     }
0420                     pos++;
0421                 } else if (ch == '\n') {
0422                     lineStart = true;
0423                     lineLength = 0;
0424                     pos++;
0425                 } else {
0426                     if (ch == '=') {
0427                         if (pos + 1 < len) {
0428                             ch = m_src.at(pos + 1);
0429                             if (lineStart && ch == 'y') {
0430                                 containsEnd = true;
0431                                 break;
0432                             }
0433                             pos += 2;
0434                             ch -= 64 + 42;
0435                             if (ch < 0) {
0436                                 ch += 256;
0437                             }
0438                             if (totalSize >= yencSize) {
0439                                 break;
0440                             }
0441                             binary[totalSize++] = ch;
0442                             lineLength++;
0443                         } else {
0444                             break;
0445                         }
0446                     } else {
0447                         ch -= 42;
0448                         if (ch < 0) {
0449                             ch += 256;
0450                         }
0451                         if (totalSize >= yencSize) {
0452                             break;
0453                         }
0454                         binary[totalSize++] = ch;
0455                         lineLength++;
0456                         pos++;
0457                     }
0458                     lineStart = false;
0459                 }
0460             }
0461
0462             if (!containsEnd) {
0463                 success = false;
0464                 break;
0465             }
0466             if (totalSize != yencSize) {
0467                 success = false;
0468                 break;
0469             }
0470
0471             // pos now points to =yend; get end data
0472             eolPos = m_src.indexOf('\n', pos);
0473             if (eolPos == -1) {
0474                 success = false;
0475                 break;
0476             }
0477             meta = m_src.mid(pos, eolPos - pos);
0478             if (!yencMeta(meta, "size", &totalSize)) {
0479                 success = false;
0480                 break;
0481             }
0482             if (totalSize != yencSize) {
0483                 success = false;
0484                 break;
0485             }
0486
0487             m_filenames.append(fileName);
0488             m_mimeTypes.append(guessMimeType(fileName));
0489             m_bins.append(binary);
0490
0491             //everything before "begin" is text
0492             if (beginPos > 0) {
0493                 m_text.append(m_src.mid(currentPos, beginPos - currentPos));
0494             }
0495             currentPos = eolPos + 1;
0496
0497         } else {
0498             success = false;
0499         }
0500     }
0501
0502     // append trailing text part of the article
0503     m_text.append(m_src.right(m_src.length() - currentPos));
0504
0505     return !m_bins.isEmpty();
0506 }
0507
0508 } // namespace Parser
0509
0510 } // namespace KMime