kmime/src/kmime_parsers.cpp

0001 /*
0002     kmime_parsers.cpp
0003
0004     KMime, the KDE Internet mail/usenet news message library.
0005     SPDX-FileCopyrightText: 2001 the KMime authors.
0006     See file AUTHORS for details
0007
0008     SPDX-License-Identifier: LGPL-2.0-or-later
0009 */
0010 #include "kmime_parsers_p.h"
0011
0012 #include <QRegularExpression>
0013
0014 using namespace KMime::Parser;
0015
0016 namespace KMime
0017 {
0018 namespace Parser
0019 {
0020
0021 MultiPart::MultiPart(const QByteArray &src, const QByteArray &boundary)
0022     : m_src(src)
0023     , m_boundary(boundary)
0024 {
0025 }
0026
0027 bool MultiPart::parse()
0028 {
0029     QByteArray b = "--" + m_boundary;
0030     QByteArray part;
0031     int pos1 = 0;
0032     int pos2 = 0;
0033     int blen = b.length();
0034
0035     m_parts.clear();
0036
0037     //find the first valid boundary
0038     while (true) {
0039         if ((pos1 = m_src.indexOf(b, pos1)) == -1 || pos1 == 0 ||
0040                 m_src[pos1 - 1] == '\n') { //valid boundary found or no boundary at all
0041             break;
0042         }
0043         pos1 += blen; //boundary found but not valid => skip it;
0044     }
0045
0046     if (pos1 > -1) {
0047         pos1 += blen;
0048         if (m_src[pos1] == '-' && m_src[pos1 + 1] == '-') {
0049             // the only valid boundary is the end-boundary
0050             // this message is *really* broken
0051             pos1 = -1; //we give up
0052         } else if ((pos1 - blen) > 1) {     //preamble present
0053             m_preamble = m_src.left(pos1 - blen - 1);
0054         }
0055     }
0056
0057     while (pos1 > -1 && pos2 > -1) {
0058
0059         //skip the rest of the line for the first boundary - the message-part starts here
0060         if ((pos1 = m_src.indexOf('\n', pos1)) > -1) {
0061             //now search the next linebreak
0062             //now find the next valid boundary
0063             pos2 = ++pos1; //pos1 and pos2 point now to the beginning of the next line after the boundary
0064             while (true) {
0065                 if ((pos2 = m_src.indexOf(b, pos2)) == -1 ||
0066                         m_src[pos2 - 1] == '\n') { //valid boundary or no more boundaries found
0067                     break;
0068                 }
0069                 pos2 += blen; //boundary is invalid => skip it;
0070             }
0071
0072             if (pos2 == -1) {   // no more boundaries found
0073                 part = m_src.mid(pos1, m_src.length() - pos1);   //take the rest of the string
0074                 m_parts.append(part);
0075                 pos1 = -1;
0076                 pos2 = -1; //break;
0077             } else {
0078                 part = m_src.mid(pos1, pos2 - pos1 - 1);   // pos2 - 1 (\n) is part of the boundary (see RFC 2046, section 5.1.1)
0079                 m_parts.append(part);
0080                 pos2 += blen; //pos2 points now to the first character after the boundary
0081                 if (m_src[pos2] == '-' && m_src[pos2 + 1] == '-') { //end-boundary
0082                     pos1 = pos2 + 2; //pos1 points now to the character directly after the end-boundary
0083
0084                     if ((pos1 = m_src.indexOf('\n', pos1)) > -1) {       //skip the rest of this line
0085                         //everything after the end-boundary is considered as the epilouge
0086                         m_epilouge = m_src.mid(pos1 + 1, m_src.length() - pos1 - 1);
0087                     }
0088                     pos1 = -1;
0089                     pos2 = -1; //break
0090                 } else {
0091                     pos1 = pos2; //the search continues ...
0092                 }
0093             }
0094         }
0095     }
0096
0097     return !m_parts.isEmpty();
0098 }
0099
0100 //=============================================================================
0101
0102 NonMimeParser::NonMimeParser(const QByteArray &src) :
0103     m_src(src), m_partNr(-1), m_totalNr(-1)
0104 {
0105 }
0106
0107 NonMimeParser::~NonMimeParser() = default;
0108
0109 /**
0110  * try to guess the mimetype from the file-extension
0111  */
0112
0113 QByteArray NonMimeParser::guessMimeType(const QByteArray &fileName)
0114 {
0115     QByteArray tmp;
0116     QByteArray mimeType;
0117
0118     if (!fileName.isEmpty()) {
0119         int pos = fileName.lastIndexOf('.');
0120         if (pos++ != -1) {
0121             tmp = fileName.mid(pos, fileName.length() - pos).toUpper();
0122             if (tmp == "JPG" || tmp == "JPEG") {
0123                 mimeType = QByteArrayLiteral("image/jpeg");
0124             } else if (tmp == "GIF") {
0125                 mimeType = QByteArrayLiteral("image/gif");
0126             } else if (tmp == "PNG") {
0127                 mimeType = QByteArrayLiteral("image/png");
0128             } else if (tmp == "TIFF" || tmp == "TIF") {
0129                 mimeType = QByteArrayLiteral("image/tiff");
0130             } else if (tmp == "XPM") {
0131                 mimeType = QByteArrayLiteral("image/x-xpixmap");
0132             } else if (tmp == "XBM") {
0133                 mimeType = QByteArrayLiteral("image/x-xbitmap");
0134             } else if (tmp == "BMP") {
0135                 mimeType = QByteArrayLiteral("image/bmp");
0136             } else if (tmp == "TXT" ||
0137                        tmp == "ASC" ||
0138                        tmp == "H" ||
0139                        tmp == "C" ||
0140                        tmp == "CC" ||
0141                        tmp == "CPP") {
0142                 mimeType = QByteArrayLiteral("text/plain");
0143             } else if (tmp == "HTML" || tmp == "HTM") {
0144                 mimeType = QByteArrayLiteral("text/html");
0145             } else {
0146                 mimeType = QByteArrayLiteral("application/octet-stream");
0147             }
0148         } else {
0149             mimeType = QByteArrayLiteral("application/octet-stream");
0150         }
0151     } else {
0152         mimeType = QByteArrayLiteral("application/octet-stream");
0153     }
0154
0155     return mimeType;
0156 }
0157
0158 //==============================================================================
0159
0160 UUEncoded::UUEncoded(const QByteArray &src, const QByteArray &subject) :
0161     NonMimeParser(src), m_subject(subject)
0162 {}
0163
0164 bool UUEncoded::parse()
0165 {
0166     int currentPos = 0;
0167     bool success = true;
0168     bool firstIteration = true;
0169
0170     const auto srcStr = QString::fromLatin1(m_src);
0171     const QRegularExpression beginRegex(QStringLiteral("begin [0-9][0-9][0-9]"));
0172     const QRegularExpression subjectRegex(QStringLiteral("[0-9]+/[0-9]+"));
0173
0174     while (success) {
0175         int beginPos = currentPos;
0176         int uuStart = currentPos;
0177         int endPos = 0;
0178         int lineCount = 0;
0179         int MCount = 0;
0180         int pos = 0;
0181         int len = 0;
0182         bool containsBegin = false;
0183         bool containsEnd = false;
0184         QByteArray tmp;
0185         QByteArray fileName;
0186
0187         if ((beginPos = srcStr.indexOf(beginRegex, currentPos)) > -1 &&
0188                 (beginPos == 0 || m_src.at(beginPos - 1) == '\n')) {
0189             containsBegin = true;
0190             uuStart = m_src.indexOf('\n', beginPos);
0191             if (uuStart == -1) {  //no more line breaks found, we give up
0192                 success = false;
0193                 break;
0194             } else {
0195                 uuStart++; //points now at the beginning of the next line
0196             }
0197         } else {
0198             beginPos = currentPos;
0199         }
0200
0201         if ((endPos = m_src.indexOf("\nend", (uuStart > 0) ? uuStart - 1 : 0)) == -1) {
0202             endPos = m_src.length(); //no end found
0203         } else {
0204             containsEnd = true;
0205         }
0206
0207         if ((containsBegin && containsEnd) || firstIteration) {
0208
0209             //printf("beginPos=%d , uuStart=%d , endPos=%d\n", beginPos, uuStart, endPos);
0210             //all lines in a uuencoded text start with 'M'
0211             for (int idx = uuStart; idx < endPos; idx++) {
0212                 if (m_src[idx] == '\n') {
0213                     lineCount++;
0214                     if (idx + 1 < endPos && m_src[idx + 1] == 'M') {
0215                         idx++;
0216                         MCount++;
0217                     }
0218                 }
0219             }
0220
0221             //printf("lineCount=%d , MCount=%d\n", lineCount, MCount);
0222             if (MCount == 0 || (lineCount - MCount) > 10 ||
0223                     ((!containsBegin || !containsEnd) && (MCount < 15))) {
0224                 // harder check for split-articles
0225                 success = false;
0226                 break; //too many "non-M-Lines" found, we give up
0227             }
0228
0229             if ((!containsBegin || !containsEnd) && !m_subject.isNull()) {
0230                 // message may be split up => parse subject
0231                 const auto match =
0232                     subjectRegex.match(QLatin1StringView(m_subject));
0233                 pos = match.capturedStart(0);
0234                 len = match.capturedLength(0);
0235                 if (pos != -1) {
0236                     tmp = m_subject.mid(pos, len);
0237                     pos = tmp.indexOf('/');
0238                     m_partNr = tmp.left(pos).toInt();
0239                     m_totalNr = tmp.right(tmp.length() - pos - 1).toInt();
0240                 } else {
0241                     success = false;
0242                     break; //no "part-numbers" found in the subject, we give up
0243                 }
0244             }
0245
0246             //everything before "begin" is text
0247             if (beginPos > 0) {
0248                 m_text.append(m_src.mid(currentPos, beginPos - currentPos));
0249             }
0250
0251             if (containsBegin) {
0252                 //everything between "begin ### " and the next LF is considered as the filename
0253                 fileName = m_src.mid(beginPos + 10, uuStart - beginPos - 11);
0254             } else {
0255                 fileName = "";
0256             }
0257             m_filenames.append(fileName);
0258             //everything between "begin" and "end" is uuencoded
0259             m_bins.append(m_src.mid(uuStart, endPos - uuStart + 1));
0260             m_mimeTypes.append(guessMimeType(fileName));
0261             firstIteration = false;
0262
0263             int next = m_src.indexOf('\n', endPos + 1);
0264             if (next == -1) {   //no more line breaks found, we give up
0265                 success = false;
0266                 break;
0267             } else {
0268                 next++; //points now at the beginning of the next line
0269             }
0270             currentPos = next;
0271
0272         } else {
0273             success = false;
0274         }
0275     }
0276
0277     // append trailing text part of the article
0278     m_text.append(m_src.right(m_src.length() - currentPos));
0279
0280     return ((!m_bins.isEmpty()) || isPartial());
0281 }
0282
0283 //==============================================================================
0284
0285 YENCEncoded::YENCEncoded(const QByteArray &src) :
0286     NonMimeParser(src)
0287 {
0288 }
0289
0290 bool YENCEncoded::yencMeta(QByteArray &src, const QByteArray &name, int *value)
0291 {
0292     bool found = false;
0293     QByteArray sought = name + '=';
0294
0295     int iPos = src.indexOf(sought);
0296     if (iPos > -1) {
0297         int pos1 = src.indexOf(' ', iPos);
0298         int pos2 = src.indexOf('\r', iPos);
0299         int pos3 = src.indexOf('\t', iPos);
0300         int pos4 = src.indexOf('\n', iPos);
0301         if (pos2 >= 0 && (pos1 < 0 || pos1 > pos2)) {
0302             pos1 = pos2;
0303         }
0304         if (pos3 >= 0 && (pos1 < 0 || pos1 > pos3)) {
0305             pos1 = pos3;
0306         }
0307         if (pos4 >= 0 && (pos1 < 0 || pos1 > pos4)) {
0308             pos1 = pos4;
0309         }
0310         iPos = src.lastIndexOf('=', pos1) + 1;
0311         if (iPos < pos1) {
0312             char c = src.at(iPos);
0313             if (c >= '0' && c <= '9') {
0314                 found = true;
0315                 *value = src.mid(iPos, pos1 - iPos).toInt();
0316             }
0317         }
0318     }
0319     return found;
0320 }
0321
0322 bool YENCEncoded::parse()
0323 {
0324     int currentPos = 0;
0325     bool success = true;
0326     while (success) {
0327         int beginPos = currentPos;
0328         int yencStart = currentPos;
0329         bool containsPart = false;
0330         QByteArray fileName;
0331
0332         if ((beginPos = m_src.indexOf("=ybegin ", currentPos)) > -1 &&
0333                 (beginPos == 0 || m_src.at(beginPos - 1) == '\n')) {
0334             yencStart = m_src.indexOf('\n', beginPos);
0335             if (yencStart == -1) {   // no more line breaks found, give up
0336                 success = false;
0337                 break;
0338             } else {
0339                 yencStart++;
0340                 if (m_src.indexOf("=ypart", yencStart) == yencStart) {
0341                     containsPart = true;
0342                     yencStart = m_src.indexOf('\n', yencStart);
0343                     if (yencStart == -1) {
0344                         success = false;
0345                         break;
0346                     }
0347                     yencStart++;
0348                 }
0349             }
0350             // Try to identify yenc meta data
0351
0352             // Filenames can contain any embedded chars until end of line
0353             QByteArray meta = m_src.mid(beginPos, yencStart - beginPos);
0354             int namePos = meta.indexOf("name=");
0355             if (namePos == -1) {
0356                 success = false;
0357                 break;
0358             }
0359             int eolPos = meta.indexOf('\r', namePos);
0360             if (eolPos == -1) {
0361                 eolPos = meta.indexOf('\n', namePos);
0362             }
0363             if (eolPos == -1) {
0364                 success = false;
0365                 break;
0366             }
0367             fileName = meta.mid(namePos + 5, eolPos - (namePos + 5));
0368
0369             // Other metadata is integer
0370             int yencLine;
0371             if (!yencMeta(meta, "line", &yencLine)) {
0372                 success = false;
0373                 break;
0374             }
0375             int yencSize;
0376             if (!yencMeta(meta, "size", &yencSize)) {
0377                 success = false;
0378                 break;
0379             }
0380
0381             int partBegin;
0382             int partEnd;
0383             if (containsPart) {
0384                 if (!yencMeta(meta, "part", &m_partNr)) {
0385                     success = false;
0386                     break;
0387                 }
0388                 if (!yencMeta(meta, "begin", &partBegin) ||
0389                         !yencMeta(meta, "end", &partEnd)) {
0390                     success = false;
0391                     break;
0392                 }
0393                 if (!yencMeta(meta, "total", &m_totalNr)) {
0394                     m_totalNr = m_partNr + 1;
0395                 }
0396                 if (yencSize == partEnd - partBegin + 1) {
0397                     m_totalNr = 1;
0398                 } else {
0399                     yencSize = partEnd - partBegin + 1;
0400                 }
0401             }
0402
0403             // We have a valid yenc header; now we extract the binary data
0404             int totalSize = 0;
0405             int pos = yencStart;
0406             int len = m_src.length();
0407             bool lineStart = true;
0408             int lineLength = 0;
0409             bool containsEnd = false;
0410             QByteArray binary;
0411             binary.resize(yencSize);
0412             while (pos < len) {
0413                 int ch = m_src.at(pos);
0414                 if (ch < 0) {
0415                     ch += 256;
0416                 }
0417                 if (ch == '\r') {
0418                     if (lineLength != yencLine && totalSize != yencSize) {
0419                         break;
0420                     }
0421                     pos++;
0422                 } else if (ch == '\n') {
0423                     lineStart = true;
0424                     lineLength = 0;
0425                     pos++;
0426                 } else {
0427                     if (ch == '=') {
0428                         if (pos + 1 < len) {
0429                             ch = m_src.at(pos + 1);
0430                             if (lineStart && ch == 'y') {
0431                                 containsEnd = true;
0432                                 break;
0433                             }
0434                             pos += 2;
0435                             ch -= 64 + 42;
0436                             if (ch < 0) {
0437                                 ch += 256;
0438                             }
0439                             if (totalSize >= yencSize) {
0440                                 break;
0441                             }
0442                             binary[totalSize++] = ch;
0443                             lineLength++;
0444                         } else {
0445                             break;
0446                         }
0447                     } else {
0448                         ch -= 42;
0449                         if (ch < 0) {
0450                             ch += 256;
0451                         }
0452                         if (totalSize >= yencSize) {
0453                             break;
0454                         }
0455                         binary[totalSize++] = ch;
0456                         lineLength++;
0457                         pos++;
0458                     }
0459                     lineStart = false;
0460                 }
0461             }
0462
0463             if (!containsEnd) {
0464                 success = false;
0465                 break;
0466             }
0467             if (totalSize != yencSize) {
0468                 success = false;
0469                 break;
0470             }
0471
0472             // pos now points to =yend; get end data
0473             eolPos = m_src.indexOf('\n', pos);
0474             if (eolPos == -1) {
0475                 success = false;
0476                 break;
0477             }
0478             meta = m_src.mid(pos, eolPos - pos);
0479             if (!yencMeta(meta, "size", &totalSize)) {
0480                 success = false;
0481                 break;
0482             }
0483             if (totalSize != yencSize) {
0484                 success = false;
0485                 break;
0486             }
0487
0488             m_filenames.append(fileName);
0489             m_mimeTypes.append(guessMimeType(fileName));
0490             m_bins.append(binary);
0491
0492             //everything before "begin" is text
0493             if (beginPos > 0) {
0494                 m_text.append(m_src.mid(currentPos, beginPos - currentPos));
0495             }
0496             currentPos = eolPos + 1;
0497
0498         } else {
0499             success = false;
0500         }
0501     }
0502
0503     // append trailing text part of the article
0504     m_text.append(m_src.right(m_src.length() - currentPos));
0505
0506     return !m_bins.isEmpty();
0507 }
0508
0509 } // namespace Parser
0510
0511 } // namespace KMime