// File indexing completed on 2024-04-21 16:06:19
0001 /* 0002 kmime_parsers.cpp 0003 0004 KMime, the KDE Internet mail/usenet news message library. 0005 SPDX-FileCopyrightText: 2001 the KMime authors. 0006 See file AUTHORS for details 0007 0008 SPDX-License-Identifier: LGPL-2.0-or-later 0009 */ 0010 #include "kmime_parsers_p.h" 0011 0012 #include <QRegularExpression> 0013 0014 using namespace KMime::Parser; 0015 0016 namespace KMime 0017 { 0018 namespace Parser 0019 { 0020 0021 MultiPart::MultiPart(const QByteArray &src, const QByteArray &boundary) 0022 : m_src(src) 0023 , m_boundary(boundary) 0024 { 0025 } 0026 0027 bool MultiPart::parse() 0028 { 0029 QByteArray b = "--" + m_boundary; 0030 QByteArray part; 0031 int pos1 = 0; 0032 int pos2 = 0; 0033 int blen = b.length(); 0034 0035 m_parts.clear(); 0036 0037 //find the first valid boundary 0038 while (true) { 0039 if ((pos1 = m_src.indexOf(b, pos1)) == -1 || pos1 == 0 || 0040 m_src[pos1 - 1] == '\n') { //valid boundary found or no boundary at all 0041 break; 0042 } 0043 pos1 += blen; //boundary found but not valid => skip it; 0044 } 0045 0046 if (pos1 > -1) { 0047 pos1 += blen; 0048 if (m_src[pos1] == '-' && m_src[pos1 + 1] == '-') { 0049 // the only valid boundary is the end-boundary 0050 // this message is *really* broken 0051 pos1 = -1; //we give up 0052 } else if ((pos1 - blen) > 1) { //preamble present 0053 m_preamble = m_src.left(pos1 - blen - 1); 0054 } 0055 } 0056 0057 while (pos1 > -1 && pos2 > -1) { 0058 0059 //skip the rest of the line for the first boundary - the message-part starts here 0060 if ((pos1 = m_src.indexOf('\n', pos1)) > -1) { 0061 //now search the next linebreak 0062 //now find the next valid boundary 0063 pos2 = ++pos1; //pos1 and pos2 point now to the beginning of the next line after the boundary 0064 while (true) { 0065 if ((pos2 = m_src.indexOf(b, pos2)) == -1 || 0066 m_src[pos2 - 1] == '\n') { //valid boundary or no more boundaries found 0067 break; 0068 } 0069 pos2 += blen; //boundary is invalid => skip it; 0070 } 0071 0072 if (pos2 
== -1) { // no more boundaries found 0073 part = m_src.mid(pos1, m_src.length() - pos1); //take the rest of the string 0074 m_parts.append(part); 0075 pos1 = -1; 0076 pos2 = -1; //break; 0077 } else { 0078 part = m_src.mid(pos1, pos2 - pos1 - 1); // pos2 - 1 (\n) is part of the boundary (see RFC 2046, section 5.1.1) 0079 m_parts.append(part); 0080 pos2 += blen; //pos2 points now to the first character after the boundary 0081 if (m_src[pos2] == '-' && m_src[pos2 + 1] == '-') { //end-boundary 0082 pos1 = pos2 + 2; //pos1 points now to the character directly after the end-boundary 0083 0084 if ((pos1 = m_src.indexOf('\n', pos1)) > -1) { //skip the rest of this line 0085 //everything after the end-boundary is considered as the epilouge 0086 m_epilouge = m_src.mid(pos1 + 1, m_src.length() - pos1 - 1); 0087 } 0088 pos1 = -1; 0089 pos2 = -1; //break 0090 } else { 0091 pos1 = pos2; //the search continues ... 0092 } 0093 } 0094 } 0095 } 0096 0097 return !m_parts.isEmpty(); 0098 } 0099 0100 //============================================================================= 0101 0102 NonMimeParser::NonMimeParser(const QByteArray &src) : 0103 m_src(src), m_partNr(-1), m_totalNr(-1) 0104 { 0105 } 0106 0107 NonMimeParser::~NonMimeParser() = default; 0108 0109 /** 0110 * try to guess the mimetype from the file-extension 0111 */ 0112 0113 QByteArray NonMimeParser::guessMimeType(const QByteArray &fileName) 0114 { 0115 QByteArray tmp; 0116 QByteArray mimeType; 0117 0118 if (!fileName.isEmpty()) { 0119 int pos = fileName.lastIndexOf('.'); 0120 if (pos++ != -1) { 0121 tmp = fileName.mid(pos, fileName.length() - pos).toUpper(); 0122 if (tmp == "JPG" || tmp == "JPEG") { 0123 mimeType = QByteArrayLiteral("image/jpeg"); 0124 } else if (tmp == "GIF") { 0125 mimeType = QByteArrayLiteral("image/gif"); 0126 } else if (tmp == "PNG") { 0127 mimeType = QByteArrayLiteral("image/png"); 0128 } else if (tmp == "TIFF" || tmp == "TIF") { 0129 mimeType = QByteArrayLiteral("image/tiff"); 0130 } else if (tmp 
== "XPM") { 0131 mimeType = QByteArrayLiteral("image/x-xpixmap"); 0132 } else if (tmp == "XBM") { 0133 mimeType = QByteArrayLiteral("image/x-xbitmap"); 0134 } else if (tmp == "BMP") { 0135 mimeType = QByteArrayLiteral("image/bmp"); 0136 } else if (tmp == "TXT" || 0137 tmp == "ASC" || 0138 tmp == "H" || 0139 tmp == "C" || 0140 tmp == "CC" || 0141 tmp == "CPP") { 0142 mimeType = QByteArrayLiteral("text/plain"); 0143 } else if (tmp == "HTML" || tmp == "HTM") { 0144 mimeType = QByteArrayLiteral("text/html"); 0145 } else { 0146 mimeType = QByteArrayLiteral("application/octet-stream"); 0147 } 0148 } else { 0149 mimeType = QByteArrayLiteral("application/octet-stream"); 0150 } 0151 } else { 0152 mimeType = QByteArrayLiteral("application/octet-stream"); 0153 } 0154 0155 return mimeType; 0156 } 0157 0158 //============================================================================== 0159 0160 UUEncoded::UUEncoded(const QByteArray &src, const QByteArray &subject) : 0161 NonMimeParser(src), m_subject(subject) 0162 {} 0163 0164 bool UUEncoded::parse() 0165 { 0166 int currentPos = 0; 0167 bool success = true; 0168 bool firstIteration = true; 0169 0170 const auto srcStr = QString::fromLatin1(m_src); 0171 const QRegularExpression beginRegex(QStringLiteral("begin [0-9][0-9][0-9]")); 0172 const QRegularExpression subjectRegex(QStringLiteral("[0-9]+/[0-9]+")); 0173 0174 while (success) { 0175 int beginPos = currentPos; 0176 int uuStart = currentPos; 0177 int endPos = 0; 0178 int lineCount = 0; 0179 int MCount = 0; 0180 int pos = 0; 0181 int len = 0; 0182 bool containsBegin = false; 0183 bool containsEnd = false; 0184 QByteArray tmp; 0185 QByteArray fileName; 0186 0187 if ((beginPos = srcStr.indexOf(beginRegex, currentPos)) > -1 && 0188 (beginPos == 0 || m_src.at(beginPos - 1) == '\n')) { 0189 containsBegin = true; 0190 uuStart = m_src.indexOf('\n', beginPos); 0191 if (uuStart == -1) { //no more line breaks found, we give up 0192 success = false; 0193 break; 0194 } else { 0195 
uuStart++; //points now at the beginning of the next line 0196 } 0197 } else { 0198 beginPos = currentPos; 0199 } 0200 0201 if ((endPos = m_src.indexOf("\nend", (uuStart > 0) ? uuStart - 1 : 0)) == -1) { 0202 endPos = m_src.length(); //no end found 0203 } else { 0204 containsEnd = true; 0205 } 0206 0207 if ((containsBegin && containsEnd) || firstIteration) { 0208 0209 //printf("beginPos=%d , uuStart=%d , endPos=%d\n", beginPos, uuStart, endPos); 0210 //all lines in a uuencoded text start with 'M' 0211 for (int idx = uuStart; idx < endPos; idx++) { 0212 if (m_src[idx] == '\n') { 0213 lineCount++; 0214 if (idx + 1 < endPos && m_src[idx + 1] == 'M') { 0215 idx++; 0216 MCount++; 0217 } 0218 } 0219 } 0220 0221 //printf("lineCount=%d , MCount=%d\n", lineCount, MCount); 0222 if (MCount == 0 || (lineCount - MCount) > 10 || 0223 ((!containsBegin || !containsEnd) && (MCount < 15))) { 0224 // harder check for split-articles 0225 success = false; 0226 break; //too many "non-M-Lines" found, we give up 0227 } 0228 0229 if ((!containsBegin || !containsEnd) && !m_subject.isNull()) { 0230 // message may be split up => parse subject 0231 const auto match = 0232 subjectRegex.match(QLatin1StringView(m_subject)); 0233 pos = match.capturedStart(0); 0234 len = match.capturedLength(0); 0235 if (pos != -1) { 0236 tmp = m_subject.mid(pos, len); 0237 pos = tmp.indexOf('/'); 0238 m_partNr = tmp.left(pos).toInt(); 0239 m_totalNr = tmp.right(tmp.length() - pos - 1).toInt(); 0240 } else { 0241 success = false; 0242 break; //no "part-numbers" found in the subject, we give up 0243 } 0244 } 0245 0246 //everything before "begin" is text 0247 if (beginPos > 0) { 0248 m_text.append(m_src.mid(currentPos, beginPos - currentPos)); 0249 } 0250 0251 if (containsBegin) { 0252 //everything between "begin ### " and the next LF is considered as the filename 0253 fileName = m_src.mid(beginPos + 10, uuStart - beginPos - 11); 0254 } else { 0255 fileName = ""; 0256 } 0257 m_filenames.append(fileName); 0258 
//everything between "begin" and "end" is uuencoded 0259 m_bins.append(m_src.mid(uuStart, endPos - uuStart + 1)); 0260 m_mimeTypes.append(guessMimeType(fileName)); 0261 firstIteration = false; 0262 0263 int next = m_src.indexOf('\n', endPos + 1); 0264 if (next == -1) { //no more line breaks found, we give up 0265 success = false; 0266 break; 0267 } else { 0268 next++; //points now at the beginning of the next line 0269 } 0270 currentPos = next; 0271 0272 } else { 0273 success = false; 0274 } 0275 } 0276 0277 // append trailing text part of the article 0278 m_text.append(m_src.right(m_src.length() - currentPos)); 0279 0280 return ((!m_bins.isEmpty()) || isPartial()); 0281 } 0282 0283 //============================================================================== 0284 0285 YENCEncoded::YENCEncoded(const QByteArray &src) : 0286 NonMimeParser(src) 0287 { 0288 } 0289 0290 bool YENCEncoded::yencMeta(QByteArray &src, const QByteArray &name, int *value) 0291 { 0292 bool found = false; 0293 QByteArray sought = name + '='; 0294 0295 int iPos = src.indexOf(sought); 0296 if (iPos > -1) { 0297 int pos1 = src.indexOf(' ', iPos); 0298 int pos2 = src.indexOf('\r', iPos); 0299 int pos3 = src.indexOf('\t', iPos); 0300 int pos4 = src.indexOf('\n', iPos); 0301 if (pos2 >= 0 && (pos1 < 0 || pos1 > pos2)) { 0302 pos1 = pos2; 0303 } 0304 if (pos3 >= 0 && (pos1 < 0 || pos1 > pos3)) { 0305 pos1 = pos3; 0306 } 0307 if (pos4 >= 0 && (pos1 < 0 || pos1 > pos4)) { 0308 pos1 = pos4; 0309 } 0310 iPos = src.lastIndexOf('=', pos1) + 1; 0311 if (iPos < pos1) { 0312 char c = src.at(iPos); 0313 if (c >= '0' && c <= '9') { 0314 found = true; 0315 *value = src.mid(iPos, pos1 - iPos).toInt(); 0316 } 0317 } 0318 } 0319 return found; 0320 } 0321 0322 bool YENCEncoded::parse() 0323 { 0324 int currentPos = 0; 0325 bool success = true; 0326 while (success) { 0327 int beginPos = currentPos; 0328 int yencStart = currentPos; 0329 bool containsPart = false; 0330 QByteArray fileName; 0331 0332 if ((beginPos = 
m_src.indexOf("=ybegin ", currentPos)) > -1 && 0333 (beginPos == 0 || m_src.at(beginPos - 1) == '\n')) { 0334 yencStart = m_src.indexOf('\n', beginPos); 0335 if (yencStart == -1) { // no more line breaks found, give up 0336 success = false; 0337 break; 0338 } else { 0339 yencStart++; 0340 if (m_src.indexOf("=ypart", yencStart) == yencStart) { 0341 containsPart = true; 0342 yencStart = m_src.indexOf('\n', yencStart); 0343 if (yencStart == -1) { 0344 success = false; 0345 break; 0346 } 0347 yencStart++; 0348 } 0349 } 0350 // Try to identify yenc meta data 0351 0352 // Filenames can contain any embedded chars until end of line 0353 QByteArray meta = m_src.mid(beginPos, yencStart - beginPos); 0354 int namePos = meta.indexOf("name="); 0355 if (namePos == -1) { 0356 success = false; 0357 break; 0358 } 0359 int eolPos = meta.indexOf('\r', namePos); 0360 if (eolPos == -1) { 0361 eolPos = meta.indexOf('\n', namePos); 0362 } 0363 if (eolPos == -1) { 0364 success = false; 0365 break; 0366 } 0367 fileName = meta.mid(namePos + 5, eolPos - (namePos + 5)); 0368 0369 // Other metadata is integer 0370 int yencLine; 0371 if (!yencMeta(meta, "line", ¥cLine)) { 0372 success = false; 0373 break; 0374 } 0375 int yencSize; 0376 if (!yencMeta(meta, "size", ¥cSize)) { 0377 success = false; 0378 break; 0379 } 0380 0381 int partBegin; 0382 int partEnd; 0383 if (containsPart) { 0384 if (!yencMeta(meta, "part", &m_partNr)) { 0385 success = false; 0386 break; 0387 } 0388 if (!yencMeta(meta, "begin", &partBegin) || 0389 !yencMeta(meta, "end", &partEnd)) { 0390 success = false; 0391 break; 0392 } 0393 if (!yencMeta(meta, "total", &m_totalNr)) { 0394 m_totalNr = m_partNr + 1; 0395 } 0396 if (yencSize == partEnd - partBegin + 1) { 0397 m_totalNr = 1; 0398 } else { 0399 yencSize = partEnd - partBegin + 1; 0400 } 0401 } 0402 0403 // We have a valid yenc header; now we extract the binary data 0404 int totalSize = 0; 0405 int pos = yencStart; 0406 int len = m_src.length(); 0407 bool lineStart = true; 
0408 int lineLength = 0; 0409 bool containsEnd = false; 0410 QByteArray binary; 0411 binary.resize(yencSize); 0412 while (pos < len) { 0413 int ch = m_src.at(pos); 0414 if (ch < 0) { 0415 ch += 256; 0416 } 0417 if (ch == '\r') { 0418 if (lineLength != yencLine && totalSize != yencSize) { 0419 break; 0420 } 0421 pos++; 0422 } else if (ch == '\n') { 0423 lineStart = true; 0424 lineLength = 0; 0425 pos++; 0426 } else { 0427 if (ch == '=') { 0428 if (pos + 1 < len) { 0429 ch = m_src.at(pos + 1); 0430 if (lineStart && ch == 'y') { 0431 containsEnd = true; 0432 break; 0433 } 0434 pos += 2; 0435 ch -= 64 + 42; 0436 if (ch < 0) { 0437 ch += 256; 0438 } 0439 if (totalSize >= yencSize) { 0440 break; 0441 } 0442 binary[totalSize++] = ch; 0443 lineLength++; 0444 } else { 0445 break; 0446 } 0447 } else { 0448 ch -= 42; 0449 if (ch < 0) { 0450 ch += 256; 0451 } 0452 if (totalSize >= yencSize) { 0453 break; 0454 } 0455 binary[totalSize++] = ch; 0456 lineLength++; 0457 pos++; 0458 } 0459 lineStart = false; 0460 } 0461 } 0462 0463 if (!containsEnd) { 0464 success = false; 0465 break; 0466 } 0467 if (totalSize != yencSize) { 0468 success = false; 0469 break; 0470 } 0471 0472 // pos now points to =yend; get end data 0473 eolPos = m_src.indexOf('\n', pos); 0474 if (eolPos == -1) { 0475 success = false; 0476 break; 0477 } 0478 meta = m_src.mid(pos, eolPos - pos); 0479 if (!yencMeta(meta, "size", &totalSize)) { 0480 success = false; 0481 break; 0482 } 0483 if (totalSize != yencSize) { 0484 success = false; 0485 break; 0486 } 0487 0488 m_filenames.append(fileName); 0489 m_mimeTypes.append(guessMimeType(fileName)); 0490 m_bins.append(binary); 0491 0492 //everything before "begin" is text 0493 if (beginPos > 0) { 0494 m_text.append(m_src.mid(currentPos, beginPos - currentPos)); 0495 } 0496 currentPos = eolPos + 1; 0497 0498 } else { 0499 success = false; 0500 } 0501 } 0502 0503 // append trailing text part of the article 0504 m_text.append(m_src.right(m_src.length() - currentPos)); 0505 
0506 return !m_bins.isEmpty(); 0507 } 0508 0509 } // namespace Parser 0510 0511 } // namespace KMime