// File indexing completed on 2024-05-05 09:36:14
0001 /* 0002 kmime_parsers.cpp 0003 0004 KMime, the KDE Internet mail/usenet news message library. 0005 SPDX-FileCopyrightText: 2001 the KMime authors. 0006 See file AUTHORS for details 0007 0008 SPDX-License-Identifier: LGPL-2.0-or-later 0009 */ 0010 #include "kmime_parsers_p.h" 0011 0012 #include <QRegularExpression> 0013 0014 using namespace KMime::Parser; 0015 0016 namespace KMime 0017 { 0018 namespace Parser 0019 { 0020 0021 MultiPart::MultiPart(const QByteArray &src, const QByteArray &boundary) 0022 : m_src(src) 0023 , m_boundary(boundary) 0024 { 0025 } 0026 0027 bool MultiPart::parse() 0028 { 0029 QByteArray b = "--" + m_boundary; 0030 QByteArray part; 0031 int pos1 = 0; 0032 int pos2 = 0; 0033 int blen = b.length(); 0034 0035 m_parts.clear(); 0036 0037 //find the first valid boundary 0038 while (true) { 0039 if ((pos1 = m_src.indexOf(b, pos1)) == -1 || pos1 == 0 || 0040 m_src[pos1 - 1] == '\n') { //valid boundary found or no boundary at all 0041 break; 0042 } 0043 pos1 += blen; //boundary found but not valid => skip it; 0044 } 0045 0046 if (pos1 > -1) { 0047 pos1 += blen; 0048 if (m_src[pos1] == '-' && m_src[pos1 + 1] == '-') { 0049 // the only valid boundary is the end-boundary 0050 // this message is *really* broken 0051 pos1 = -1; //we give up 0052 } else if ((pos1 - blen) > 1) { //preamble present 0053 m_preamble = m_src.left(pos1 - blen - 1); 0054 } 0055 } 0056 0057 while (pos1 > -1 && pos2 > -1) { 0058 0059 //skip the rest of the line for the first boundary - the message-part starts here 0060 if ((pos1 = m_src.indexOf('\n', pos1)) > -1) { 0061 //now search the next linebreak 0062 //now find the next valid boundary 0063 pos2 = ++pos1; //pos1 and pos2 point now to the beginning of the next line after the boundary 0064 while (true) { 0065 if ((pos2 = m_src.indexOf(b, pos2)) == -1 || 0066 m_src[pos2 - 1] == '\n') { //valid boundary or no more boundaries found 0067 break; 0068 } 0069 pos2 += blen; //boundary is invalid => skip it; 0070 } 0071 0072 if (pos2 
== -1) { // no more boundaries found 0073 part = m_src.mid(pos1, m_src.length() - pos1); //take the rest of the string 0074 m_parts.append(part); 0075 pos1 = -1; 0076 pos2 = -1; //break; 0077 } else { 0078 part = m_src.mid(pos1, pos2 - pos1 - 1); // pos2 - 1 (\n) is part of the boundary (see RFC 2046, section 5.1.1) 0079 m_parts.append(part); 0080 pos2 += blen; //pos2 points now to the first character after the boundary 0081 if (m_src[pos2] == '-' && m_src[pos2 + 1] == '-') { //end-boundary 0082 pos1 = pos2 + 2; //pos1 points now to the character directly after the end-boundary 0083 0084 if ((pos1 = m_src.indexOf('\n', pos1)) > -1) { //skip the rest of this line 0085 //everything after the end-boundary is considered as the epilouge 0086 m_epilouge = m_src.mid(pos1 + 1, m_src.length() - pos1 - 1); 0087 } 0088 pos1 = -1; 0089 pos2 = -1; //break 0090 } else { 0091 pos1 = pos2; //the search continues ... 0092 } 0093 } 0094 } 0095 } 0096 0097 return !m_parts.isEmpty(); 0098 } 0099 0100 //============================================================================= 0101 0102 NonMimeParser::NonMimeParser(const QByteArray &src) : 0103 m_src(src), m_partNr(-1), m_totalNr(-1) 0104 { 0105 } 0106 0107 NonMimeParser::~NonMimeParser() = default; 0108 0109 /** 0110 * try to guess the mimetype from the file-extension 0111 */ 0112 0113 QByteArray NonMimeParser::guessMimeType(const QByteArray &fileName) 0114 { 0115 QByteArray tmp; 0116 QByteArray mimeType; 0117 0118 if (!fileName.isEmpty()) { 0119 int pos = fileName.lastIndexOf('.'); 0120 if (pos++ != -1) { 0121 tmp = fileName.mid(pos, fileName.length() - pos).toUpper(); 0122 if (tmp == "JPG" || tmp == "JPEG") { 0123 mimeType = QByteArrayLiteral("image/jpeg"); 0124 } else if (tmp == "GIF") { 0125 mimeType = QByteArrayLiteral("image/gif"); 0126 } else if (tmp == "PNG") { 0127 mimeType = QByteArrayLiteral("image/png"); 0128 } else if (tmp == "TIFF" || tmp == "TIF") { 0129 mimeType = QByteArrayLiteral("image/tiff"); 0130 } else if (tmp 
== "XPM") { 0131 mimeType = QByteArrayLiteral("image/x-xpixmap"); 0132 } else if (tmp == "XBM") { 0133 mimeType = QByteArrayLiteral("image/x-xbitmap"); 0134 } else if (tmp == "BMP") { 0135 mimeType = QByteArrayLiteral("image/bmp"); 0136 } else if (tmp == "TXT" || 0137 tmp == "ASC" || 0138 tmp == "H" || 0139 tmp == "C" || 0140 tmp == "CC" || 0141 tmp == "CPP") { 0142 mimeType = QByteArrayLiteral("text/plain"); 0143 } else if (tmp == "HTML" || tmp == "HTM") { 0144 mimeType = QByteArrayLiteral("text/html"); 0145 } else { 0146 mimeType = QByteArrayLiteral("application/octet-stream"); 0147 } 0148 } else { 0149 mimeType = QByteArrayLiteral("application/octet-stream"); 0150 } 0151 } else { 0152 mimeType = QByteArrayLiteral("application/octet-stream"); 0153 } 0154 0155 return mimeType; 0156 } 0157 0158 //============================================================================== 0159 0160 UUEncoded::UUEncoded(const QByteArray &src, const QByteArray &subject) : 0161 NonMimeParser(src), m_subject(subject) 0162 {} 0163 0164 bool UUEncoded::parse() 0165 { 0166 int currentPos = 0; 0167 bool success = true; 0168 bool firstIteration = true; 0169 0170 const auto srcStr = QString::fromLatin1(m_src); 0171 const QRegularExpression beginRegex(QStringLiteral("begin [0-9][0-9][0-9]")); 0172 const QRegularExpression subjectRegex(QStringLiteral("[0-9]+/[0-9]+")); 0173 0174 while (success) { 0175 int beginPos = currentPos; 0176 int uuStart = currentPos; 0177 int endPos = 0; 0178 int lineCount = 0; 0179 int MCount = 0; 0180 int pos = 0; 0181 int len = 0; 0182 bool containsBegin = false; 0183 bool containsEnd = false; 0184 QByteArray tmp; 0185 QByteArray fileName; 0186 0187 if ((beginPos = srcStr.indexOf(beginRegex, currentPos)) > -1 && 0188 (beginPos == 0 || m_src.at(beginPos - 1) == '\n')) { 0189 containsBegin = true; 0190 uuStart = m_src.indexOf('\n', beginPos); 0191 if (uuStart == -1) { //no more line breaks found, we give up 0192 success = false; 0193 break; 0194 } else { 0195 
uuStart++; //points now at the beginning of the next line 0196 } 0197 } else { 0198 beginPos = currentPos; 0199 } 0200 0201 if ((endPos = m_src.indexOf("\nend", (uuStart > 0) ? uuStart - 1 : 0)) == -1) { 0202 endPos = m_src.length(); //no end found 0203 } else { 0204 containsEnd = true; 0205 } 0206 0207 if ((containsBegin && containsEnd) || firstIteration) { 0208 0209 //printf("beginPos=%d , uuStart=%d , endPos=%d\n", beginPos, uuStart, endPos); 0210 //all lines in a uuencoded text start with 'M' 0211 for (int idx = uuStart; idx < endPos; idx++) { 0212 if (m_src[idx] == '\n') { 0213 lineCount++; 0214 if (idx + 1 < endPos && m_src[idx + 1] == 'M') { 0215 idx++; 0216 MCount++; 0217 } 0218 } 0219 } 0220 0221 //printf("lineCount=%d , MCount=%d\n", lineCount, MCount); 0222 if (MCount == 0 || (lineCount - MCount) > 10 || 0223 ((!containsBegin || !containsEnd) && (MCount < 15))) { 0224 // harder check for split-articles 0225 success = false; 0226 break; //too many "non-M-Lines" found, we give up 0227 } 0228 0229 if ((!containsBegin || !containsEnd) && !m_subject.isNull()) { 0230 // message may be split up => parse subject 0231 const auto match = subjectRegex.match(QLatin1String(m_subject)); 0232 pos = match.capturedStart(0); 0233 len = match.capturedLength(0); 0234 if (pos != -1) { 0235 tmp = m_subject.mid(pos, len); 0236 pos = tmp.indexOf('/'); 0237 m_partNr = tmp.left(pos).toInt(); 0238 m_totalNr = tmp.right(tmp.length() - pos - 1).toInt(); 0239 } else { 0240 success = false; 0241 break; //no "part-numbers" found in the subject, we give up 0242 } 0243 } 0244 0245 //everything before "begin" is text 0246 if (beginPos > 0) { 0247 m_text.append(m_src.mid(currentPos, beginPos - currentPos)); 0248 } 0249 0250 if (containsBegin) { 0251 //everything between "begin ### " and the next LF is considered as the filename 0252 fileName = m_src.mid(beginPos + 10, uuStart - beginPos - 11); 0253 } else { 0254 fileName = ""; 0255 } 0256 m_filenames.append(fileName); 0257 //everything 
between "begin" and "end" is uuencoded 0258 m_bins.append(m_src.mid(uuStart, endPos - uuStart + 1)); 0259 m_mimeTypes.append(guessMimeType(fileName)); 0260 firstIteration = false; 0261 0262 int next = m_src.indexOf('\n', endPos + 1); 0263 if (next == -1) { //no more line breaks found, we give up 0264 success = false; 0265 break; 0266 } else { 0267 next++; //points now at the beginning of the next line 0268 } 0269 currentPos = next; 0270 0271 } else { 0272 success = false; 0273 } 0274 } 0275 0276 // append trailing text part of the article 0277 m_text.append(m_src.right(m_src.length() - currentPos)); 0278 0279 return ((!m_bins.isEmpty()) || isPartial()); 0280 } 0281 0282 //============================================================================== 0283 0284 YENCEncoded::YENCEncoded(const QByteArray &src) : 0285 NonMimeParser(src) 0286 { 0287 } 0288 0289 bool YENCEncoded::yencMeta(QByteArray &src, const QByteArray &name, int *value) 0290 { 0291 bool found = false; 0292 QByteArray sought = name + '='; 0293 0294 int iPos = src.indexOf(sought); 0295 if (iPos > -1) { 0296 int pos1 = src.indexOf(' ', iPos); 0297 int pos2 = src.indexOf('\r', iPos); 0298 int pos3 = src.indexOf('\t', iPos); 0299 int pos4 = src.indexOf('\n', iPos); 0300 if (pos2 >= 0 && (pos1 < 0 || pos1 > pos2)) { 0301 pos1 = pos2; 0302 } 0303 if (pos3 >= 0 && (pos1 < 0 || pos1 > pos3)) { 0304 pos1 = pos3; 0305 } 0306 if (pos4 >= 0 && (pos1 < 0 || pos1 > pos4)) { 0307 pos1 = pos4; 0308 } 0309 iPos = src.lastIndexOf('=', pos1) + 1; 0310 if (iPos < pos1) { 0311 char c = src.at(iPos); 0312 if (c >= '0' && c <= '9') { 0313 found = true; 0314 *value = src.mid(iPos, pos1 - iPos).toInt(); 0315 } 0316 } 0317 } 0318 return found; 0319 } 0320 0321 bool YENCEncoded::parse() 0322 { 0323 int currentPos = 0; 0324 bool success = true; 0325 while (success) { 0326 int beginPos = currentPos; 0327 int yencStart = currentPos; 0328 bool containsPart = false; 0329 QByteArray fileName; 0330 0331 if ((beginPos = 
m_src.indexOf("=ybegin ", currentPos)) > -1 && 0332 (beginPos == 0 || m_src.at(beginPos - 1) == '\n')) { 0333 yencStart = m_src.indexOf('\n', beginPos); 0334 if (yencStart == -1) { // no more line breaks found, give up 0335 success = false; 0336 break; 0337 } else { 0338 yencStart++; 0339 if (m_src.indexOf("=ypart", yencStart) == yencStart) { 0340 containsPart = true; 0341 yencStart = m_src.indexOf('\n', yencStart); 0342 if (yencStart == -1) { 0343 success = false; 0344 break; 0345 } 0346 yencStart++; 0347 } 0348 } 0349 // Try to identify yenc meta data 0350 0351 // Filenames can contain any embedded chars until end of line 0352 QByteArray meta = m_src.mid(beginPos, yencStart - beginPos); 0353 int namePos = meta.indexOf("name="); 0354 if (namePos == -1) { 0355 success = false; 0356 break; 0357 } 0358 int eolPos = meta.indexOf('\r', namePos); 0359 if (eolPos == -1) { 0360 eolPos = meta.indexOf('\n', namePos); 0361 } 0362 if (eolPos == -1) { 0363 success = false; 0364 break; 0365 } 0366 fileName = meta.mid(namePos + 5, eolPos - (namePos + 5)); 0367 0368 // Other metadata is integer 0369 int yencLine; 0370 if (!yencMeta(meta, "line", ¥cLine)) { 0371 success = false; 0372 break; 0373 } 0374 int yencSize; 0375 if (!yencMeta(meta, "size", ¥cSize)) { 0376 success = false; 0377 break; 0378 } 0379 0380 int partBegin; 0381 int partEnd; 0382 if (containsPart) { 0383 if (!yencMeta(meta, "part", &m_partNr)) { 0384 success = false; 0385 break; 0386 } 0387 if (!yencMeta(meta, "begin", &partBegin) || 0388 !yencMeta(meta, "end", &partEnd)) { 0389 success = false; 0390 break; 0391 } 0392 if (!yencMeta(meta, "total", &m_totalNr)) { 0393 m_totalNr = m_partNr + 1; 0394 } 0395 if (yencSize == partEnd - partBegin + 1) { 0396 m_totalNr = 1; 0397 } else { 0398 yencSize = partEnd - partBegin + 1; 0399 } 0400 } 0401 0402 // We have a valid yenc header; now we extract the binary data 0403 int totalSize = 0; 0404 int pos = yencStart; 0405 int len = m_src.length(); 0406 bool lineStart = true; 
0407 int lineLength = 0; 0408 bool containsEnd = false; 0409 QByteArray binary; 0410 binary.resize(yencSize); 0411 while (pos < len) { 0412 int ch = m_src.at(pos); 0413 if (ch < 0) { 0414 ch += 256; 0415 } 0416 if (ch == '\r') { 0417 if (lineLength != yencLine && totalSize != yencSize) { 0418 break; 0419 } 0420 pos++; 0421 } else if (ch == '\n') { 0422 lineStart = true; 0423 lineLength = 0; 0424 pos++; 0425 } else { 0426 if (ch == '=') { 0427 if (pos + 1 < len) { 0428 ch = m_src.at(pos + 1); 0429 if (lineStart && ch == 'y') { 0430 containsEnd = true; 0431 break; 0432 } 0433 pos += 2; 0434 ch -= 64 + 42; 0435 if (ch < 0) { 0436 ch += 256; 0437 } 0438 if (totalSize >= yencSize) { 0439 break; 0440 } 0441 binary[totalSize++] = ch; 0442 lineLength++; 0443 } else { 0444 break; 0445 } 0446 } else { 0447 ch -= 42; 0448 if (ch < 0) { 0449 ch += 256; 0450 } 0451 if (totalSize >= yencSize) { 0452 break; 0453 } 0454 binary[totalSize++] = ch; 0455 lineLength++; 0456 pos++; 0457 } 0458 lineStart = false; 0459 } 0460 } 0461 0462 if (!containsEnd) { 0463 success = false; 0464 break; 0465 } 0466 if (totalSize != yencSize) { 0467 success = false; 0468 break; 0469 } 0470 0471 // pos now points to =yend; get end data 0472 eolPos = m_src.indexOf('\n', pos); 0473 if (eolPos == -1) { 0474 success = false; 0475 break; 0476 } 0477 meta = m_src.mid(pos, eolPos - pos); 0478 if (!yencMeta(meta, "size", &totalSize)) { 0479 success = false; 0480 break; 0481 } 0482 if (totalSize != yencSize) { 0483 success = false; 0484 break; 0485 } 0486 0487 m_filenames.append(fileName); 0488 m_mimeTypes.append(guessMimeType(fileName)); 0489 m_bins.append(binary); 0490 0491 //everything before "begin" is text 0492 if (beginPos > 0) { 0493 m_text.append(m_src.mid(currentPos, beginPos - currentPos)); 0494 } 0495 currentPos = eolPos + 1; 0496 0497 } else { 0498 success = false; 0499 } 0500 } 0501 0502 // append trailing text part of the article 0503 m_text.append(m_src.right(m_src.length() - currentPos)); 0504 
0505 return !m_bins.isEmpty(); 0506 } 0507 0508 } // namespace Parser 0509 0510 } // namespace KMime