File indexing completed on 2022-11-29 19:53:36

0001 /*
0002   kmime_codecs.cpp
0003 
0004   KMime, the KDE Internet mail/usenet news message library.
0005   SPDX-FileCopyrightText: 2001 the KMime authors.
0006   See file AUTHORS for details
0007 
0008   SPDX-License-Identifier: LGPL-2.0-or-later
0009 */
0010 
#include "kmime_codecs.h"
#include "kmime_debug.h"
#include <KCharsets>

#include <QTextCodec>

#include <cstring>
0016 
0017 namespace KMime {
0018 
// Characters treated as "reserved" by the RFC 2047 encoders in this file:
// encodeRFC2047String() forces encoding of words containing them when it is
// asked to encode an address header, and encodeRFC2047Sentence() uses them as
// split points that are copied to the output verbatim.
static const char reservedCharacters[] = "\"()<>@,.;:\\[]=";
0020 
0021 QByteArray encodeRFC2047String(const QString &src, const QByteArray &charset,
0022                                bool addressHeader, bool allow8BitHeaders)
0023 {
0024     QByteArray result;
0025     int start = 0;
0026     int end = 0;
0027     bool nonAscii = false;
0028     bool ok = true;
0029     bool useQEncoding = false;
0030 
0031     // fromLatin1() is safe here, codecForName() uses toLatin1() internally
0032     const QTextCodec *codec = KCharsets::charsets()->codecForName(QString::fromLatin1(charset), ok);
0033 
0034     QByteArray usedCS;
0035     if (!ok) {
0036         //no codec available => try local8Bit and hope the best ;-)
0037         usedCS = QTextCodec::codecForLocale()->name();
0038         codec = KCharsets::charsets()->codecForName(QString::fromLatin1(usedCS), ok);
0039     } else {
0040         Q_ASSERT(codec);
0041         if (charset.isEmpty()) {
0042             usedCS = codec->name();
0043         } else {
0044             usedCS = charset;
0045         }
0046     }
0047 
0048     QTextCodec::ConverterState converterState(QTextCodec::IgnoreHeader);
0049     QByteArray encoded8Bit = codec->fromUnicode(src.constData(), src.length(), &converterState);
0050     if (converterState.invalidChars > 0) {
0051         usedCS = "utf-8";
0052         codec = QTextCodec::codecForName(usedCS);
0053         encoded8Bit = codec->fromUnicode(src);
0054     }
0055 
0056     if (usedCS.contains("8859-")) {     // use "B"-Encoding for non iso-8859-x charsets
0057         useQEncoding = true;
0058     }
0059 
0060     if (allow8BitHeaders) {
0061         return encoded8Bit;
0062     }
0063 
0064     int encoded8BitLength = encoded8Bit.length();
0065     for (int i = 0; i < encoded8BitLength; i++) {
0066         if (encoded8Bit[i] == ' ') {   // encoding starts at word boundaries
0067             start = i + 1;
0068         }
0069 
0070         // encode escape character, for japanese encodings...
0071         if (((signed char)encoded8Bit[i] < 0) || (encoded8Bit[i] == '\033') ||
0072                 (addressHeader && (strchr("\"()<>@,.;:\\[]=", encoded8Bit[i]) != nullptr))) {
0073             end = start;   // non us-ascii char found, now we determine where to stop encoding
0074             nonAscii = true;
0075             break;
0076         }
0077     }
0078 
0079     if (nonAscii) {
0080         while ((end < encoded8Bit.length()) && (encoded8Bit[end] != ' ')) {
0081             // we encode complete words
0082             end++;
0083         }
0084 
0085         for (int x = end; x < encoded8Bit.length(); x++) {
0086             if (((signed char)encoded8Bit[x] < 0) || (encoded8Bit[x] == '\033') ||
0087                     (addressHeader && (strchr(reservedCharacters, encoded8Bit[x]) != nullptr))) {
0088                 end = x;     // we found another non-ascii word
0089 
0090                 while ((end < encoded8Bit.length()) && (encoded8Bit[end] != ' ')) {
0091                     // we encode complete words
0092                     end++;
0093                 }
0094             }
0095         }
0096 
0097         result = encoded8Bit.left(start) + "=?" + usedCS;
0098 
0099         if (useQEncoding) {
0100             result += "?Q?";
0101 
0102             char c;
0103             char hexcode; // "Q"-encoding implementation described in RFC 2047
0104             for (int i = start; i < end; i++) {
0105                 c = encoded8Bit[i];
0106                 if (c == ' ') {   // make the result readable with not MIME-capable readers
0107                     result += '_';
0108                 } else {
0109                     if (((c >= 'a') && (c <= 'z')) ||        // paranoid mode, encode *all* special chars to avoid problems
0110                             ((c >= 'A') && (c <= 'Z')) ||        // with "From" & "To" headers
0111                             ((c >= '0') && (c <= '9'))) {
0112                         result += c;
0113                     } else {
0114                         result += '=';                 // "stolen" from KMail ;-)
0115                         hexcode = ((c & 0xF0) >> 4) + 48;
0116                         if (hexcode >= 58) {
0117                             hexcode += 7;
0118                         }
0119                         result += hexcode;
0120                         hexcode = (c & 0x0F) + 48;
0121                         if (hexcode >= 58) {
0122                             hexcode += 7;
0123                         }
0124                         result += hexcode;
0125                     }
0126                 }
0127             }
0128         } else {
0129             result += "?B?" + encoded8Bit.mid(start, end - start).toBase64();
0130         }
0131 
0132         result += "?=";
0133         result += encoded8Bit.right(encoded8Bit.length() - end);
0134     } else {
0135         result = encoded8Bit;
0136     }
0137 
0138     return result;
0139 }
0140 
0141 QByteArray encodeRFC2047Sentence(const QString &src, const QByteArray &charset)
0142 {
0143     QByteArray result;
0144     const QChar *ch = src.constData();
0145     const int length = src.length();
0146     int pos = 0;
0147     int wordStart = 0;
0148 
0149     //qCDebug(KMIME_LOG) << "Input:" << src;
0150     // Loop over all characters of the string.
0151     // When encountering a split character, RFC-2047-encode the word before it, and add it to the result.
0152     while (pos < length) {
0153         //qCDebug(KMIME_LOG) << "Pos:" << pos << "Result:" << result << "Char:" << ch->toLatin1();
0154         const bool isAscii = ch->unicode() < 127;
0155         const bool isReserved = (strchr(reservedCharacters, ch->toLatin1()) != nullptr);
0156         if (isAscii && isReserved) {
0157             const int wordSize = pos - wordStart;
0158             if (wordSize > 0) {
0159                 const QString word = src.mid(wordStart, wordSize);
0160                 result += encodeRFC2047String(word, charset);
0161             }
0162 
0163             result += ch->toLatin1();
0164             wordStart = pos + 1;
0165         }
0166         ch++;
0167         pos++;
0168     }
0169 
0170     // Encode the last word
0171     const int wordSize = pos - wordStart;
0172     if (wordSize > 0) {
0173         const QString word = src.mid(wordStart, pos - wordStart);
0174         result += encodeRFC2047String(word, charset);
0175     }
0176 
0177     return result;
0178 }
0179 
0180 //-----------------------------------------------------------------------------
0181 QByteArray encodeRFC2231String(const QString &str, const QByteArray &charset)
0182 {
0183     if (str.isEmpty()) {
0184       return {};
0185     }
0186 
0187     const QTextCodec *codec = KCharsets::charsets()->codecForName(QString::fromLatin1(charset));
0188     QByteArray latin;
0189     if (charset == "us-ascii") {
0190         latin = str.toLatin1();
0191     } else if (codec) {
0192         latin = codec->fromUnicode(str);
0193     } else {
0194         latin = str.toLocal8Bit();
0195     }
0196 
0197     char *l;
0198     for (l = latin.data(); *l; ++l) {
0199         if (((*l & 0xE0) == 0) || (*l & 0x80)) {
0200             // *l is control character or 8-bit char
0201             break;
0202         }
0203     }
0204     if (!*l) {
0205         return latin;
0206     }
0207 
0208     QByteArray result = charset + "''";
0209     for (l = latin.data(); *l; ++l) {
0210         bool needsQuoting = (*l & 0x80) || (*l == '%');
0211         if (!needsQuoting) {
0212             const QByteArray especials = "()<>@,;:\"/[]?.= \033";
0213             int len = especials.length();
0214             for (int i = 0; i < len; i++) {
0215                 if (*l == especials[i]) {
0216                     needsQuoting = true;
0217                     break;
0218                 }
0219             }
0220         }
0221         if (needsQuoting) {
0222             result += '%';
0223             unsigned char hexcode;
0224             hexcode = ((*l & 0xF0) >> 4) + 48;
0225             if (hexcode >= 58) {
0226                 hexcode += 7;
0227             }
0228             result += hexcode;
0229             hexcode = (*l & 0x0F) + 48;
0230             if (hexcode >= 58) {
0231                 hexcode += 7;
0232             }
0233             result += hexcode;
0234         } else {
0235             result += *l;
0236         }
0237     }
0238     return result;
0239 }
0240 
0241 //-----------------------------------------------------------------------------
0242 QString decodeRFC2231String(const QByteArray &str, QByteArray &usedCS, const QByteArray &defaultCS,
0243                             bool forceCS)
0244 {
0245     int p = str.indexOf('\'');
0246     if (p < 0) {
0247         return KCharsets::charsets()->codecForName(QString::fromLatin1(defaultCS))->toUnicode(str);
0248     }
0249 
0250     QByteArray charset = str.left(p);
0251 
0252     QByteArray st = str.mid(str.lastIndexOf('\'') + 1);
0253 
0254     char ch;
0255     char ch2;
0256     p = 0;
0257     while (p < st.length()) {
0258         if (st.at(p) == 37) {
0259             // Only try to decode the percent-encoded character if the percent sign
0260             // is really followed by two other characters, see testcase at bug 163024
0261             if (p + 2 < st.length()) {
0262                 ch = st.at(p + 1) - 48;
0263                 if (ch > 16) {
0264                     ch -= 7;
0265                 }
0266                 ch2 = st.at(p + 2) - 48;
0267                 if (ch2 > 16) {
0268                     ch2 -= 7;
0269                 }
0270                 st[p] = ch * 16 + ch2;
0271                 st.remove(p + 1, 2);
0272             }
0273         }
0274         p++;
0275     }
0276     qCDebug(KMIME_LOG) << "Got pre-decoded:" << st;
0277     const QTextCodec *charsetcodec = KCharsets::charsets()->codecForName(QString::fromLatin1(charset));
0278     if (!charsetcodec || forceCS) {
0279         charsetcodec = KCharsets::charsets()->codecForName(QString::fromLatin1(defaultCS));
0280     }
0281 
0282     usedCS = charsetcodec->name();
0283     return charsetcodec->toUnicode(st);
0284 }
0285 
0286 }