kimap2/src/rfccodecs.cpp

0001 /**********************************************************************
0002  *
0003  *   rfccodecs.cpp - handler for various rfc/mime encodings
0004  *   Copyright (C) 2000 s.carstens@gmx.de
0005  *
0006  *   This library is free software; you can redistribute it and/or
0007  *   modify it under the terms of the GNU Library General Public
0008  *   License as published by the Free Software Foundation; either
0009  *   version 2 of the License, or (at your option) any later version.
0010  *
0011  *   This library is distributed in the hope that it will be useful,
0012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
0013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
0014  *   Library General Public License for more details.
0015  *
0016  *   You should have received a copy of the GNU Library General Public License
0017  *   along with this library; see the file COPYING.LIB.  If not, write to
0018  *   the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
0019  *   Boston, MA 02110-1301, USA.
0020  *
0021  *********************************************************************/
0022 /**
0023  * @file
0024  * This file is part of the IMAP support library and defines the
0025  * RfcCodecs class.
0026  *
0027  * @brief
0028  * Defines the RfcCodecs class.
0029  *
0030  * @author Sven Carstens
0031  */
0032
0033 #include "rfccodecs.h"
0034
0035 #include <ctype.h>
0036 #include <sys/types.h>
0037
0038 #include <stdio.h>
0039 #include <stdlib.h>
0040
0041 #include <QtCore/QTextCodec>
0042 #include <QtCore/QBuffer>
0043 #include <QtCore/QByteArray>
0044 #include <QtCore/QLatin1Char>
0045 #include <kcodecs.h>
0046
0047 using namespace KIMAP2;
0048
0049 // This part taken from rfc 2192 IMAP URL Scheme. C. Newman. September 1997.
0050 // adapted to QT-Toolkit by Sven Carstens <s.carstens@gmx.de> 2000
0051
0052 //@cond PRIVATE
// Alphabet of the modified base64 used for IMAP folder names. Its last two
// characters are '+' and ','. The index of a character in this string is its
// 6-bit value.
0053 static const unsigned char base64chars[] =
0054     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
// Value stored in the decoding table for bytes that are not part of
// base64chars (valid 6-bit values are 0..63).
0055 #define UNDEFINED 64
// NOTE(review): MAXLINE is not referenced anywhere in the visible source.
0056 #define MAXLINE  76
// The 16 characters (plus terminating NUL) that encodeRFC2047String() and
// encodeRFC2231String() always escape, in addition to bytes >= 0x80.
0057 static const char especials[17] = "()<>@,;:\"/[]?.= ";
0058
0059 /* UTF-16 surrogate constants used by the IMAP folder name codecs */
// low 10 bits carried by one surrogate
0060 #define UTF16MASK       0x03FFUL
// number of payload bits per surrogate
0061 #define UTF16SHIFT      10
// first code point that needs a surrogate pair
0062 #define UTF16BASE       0x10000UL
// range of high (leading) surrogates
0063 #define UTF16HIGHSTART  0xD800UL
0064 #define UTF16HIGHEND    0xDBFFUL
// range of low (trailing) surrogates
0065 #define UTF16LOSTART    0xDC00UL
0066 #define UTF16LOEND      0xDFFFUL
0067 //@endcond
0068
0069 //-----------------------------------------------------------------------------
0070 QByteArray KIMAP2::decodeImapFolderName(const QByteArray &inSrc)
0071 {
0072     unsigned char c, i, bitcount;
0073     unsigned long ucs4, utf16, bitbuf;
0074     unsigned char base64[256], utf8[6];
0075     unsigned int srcPtr = 0;
0076     QByteArray dst;
0077     QByteArray src = inSrc;
0078     uint srcLen = inSrc.length();
0079
0080     /* initialize modified base64 decoding table */
0081     memset(base64, UNDEFINED, sizeof(base64));
0082     for (i = 0; i < sizeof(base64chars); ++i) {
0083         base64[(int)base64chars[i]] = i;
0084     }
0085
0086     /* loop until end of string */
0087     while (srcPtr < srcLen) {
0088         c = src[srcPtr++];
0089         /* deal with literal characters and &- */
0090         if (c != '&' || src[srcPtr] == '-') {
0091             /* encode literally */
0092             dst += c;
0093             /* skip over the '-' if this is an &- sequence */
0094             if (c == '&') {
0095                 srcPtr++;
0096             }
0097         } else {
0098             /* convert modified UTF-7 -> UTF-16 -> UCS-4 -> UTF-8 -> HEX */
0099             bitbuf = 0;
0100             bitcount = 0;
0101             ucs4 = 0;
0102             while ((c = base64[(unsigned char)src[srcPtr]]) != UNDEFINED) {
0103                 ++srcPtr;
0104                 bitbuf = (bitbuf << 6) | c;
0105                 bitcount += 6;
0106                 /* enough bits for a UTF-16 character? */
0107                 if (bitcount >= 16) {
0108                     bitcount -= 16;
0109                     utf16 = (bitcount ? bitbuf >> bitcount : bitbuf) & 0xffff;
0110                     /* convert UTF16 to UCS4 */
0111                     if (utf16 >= UTF16HIGHSTART && utf16 <= UTF16HIGHEND) {
0112                         ucs4 = (utf16 - UTF16HIGHSTART) << UTF16SHIFT;
0113                         continue;
0114                     } else if (utf16 >= UTF16LOSTART && utf16 <= UTF16LOEND) {
0115                         ucs4 += utf16 - UTF16LOSTART + UTF16BASE;
0116                     } else {
0117                         ucs4 = utf16;
0118                     }
0119                     /* convert UTF-16 range of UCS4 to UTF-8 */
0120                     if (ucs4 <= 0x7fUL) {
0121                         utf8[0] = ucs4;
0122                         i = 1;
0123                     } else if (ucs4 <= 0x7ffUL) {
0124                         utf8[0] = 0xc0 | (ucs4 >> 6);
0125                         utf8[1] = 0x80 | (ucs4 & 0x3f);
0126                         i = 2;
0127                     } else if (ucs4 <= 0xffffUL) {
0128                         utf8[0] = 0xe0 | (ucs4 >> 12);
0129                         utf8[1] = 0x80 | ((ucs4 >> 6) & 0x3f);
0130                         utf8[2] = 0x80 | (ucs4 & 0x3f);
0131                         i = 3;
0132                     } else {
0133                         utf8[0] = 0xf0 | (ucs4 >> 18);
0134                         utf8[1] = 0x80 | ((ucs4 >> 12) & 0x3f);
0135                         utf8[2] = 0x80 | ((ucs4 >> 6) & 0x3f);
0136                         utf8[3] = 0x80 | (ucs4 & 0x3f);
0137                         i = 4;
0138                     }
0139                     /* copy it */
0140                     for (c = 0; c < i; ++c) {
0141                         dst += utf8[c];
0142                     }
0143                 }
0144             }
0145             /* skip over trailing '-' in modified UTF-7 encoding */
0146             if (src[srcPtr] == '-') {
0147                 ++srcPtr;
0148             }
0149         }
0150     }
0151     return dst;
0152 }
0153
0154 QString KIMAP2::decodeImapFolderName(const QString &inSrc)
0155 {
0156     return QString::fromUtf8(decodeImapFolderName(inSrc.toUtf8()).constData());
0157 }
0158
0159 //-----------------------------------------------------------------------------
0160
0161 QByteArray KIMAP2::quoteIMAP(const QByteArray &src)
0162 {
0163     uint len = src.length();
0164     QByteArray result;
0165     result.reserve(2 * len);
0166     for (unsigned int i = 0; i < len; i++) {
0167         if (src[i] == '"' || src[i] == '\\') {
0168             result += '\\';
0169         }
0170         result += src[i];
0171     }
0172     result.squeeze();
0173     return result;
0174 }
0175
0176 QString KIMAP2::quoteIMAP(const QString &src)
0177 {
0178     uint len = src.length();
0179     QString result;
0180     result.reserve(2 * len);
0181     for (unsigned int i = 0; i < len; i++) {
0182         if (src[i] == QLatin1Char('"') || src[i] == QLatin1Char('\\')) {
0183             result += QLatin1Char('\\');
0184         }
0185         result += src[i];
0186     }
0187     //result.squeeze(); - unnecessary and slow
0188     return result;
0189 }
0190
0191 //-----------------------------------------------------------------------------
0192 QString KIMAP2::encodeImapFolderName(const QString &inSrc)
0193 {
0194     return QString::fromUtf8(encodeImapFolderName(inSrc.toUtf8()).constData());
0195 }
0196
0197 QByteArray KIMAP2::encodeImapFolderName(const QByteArray &inSrc)
0198 {
0199     unsigned int utf8pos, utf8total, c, utf7mode, bitstogo, utf16flag;
0200     unsigned int ucs4, bitbuf;
0201     QByteArray src = inSrc;
0202     QByteArray dst;
0203
0204     int srcPtr = 0;
0205     utf7mode = 0;
0206     utf8total = 0;
0207     bitstogo = 0;
0208     utf8pos = 0;
0209     bitbuf = 0;
0210     ucs4 = 0;
0211     while (srcPtr < src.length()) {
0212         c = (unsigned char)src[srcPtr++];
0213         /* normal character? */
0214         if (c >= ' ' && c <= '~') {
0215             /* switch out of UTF-7 mode */
0216             if (utf7mode) {
0217                 if (bitstogo) {
0218                     dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
0219                     bitstogo = 0;
0220                 }
0221                 dst += '-';
0222                 utf7mode = 0;
0223             }
0224             dst += c;
0225             /* encode '&' as '&-' */
0226             if (c == '&') {
0227                 dst += '-';
0228             }
0229             continue;
0230         }
0231         /* switch to UTF-7 mode */
0232         if (!utf7mode) {
0233             dst += '&';
0234             utf7mode = 1;
0235         }
0236         /* Encode US-ASCII characters as themselves */
0237         if (c < 0x80) {
0238             ucs4 = c;
0239             utf8total = 1;
0240         } else if (utf8total) {
0241             /* save UTF8 bits into UCS4 */
0242             ucs4 = (ucs4 << 6) | (c & 0x3FUL);
0243             if (++utf8pos < utf8total) {
0244                 continue;
0245             }
0246         } else {
0247             utf8pos = 1;
0248             if (c < 0xE0) {
0249                 utf8total = 2;
0250                 ucs4 = c & 0x1F;
0251             } else if (c < 0xF0) {
0252                 utf8total = 3;
0253                 ucs4 = c & 0x0F;
0254             } else {
0255                 /* NOTE: can't convert UTF8 sequences longer than 4 */
0256                 utf8total = 4;
0257                 ucs4 = c & 0x03;
0258             }
0259             continue;
0260         }
0261         /* loop to split ucs4 into two utf16 chars if necessary */
0262         utf8total = 0;
0263         do {
0264             if (ucs4 >= UTF16BASE) {
0265                 ucs4 -= UTF16BASE;
0266                 bitbuf =
0267                     (bitbuf << 16) | ((ucs4 >> UTF16SHIFT) + UTF16HIGHSTART);
0268                 ucs4 = (ucs4 & UTF16MASK) + UTF16LOSTART;
0269                 utf16flag = 1;
0270             } else {
0271                 bitbuf = (bitbuf << 16) | ucs4;
0272                 utf16flag = 0;
0273             }
0274             bitstogo += 16;
0275             /* spew out base64 */
0276             while (bitstogo >= 6) {
0277                 bitstogo -= 6;
0278                 dst +=
0279                     base64chars[(bitstogo ? (bitbuf >> bitstogo) : bitbuf) & 0x3F];
0280             }
0281         } while (utf16flag);
0282     }
0283     /* if in UTF-7 mode, finish in ASCII */
0284     if (utf7mode) {
0285         if (bitstogo) {
0286             dst += base64chars[(bitbuf << (6 - bitstogo)) & 0x3F];
0287         }
0288         dst += '-';
0289     }
0290     return quoteIMAP(dst);
0291 }
0292
0293 //-----------------------------------------------------------------------------
0294 QTextCodec *KIMAP2::codecForName(const QString &str)
0295 {
0296     if (str.isEmpty()) {
0297         return Q_NULLPTR;
0298     }
0299     return QTextCodec::codecForName(str.toLower().
0300                                     replace(QStringLiteral("windows"), QStringLiteral("cp")).toLatin1());
0301 }
0302
0303 //-----------------------------------------------------------------------------
0304 const QString KIMAP2::decodeRFC2047String(const QString &str)
0305 {
0306     QString throw_away;
0307
0308     return decodeRFC2047String(str, throw_away);
0309 }
0310
0311 //-----------------------------------------------------------------------------
0312 const QString KIMAP2::decodeRFC2047String(const QString &str,
0313         QString &charset)
0314 {
0315     QString throw_away;
0316
0317     return decodeRFC2047String(str, charset, throw_away);
0318 }
0319
0320 //-----------------------------------------------------------------------------
0321 const QString KIMAP2::decodeRFC2047String(const QString &str,
0322         QString &charset,
0323         QString &language)
0324 {
0325     //do we have a rfc string
0326     if (!str.contains(QStringLiteral("=?"))) {
0327         return str;
0328     }
0329
0330     // FIXME get rid of the conversion?
0331     QByteArray aStr = str.toLatin1();   // QString.length() means Unicode chars
0332     QByteArray result;
0333     char *pos, *beg, *end, *mid = Q_NULLPTR;
0334     QByteArray cstr;
0335     char encoding = 0, ch;
0336     bool valid;
0337     const int maxLen = 200;
0338     int i;
0339
0340 //  result.truncate(aStr.length());
0341     for (pos = aStr.data(); *pos; pos++) {
0342         if (pos[0] != '=' || pos[1] != '?') {
0343             result += *pos;
0344             continue;
0345         }
0346         beg = pos + 2;
0347         end = beg;
0348         valid = true;
0349         // parse charset name
0350         for (i = 2, pos += 2;
0351                 i < maxLen &&
0352                 (*pos != '?' && (ispunct(*pos) || isalnum(*pos)));
0353                 i++) {
0354             pos++;
0355         }
0356         if (*pos != '?' || i < 4 || i >= maxLen) {
0357             valid = false;
0358         } else {
0359             charset = QLatin1String(QByteArray(beg, i - 1));    // -2 + 1 for the zero
0360             int pt = charset.lastIndexOf(QLatin1Char('*'));
0361             if (pt != -1) {
0362                 // save language for later usage
0363                 language = charset.right(charset.length() - pt - 1);
0364
0365                 // tie off language as defined in rfc2047
0366                 charset.truncate(pt);
0367             }
0368             // get encoding and check delimiting question marks
0369             encoding = toupper(pos[1]);
0370             if (pos[2] != '?' ||
0371                     (encoding != 'Q' && encoding != 'B' &&
0372                      encoding != 'q' && encoding != 'b')) {
0373                 valid = false;
0374             }
0375             pos += 3;
0376             i += 3;
0377 //  qCDebug(KIMAP2_LOG) << "Charset:" << charset << "- Language:" << language << "-'" << pos << "'";
0378         }
0379         if (valid) {
0380             mid = pos;
0381             // search for end of encoded part
0382             while (i < maxLen && *pos && !(*pos == '?' && *(pos + 1) == '=')) {
0383                 i++;
0384                 pos++;
0385             }
0386             end = pos + 2;//end now points to the first char after the encoded string
0387             if (i >= maxLen || !*pos) {
0388                 valid = false;
0389             }
0390         }
0391         if (valid) {
0392             ch = *pos;
0393             *pos = '\0';
0394             cstr = QByteArray(mid).left((int)(mid - pos - 1));
0395             if (encoding == 'Q') {
0396                 // decode quoted printable text
0397                 for (i = cstr.length() - 1; i >= 0; --i) {
0398                     if (cstr[i] == '_') {
0399                         cstr[i] = ' ';
0400                     }
0401                 }
0402 //    qCDebug(KIMAP2_LOG) << "before QP '"
0403 //    << cstr << "'";
0404                 cstr = KCodecs::quotedPrintableDecode(cstr);
0405 //    qCDebug(KIMAP2_LOG) << "after QP '"
0406 //    << cstr << "'";
0407             } else {
0408                 // decode base64 text
0409                 cstr = QByteArray::fromBase64(cstr);
0410             }
0411             *pos = ch;
0412             int len = cstr.length();
0413             for (i = 0; i < len; ++i) {
0414                 result += cstr[i];
0415             }
0416
0417             pos = end - 1;
0418         } else {
0419 //    qCDebug(KIMAP2_LOG) << "invalid";
0420             //result += "=?";
0421             //pos = beg -1; // because pos gets increased shortly afterwards
0422             pos = beg - 2;
0423             result += *pos++;
0424             result += *pos;
0425         }
0426     }
0427     if (!charset.isEmpty()) {
0428         QTextCodec *aCodec = codecForName(QLatin1String(charset.toLatin1()));
0429         if (aCodec) {
0430 //    qCDebug(KIMAP2_LOG) << "Codec is" << aCodec->name();
0431             return aCodec->toUnicode(result);
0432         }
0433     }
0434     return QLatin1String(result);
0435 }
0436
0437 //-----------------------------------------------------------------------------
0438 const QString KIMAP2::encodeRFC2047String(const QString &str)
0439 {
0440     return QLatin1String(encodeRFC2047String(str.toLatin1()));
0441 }
0442
0443 //-----------------------------------------------------------------------------
0444 const QByteArray KIMAP2::encodeRFC2047String(const QByteArray &str)
0445 {
0446     if (str.isEmpty()) {
0447         return str;
0448     }
0449
0450     const signed char *latin =
0451         reinterpret_cast<const signed char *>
0452         (str.data()), *l, *start, *stop;
0453     char hexcode;
0454     int numQuotes, i;
0455     int rptr = 0;
0456     // My stats show this number results in 12 resize() out of 73,000
0457     int resultLen = 3 * str.length() / 2;
0458     QByteArray result(resultLen, '\0');
0459
0460     while (*latin) {
0461         l = latin;
0462         start = latin;
0463         while (*l) {
0464             if (*l == 32) {
0465                 start = l + 1;
0466             }
0467             if (*l < 0) {
0468                 break;
0469             }
0470             l++;
0471         }
0472         if (*l) {
0473             numQuotes = 1;
0474             while (*l) {
0475                 /* The encoded word must be limited to 75 character */
0476                 for (i = 0; i < 16; ++i) {
0477                     if (*l == especials[i]) {
0478                         numQuotes++;
0479                     }
0480                 }
0481                 if (*l < 0) {
0482                     numQuotes++;
0483                 }
0484                 /* Stop after 58 = 75 - 17 characters or at "<user@host..." */
0485                 if (l - start + 2 * numQuotes >= 58 || *l == 60) {
0486                     break;
0487                 }
0488                 l++;
0489             }
0490             if (*l) {
0491                 stop = l - 1;
0492                 while (stop >= start && *stop != 32) {
0493                     stop--;
0494                 }
0495                 if (stop <= start) {
0496                     stop = l;
0497                 }
0498             } else {
0499                 stop = l;
0500             }
0501             if (resultLen - rptr - 1 <= start -  latin + 1 + 16) {
0502                 // =?iso-88...
0503                 resultLen += (start - latin + 1) * 2 + 20;   // more space
0504                 result.resize(resultLen);
0505             }
0506             while (latin < start) {
0507                 result[rptr++] = *latin;
0508                 latin++;
0509             }
0510             result.replace(rptr, 15, "=?iso-8859-1?q?");
0511             rptr += 15;
0512             if (resultLen - rptr - 1 <= 3 * (stop - latin + 1)) {
0513                 resultLen += (stop - latin + 1) * 4 + 20;   // more space
0514                 result.resize(resultLen);
0515             }
0516             while (latin < stop) {
0517                 // can add up to 3 chars/iteration
0518                 numQuotes = 0;
0519                 for (i = 0; i < 16; ++i) {
0520                     if (*latin == especials[i]) {
0521                         numQuotes = 1;
0522                     }
0523                 }
0524                 if (*latin < 0) {
0525                     numQuotes = 1;
0526                 }
0527                 if (numQuotes) {
0528                     result[rptr++] = '=';
0529                     hexcode = ((*latin & 0xF0) >> 4) + 48;
0530                     if (hexcode >= 58) {
0531                         hexcode += 7;
0532                     }
0533                     result[rptr++] = hexcode;
0534                     hexcode = (*latin & 0x0F) + 48;
0535                     if (hexcode >= 58) {
0536                         hexcode += 7;
0537                     }
0538                     result[rptr++] = hexcode;
0539                 } else {
0540                     result[rptr++] = *latin;
0541                 }
0542                 latin++;
0543             }
0544             result[rptr++] = '?';
0545             result[rptr++] = '=';
0546         } else {
0547             while (*latin) {
0548                 if (rptr == resultLen - 1) {
0549                     resultLen += 30;
0550                     result.resize(resultLen);
0551                 }
0552                 result[rptr++] = *latin;
0553                 latin++;
0554             }
0555         }
0556     }
0557     result[rptr] = 0;
0558     return result;
0559 }
0560
0561 //-----------------------------------------------------------------------------
0562 const QString KIMAP2::encodeRFC2231String(const QString &str)
0563 {
0564     if (str.isEmpty()) {
0565         return str;
0566     }
0567
0568     signed char *latin = (signed char *)calloc(1, str.length() + 1);
0569     char *latin_us = (char *)latin;
0570     strcpy(latin_us, str.toLatin1());
0571     signed char *l = latin;
0572     char hexcode;
0573     int i;
0574     bool quote;
0575     while (*l) {
0576         if (*l < 0) {
0577             break;
0578         }
0579         l++;
0580     }
0581     if (!*l) {
0582         free(latin);
0583         return str;
0584     }
0585     QByteArray result;
0586     l = latin;
0587     while (*l) {
0588         quote = *l < 0;
0589         for (i = 0; i < 16; ++i) {
0590             if (*l == especials[i]) {
0591                 quote = true;
0592             }
0593         }
0594         if (quote) {
0595             result += '%';
0596             hexcode = ((*l & 0xF0) >> 4) + 48;
0597             if (hexcode >= 58) {
0598                 hexcode += 7;
0599             }
0600             result += hexcode;
0601             hexcode = (*l & 0x0F) + 48;
0602             if (hexcode >= 58) {
0603                 hexcode += 7;
0604             }
0605             result += hexcode;
0606         } else {
0607             result += *l;
0608         }
0609         l++;
0610     }
0611     free(latin);
0612     return QLatin1String(result);
0613 }
0614
0615 //-----------------------------------------------------------------------------
0616 const QString KIMAP2::decodeRFC2231String(const QString &str)
0617 {
0618     int p = str.indexOf(QLatin1Char('\''));
0619
0620     //see if it is an rfc string
0621     if (p < 0) {
0622         return str;
0623     }
0624
0625     int l = str.lastIndexOf(QLatin1Char('\''));
0626
0627     //second is language
0628     if (p >= l) {
0629         return str;
0630     }
0631
0632     //first is charset or empty
0633     //QString charset = str.left ( p );
0634     QString st = str.mid(l + 1);
0635     //QString language = str.mid ( p + 1, l - p - 1 );
0636
0637     //qCDebug(KIMAP2_LOG) << "Charset:" << charset << "Language:" << language;
0638
0639     char ch, ch2;
0640     p = 0;
0641     while (p < (int) st.length()) {
0642         if (st.at(p) == 37) {
0643             ch = st.at(p + 1).toLatin1() - 48;
0644             if (ch > 16) {
0645                 ch -= 7;
0646             }
0647             ch2 = st.at(p + 2).toLatin1() - 48;
0648             if (ch2 > 16) {
0649                 ch2 -= 7;
0650             }
0651             st.replace(p, 1, ch * 16 + ch2);
0652             st.remove(p + 1, 2);
0653         }
0654         p++;
0655     }
0656     return st;
0657 }