File indexing completed on 2024-04-21 16:06:14

0001 /*
0002   kmime_charfreq.cpp
0003 
0004   KMime, the KDE Internet mail/usenet news message library.
0005   SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
0006 
0007   SPDX-License-Identifier: LGPL-2.0-or-later
0008 */
0009 
0010 /**
0011   @file
0012   This file is part of the API for handling MIME data and
0013   defines the CharFreq class.
0014 
0015   @brief
0016   Defines the CharFreq class.
0017 
0018   @authors Marc Mutz \<mutz@kde.org\>
0019 */
0020 
0021 #include "kmime_charfreq_p.h"
0022 #include "kmime_debug.h"
0023 
0024 using namespace KMime;
0025 
0026 CharFreq::CharFreq(QByteArrayView buf)
0027     : mNUL(0),
0028       mCTL(0),
0029       mCR(0), mLF(0),
0030       mCRLF(0),
0031       mPrintable(0),
0032       mEightBit(0),
0033       mTotal(0),
0034       mLineMin(0xffffffff),
0035       mLineMax(0)
0036 {
0037     if (!buf.isEmpty()) {
0038         count(buf.data(), buf.size());
0039     }
0040 }
0041 
/**
  Returns true if @p ch is a horizontal tab or a space, false for every
  other character.
*/
static inline bool isWS(char ch)
{
    switch (ch) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
}
0046 
0047 void CharFreq::count(const char *it, size_t len)
0048 {
0049     const char *end = it + len;
0050     uint currentLineLength = 0;
0051     // initialize the prevChar with LF so that From_ detection works w/o
0052     // special-casing:
0053     char prevChar = '\n';
0054     char prevPrevChar = 0;
0055 
0056     for (; it != end ; ++it) {
0057         ++currentLineLength;
0058         switch (*it) {
0059         case '\0': ++mNUL; break;
0060         case '\r': ++mCR;  break;
0061         case '\n': ++mLF;
0062             if (prevChar == '\r') {
0063                 --currentLineLength; ++mCRLF;
0064             }
0065             if (currentLineLength >= mLineMax) {
0066                 mLineMax = currentLineLength - 1;
0067             }
0068             if (currentLineLength <= mLineMin) {
0069                 mLineMin = currentLineLength - 1;
0070             }
0071             if (!mTrailingWS) {
0072                 if (isWS(prevChar) ||
0073                         (prevChar == '\r' && isWS(prevPrevChar))) {
0074                     mTrailingWS = true;
0075                 }
0076             }
0077             currentLineLength = 0;
0078             break;
0079         case 'F': // check for lines starting with From_ if not found already:
0080             if (!mLeadingFrom) {
0081                 if (prevChar == '\n' && end - it >= 5 &&
0082                         !qstrncmp("From ", it, 5)) {
0083                     mLeadingFrom = true;
0084                 }
0085             }
0086             ++mPrintable;
0087             break;
0088         default: {
0089             uchar c = *it;
0090             if (c == '\t' || (c >= ' ' && c <= '~')) {
0091                 ++mPrintable;
0092             } else if (c == 127 || c < ' ') {
0093                 ++mCTL;
0094             } else {
0095                 ++mEightBit;
0096             }
0097         }
0098         }
0099         prevPrevChar = prevChar;
0100         prevChar = *it;
0101     }
0102 
0103     // consider the length of the last line
0104     if (currentLineLength >= mLineMax) {
0105         mLineMax = currentLineLength;
0106     }
0107     if (currentLineLength <= mLineMin) {
0108         mLineMin = currentLineLength;
0109     }
0110 
0111     // check whether the last character is tab or space
0112     if (isWS(prevChar)) {
0113         mTrailingWS = true;
0114     }
0115 
0116     mTotal = len;
0117 }
0118 
0119 bool CharFreq::isEightBitData() const
0120 {
0121     return type() == EightBitData;
0122 }
0123 
0124 bool CharFreq::isEightBitText() const
0125 {
0126     return type() == EightBitText;
0127 }
0128 
0129 bool CharFreq::isSevenBitData() const
0130 {
0131     return type() == SevenBitData;
0132 }
0133 
0134 bool CharFreq::isSevenBitText() const
0135 {
0136     return type() == SevenBitText;
0137 }
0138 
0139 bool CharFreq::hasTrailingWhitespace() const
0140 {
0141     return mTrailingWS;
0142 }
0143 
0144 bool CharFreq::hasLeadingFrom() const
0145 {
0146     return mLeadingFrom;
0147 }
0148 
0149 CharFreq::Type CharFreq::type() const
0150 {
0151 #if 0
0152     qCDebug(KMIME_LOG)("Total: %d; NUL: %d; CTL: %d;\n"
0153            "CR: %d; LF: %d; CRLF: %d;\n"
0154            "lineMin: %d; lineMax: %d;\n"
0155            "printable: %d; eightBit: %d;\n"
0156            "trailing whitespace: %s;\n"
0157            "leading 'From ': %s;\n",
0158            total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax,
0159            printable, eightBit,
0160            mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no");
0161 #endif
0162     if (mNUL) {   // must be binary
0163         return Binary;
0164     }
0165 
0166     // doesn't contain NUL's:
0167     if (mEightBit) {
0168         if (mLineMax > 988) {
0169             return EightBitData; // not allowed in 8bit
0170         }
0171         if ((mLF != mCRLF && mCRLF > 0) || mCR != mCRLF || controlCodesRatio() > 0.2) {
0172             return EightBitData;
0173         }
0174         return EightBitText;
0175     }
0176 
0177     // doesn't contain NUL's, nor 8bit chars:
0178     if (mLineMax > 988) {
0179         return SevenBitData;
0180     }
0181     if ((mLF != mCRLF && mCRLF > 0) || mCR != mCRLF || controlCodesRatio() > 0.2) {
0182         return SevenBitData;
0183     }
0184 
0185     // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars:
0186     return SevenBitText;
0187 }
0188 
0189 float CharFreq::printableRatio() const
0190 {
0191     if (mTotal) {
0192         return float(mPrintable) / float(mTotal);
0193     } else {
0194         return 0;
0195     }
0196 }
0197 
0198 float CharFreq::controlCodesRatio() const
0199 {
0200     if (mTotal) {
0201         return float(mCTL) / float(mTotal);
0202     } else {
0203         return 0;
0204     }
0205 }
0206