File indexing completed on 2022-11-23 12:03:43

0001 /*
0002   kmime_charfreq.cpp
0003 
0004   KMime, the KDE Internet mail/usenet news message library.
0005   SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
0006 
0007   SPDX-License-Identifier: LGPL-2.0-or-later
0008 */
0009 
0010 /**
0011   @file
0012   This file is part of the API for handling MIME data and
0013   defines the CharFreq class.
0014 
0015   @brief
0016   Defines the CharFreq class.
0017 
0018   @authors Marc Mutz \<mutz@kde.org\>
0019 */
0020 
0021 #include "kmime_charfreq.h"
0022 #include "kmime_debug.h"
0023 
0024 using namespace KMime;
0025 
0026 /**
0027  * Private class that helps to provide binary compatibility between releases.
0028  * @internal
0029  */
0030 //@cond PRIVATE
0031 //class KMime::CharFreq::Private
0032 //{
0033 //  public:
0034 //};
0035 //@endcond
0036 
0037 CharFreq::CharFreq(const QByteArray &buf)
0038     : mNUL(0),
0039       mCTL(0),
0040       mCR(0), mLF(0),
0041       mCRLF(0),
0042       mPrintable(0),
0043       mEightBit(0),
0044       mTotal(0),
0045       mLineMin(0xffffffff),
0046       mLineMax(0)
0047 {
0048     if (!buf.isEmpty()) {
0049         count(buf.data(), buf.size());
0050     }
0051 }
0052 
0053 CharFreq::CharFreq(const char *buf, size_t len)
0054     : mNUL(0),
0055       mCTL(0),
0056       mCR(0), mLF(0),
0057       mCRLF(0),
0058       mPrintable(0),
0059       mEightBit(0),
0060       mTotal(0),
0061       mLineMin(0xffffffff),
0062       mLineMax(0)
0063 {
0064     if (buf && len > 0) {
0065         count(buf, len);
0066     }
0067 }
0068 
0069 //@cond PRIVATE
// Returns true when ch is a horizontal tab or a space, false otherwise.
static inline bool isWS(char ch)
{
    switch (ch) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
}
0074 //@endcond
0075 
0076 void CharFreq::count(const char *it, size_t len)
0077 {
0078     const char *end = it + len;
0079     uint currentLineLength = 0;
0080     // initialize the prevChar with LF so that From_ detection works w/o
0081     // special-casing:
0082     char prevChar = '\n';
0083     char prevPrevChar = 0;
0084 
0085     for (; it != end ; ++it) {
0086         ++currentLineLength;
0087         switch (*it) {
0088         case '\0': ++mNUL; break;
0089         case '\r': ++mCR;  break;
0090         case '\n': ++mLF;
0091             if (prevChar == '\r') {
0092                 --currentLineLength; ++mCRLF;
0093             }
0094             if (currentLineLength >= mLineMax) {
0095                 mLineMax = currentLineLength - 1;
0096             }
0097             if (currentLineLength <= mLineMin) {
0098                 mLineMin = currentLineLength - 1;
0099             }
0100             if (!mTrailingWS) {
0101                 if (isWS(prevChar) ||
0102                         (prevChar == '\r' && isWS(prevPrevChar))) {
0103                     mTrailingWS = true;
0104                 }
0105             }
0106             currentLineLength = 0;
0107             break;
0108         case 'F': // check for lines starting with From_ if not found already:
0109             if (!mLeadingFrom) {
0110                 if (prevChar == '\n' && end - it >= 5 &&
0111                         !qstrncmp("From ", it, 5)) {
0112                     mLeadingFrom = true;
0113                 }
0114             }
0115             ++mPrintable;
0116             break;
0117         default: {
0118             uchar c = *it;
0119             if (c == '\t' || (c >= ' ' && c <= '~')) {
0120                 ++mPrintable;
0121             } else if (c == 127 || c < ' ') {
0122                 ++mCTL;
0123             } else {
0124                 ++mEightBit;
0125             }
0126         }
0127         }
0128         prevPrevChar = prevChar;
0129         prevChar = *it;
0130     }
0131 
0132     // consider the length of the last line
0133     if (currentLineLength >= mLineMax) {
0134         mLineMax = currentLineLength;
0135     }
0136     if (currentLineLength <= mLineMin) {
0137         mLineMin = currentLineLength;
0138     }
0139 
0140     // check whether the last character is tab or space
0141     if (isWS(prevChar)) {
0142         mTrailingWS = true;
0143     }
0144 
0145     mTotal = len;
0146 }
0147 
0148 bool CharFreq::isEightBitData() const
0149 {
0150     return type() == EightBitData;
0151 }
0152 
0153 bool CharFreq::isEightBitText() const
0154 {
0155     return type() == EightBitText;
0156 }
0157 
0158 bool CharFreq::isSevenBitData() const
0159 {
0160     return type() == SevenBitData;
0161 }
0162 
0163 bool CharFreq::isSevenBitText() const
0164 {
0165     return type() == SevenBitText;
0166 }
0167 
0168 bool CharFreq::hasTrailingWhitespace() const
0169 {
0170     return mTrailingWS;
0171 }
0172 
0173 bool CharFreq::hasLeadingFrom() const
0174 {
0175     return mLeadingFrom;
0176 }
0177 
0178 CharFreq::Type CharFreq::type() const
0179 {
0180 #if 0
0181     qCDebug(KMIME_LOG)("Total: %d; NUL: %d; CTL: %d;\n"
0182            "CR: %d; LF: %d; CRLF: %d;\n"
0183            "lineMin: %d; lineMax: %d;\n"
0184            "printable: %d; eightBit: %d;\n"
0185            "trailing whitespace: %s;\n"
0186            "leading 'From ': %s;\n",
0187            total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax,
0188            printable, eightBit,
0189            mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no");
0190 #endif
0191     if (mNUL) {   // must be binary
0192         return Binary;
0193     }
0194 
0195     // doesn't contain NUL's:
0196     if (mEightBit) {
0197         if (mLineMax > 988) {
0198             return EightBitData; // not allowed in 8bit
0199         }
0200         if ((mLF != mCRLF && mCRLF > 0) || mCR != mCRLF || controlCodesRatio() > 0.2) {
0201             return EightBitData;
0202         }
0203         return EightBitText;
0204     }
0205 
0206     // doesn't contain NUL's, nor 8bit chars:
0207     if (mLineMax > 988) {
0208         return SevenBitData;
0209     }
0210     if ((mLF != mCRLF && mCRLF > 0) || mCR != mCRLF || controlCodesRatio() > 0.2) {
0211         return SevenBitData;
0212     }
0213 
0214     // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars:
0215     return SevenBitText;
0216 }
0217 
0218 float CharFreq::printableRatio() const
0219 {
0220     if (mTotal) {
0221         return float(mPrintable) / float(mTotal);
0222     } else {
0223         return 0;
0224     }
0225 }
0226 
0227 float CharFreq::controlCodesRatio() const
0228 {
0229     if (mTotal) {
0230         return float(mCTL) / float(mTotal);
0231     } else {
0232         return 0;
0233     }
0234 }
0235