File indexing completed on 2024-03-24 05:21:16
0001 /* 0002 kmime_charfreq.cpp 0003 0004 KMime, the KDE Internet mail/usenet news message library. 0005 SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org> 0006 0007 SPDX-License-Identifier: LGPL-2.0-or-later 0008 */ 0009 0010 /** 0011 @file 0012 This file is part of the API for handling MIME data and 0013 defines the CharFreq class. 0014 0015 @brief 0016 Defines the CharFreq class. 0017 0018 @authors Marc Mutz \<mutz@kde.org\> 0019 */ 0020 0021 #include "kmime_charfreq_p.h" 0022 #include "kmime_debug.h" 0023 0024 using namespace KMime; 0025 0026 CharFreq::CharFreq(QByteArrayView buf) 0027 : mNUL(0), 0028 mCTL(0), 0029 mCR(0), mLF(0), 0030 mCRLF(0), 0031 mPrintable(0), 0032 mEightBit(0), 0033 mTotal(0), 0034 mLineMin(0xffffffff), 0035 mLineMax(0) 0036 { 0037 if (!buf.isEmpty()) { 0038 count(buf.data(), buf.size()); 0039 } 0040 } 0041 0042 static inline bool isWS(char ch) 0043 { 0044 return (ch == '\t' || ch == ' '); 0045 } 0046 0047 void CharFreq::count(const char *it, size_t len) 0048 { 0049 const char *end = it + len; 0050 uint currentLineLength = 0; 0051 // initialize the prevChar with LF so that From_ detection works w/o 0052 // special-casing: 0053 char prevChar = '\n'; 0054 char prevPrevChar = 0; 0055 0056 for (; it != end ; ++it) { 0057 ++currentLineLength; 0058 switch (*it) { 0059 case '\0': ++mNUL; break; 0060 case '\r': ++mCR; break; 0061 case '\n': ++mLF; 0062 if (prevChar == '\r') { 0063 --currentLineLength; ++mCRLF; 0064 } 0065 if (currentLineLength >= mLineMax) { 0066 mLineMax = currentLineLength - 1; 0067 } 0068 if (currentLineLength <= mLineMin) { 0069 mLineMin = currentLineLength - 1; 0070 } 0071 if (!mTrailingWS) { 0072 if (isWS(prevChar) || 0073 (prevChar == '\r' && isWS(prevPrevChar))) { 0074 mTrailingWS = true; 0075 } 0076 } 0077 currentLineLength = 0; 0078 break; 0079 case 'F': // check for lines starting with From_ if not found already: 0080 if (!mLeadingFrom) { 0081 if (prevChar == '\n' && end - it >= 5 && 0082 !qstrncmp("From ", it, 5)) { 0083 mLeadingFrom = true; 0084 } 0085 } 0086 ++mPrintable; 0087 break; 0088 default: { 0089 uchar c = *it; 0090 if (c == '\t' || (c >= ' ' && c <= '~')) { 0091 ++mPrintable; 0092 } else if (c == 127 || c < ' ') { 0093 ++mCTL; 0094 } else { 0095 ++mEightBit; 0096 } 0097 } 0098 } 0099 prevPrevChar = prevChar; 0100 prevChar = *it; 0101 } 0102 0103 // consider the length of the last line 0104 if (currentLineLength >= mLineMax) { 0105 mLineMax = currentLineLength; 0106 } 0107 if (currentLineLength <= mLineMin) { 0108 mLineMin = currentLineLength; 0109 } 0110 0111 // check whether the last character is tab or space 0112 if (isWS(prevChar)) { 0113 mTrailingWS = true; 0114 } 0115 0116 mTotal = len; 0117 } 0118 0119 bool CharFreq::isEightBitData() const 0120 { 0121 return type() == EightBitData; 0122 } 0123 0124 bool CharFreq::isEightBitText() const 0125 { 0126 return type() == EightBitText; 0127 } 0128 0129 bool CharFreq::isSevenBitData() const 0130 { 0131 return type() == SevenBitData; 0132 } 0133 0134 bool CharFreq::isSevenBitText() const 0135 { 0136 return type() == SevenBitText; 0137 } 0138 0139 bool CharFreq::hasTrailingWhitespace() const 0140 { 0141 return mTrailingWS; 0142 } 0143 0144 bool CharFreq::hasLeadingFrom() const 0145 { 0146 return mLeadingFrom; 0147 } 0148 0149 CharFreq::Type CharFreq::type() const 0150 { 0151 #if 0 0152 qCDebug(KMIME_LOG)("Total: %d; NUL: %d; CTL: %d;\n" 0153 "CR: %d; LF: %d; CRLF: %d;\n" 0154 "lineMin: %d; lineMax: %d;\n" 0155 "printable: %d; eightBit: %d;\n" 0156 "trailing whitespace: %s;\n" 0157 "leading 'From ': %s;\n", 0158 total, NUL, CTL, CR, LF, CRLF, lineMin, lineMax, 0159 printable, eightBit, 0160 mTrailingWS ? "yes" : "no" , mLeadingFrom ? "yes" : "no"); 0161 #endif 0162 if (mNUL) { // must be binary 0163 return Binary; 0164 } 0165 0166 // doesn't contain NUL's: 0167 if (mEightBit) { 0168 if (mLineMax > 988) { 0169 return EightBitData; // not allowed in 8bit 0170 } 0171 if ((mLF != mCRLF && mCRLF > 0) || mCR != mCRLF || controlCodesRatio() > 0.2) { 0172 return EightBitData; 0173 } 0174 return EightBitText; 0175 } 0176 0177 // doesn't contain NUL's, nor 8bit chars: 0178 if (mLineMax > 988) { 0179 return SevenBitData; 0180 } 0181 if ((mLF != mCRLF && mCRLF > 0) || mCR != mCRLF || controlCodesRatio() > 0.2) { 0182 return SevenBitData; 0183 } 0184 0185 // no NUL, no 8bit chars, no excessive CTLs and no lines > 998 chars: 0186 return SevenBitText; 0187 } 0188 0189 float CharFreq::printableRatio() const 0190 { 0191 if (mTotal) { 0192 return float(mPrintable) / float(mTotal); 0193 } else { 0194 return 0; 0195 } 0196 } 0197 0198 float CharFreq::controlCodesRatio() const 0199 { 0200 if (mTotal) { 0201 return float(mCTL) / float(mTotal); 0202 } else { 0203 return 0; 0204 } 0205 } 0206