// File indexing completed on 2023-09-24 09:25:00
0001 /* -*- c++ -*- 0002 kmime_charfreq.h 0003 0004 KMime, the KDE Internet mail/usenet news message library. 0005 SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org> 0006 0007 SPDX-License-Identifier: LGPL-2.0-or-later 0008 */ 0009 /** 0010 @file 0011 This file is part of the API for handling @ref MIME data and 0012 defines the CharFreq class. 0013 0014 @brief 0015 Defines the CharFreq class. 0016 0017 @authors Marc Mutz \<mutz@kde.org\> 0018 0019 @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit: 0020 Data that contains bytes with at least one value greater than 127, or at 0021 least one NUL byte. 0022 0023 @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary: 0024 Eight-bit data that contains a high percentage of non-ascii values, 0025 or lines longer than 998 characters, or stray CRs, or NULs. 0026 0027 @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text: 0028 Eight-bit data that contains a high percentage of ascii values, 0029 no lines longer than 998 characters, no NULs, and either only LFs or 0030 only CRLFs. 0031 0032 @glossary @anchor Seven-Bit @anchor seven-bit @b 7-Bit: 0033 Data that contains bytes with all values less than 128, and no NULs. 0034 0035 @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary: 0036 Seven-bit data that contains a high percentage of non-ascii values, 0037 or lines longer than 998 characters, or stray CRs. 0038 0039 @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text: 0040 Seven-bit data that contains a high percentage of ascii values, 0041 no lines longer than 998 characters, and either only LFs, or only CRLFs. 0042 */ 0043 0044 #pragma once 0045 0046 #include <QByteArray> 0047 #undef None 0048 0049 namespace KMime 0050 { 0051 0052 /** 0053 @brief 0054 A class for performing basic data typing using frequency count heuristics. 
0055 0056 This class performs character frequency counts on the provided data which 0057 are used in heuristics to determine a basic data type. The data types are: 0058 0059 - @ref Eight-Bit-Binary 0060 - @ref Eight-Bit-Text 0061 - @ref Seven-Bit-Binary 0062 - @ref Seven-Bit-Text 0063 */ 0064 class CharFreq 0065 { 0066 public: 0067 /** 0068 Constructs a Character Frequency instance for a buffer @p buf of 0069 QByteArray data. 0070 0071 @param buf is a QByteArray containing the data. 0072 */ 0073 explicit CharFreq(const QByteArray &buf); 0074 0075 /** 0076 Constructs a Character Frequency instance for a buffer @p buf of 0077 chars of length @p len. 0078 0079 @param buf is a pointer to a character string containing the data. 0080 @param len is the length of @p buf, in characters. 0081 */ 0082 CharFreq(const char *buf, size_t len); 0083 0084 /** 0085 The different types of data. 0086 */ 0087 enum Type { 0088 None = 0, /**< Unknown */ 0089 EightBitData, /**< 8bit binary */ 0090 Binary = EightBitData, /**< 8bit binary */ 0091 SevenBitData, /**< 7bit binary */ 0092 EightBitText, /**< 8bit text */ 0093 SevenBitText /**< 7bit text */ 0094 }; 0095 0096 /** 0097 Returns the data #Type as derived from the class heuristics. 0098 */ 0099 Q_REQUIRED_RESULT Type type() const; 0100 0101 /** 0102 Returns true if the data #Type is EightBitData; false otherwise. 0103 */ 0104 Q_REQUIRED_RESULT bool isEightBitData() const; 0105 0106 /** 0107 Returns true if the data #Type is EightBitText; false otherwise. 0108 */ 0109 Q_REQUIRED_RESULT bool isEightBitText() const; 0110 0111 /** 0112 Returns true if the data #Type is SevenBitData; false otherwise. 0113 */ 0114 Q_REQUIRED_RESULT bool isSevenBitData() const; 0115 0116 /** 0117 Returns true if the data #Type is SevenBitText; false otherwise. 0118 */ 0119 Q_REQUIRED_RESULT bool isSevenBitText() const; 0120 0121 /** 0122 Returns true if the data contains trailing whitespace. i.e., 0123 if any line ends with space (' ') or tab ('\\t'). 
0124 */ 0125 Q_REQUIRED_RESULT bool hasTrailingWhitespace() const; 0126 0127 /** 0128 Returns true if the data contains a line that starts with "From ". 0129 */ 0130 Q_REQUIRED_RESULT bool hasLeadingFrom() const; 0131 0132 /** 0133 Returns the percentage of printable characters in the data. 0134 The result is undefined if the number of data characters is zero. 0135 */ 0136 Q_REQUIRED_RESULT float printableRatio() const; 0137 0138 /** 0139 Returns the percentage of control code characters (CTLs) in the data. 0140 The result is undefined if the number of data characters is zero. 0141 */ 0142 Q_REQUIRED_RESULT float controlCodesRatio() const; 0143 0144 private: 0145 //@cond PRIVATE 0146 uint mNUL; // count of NUL chars 0147 uint mCTL; // count of CTLs (incl. DEL, excl. CR, LF, HT) 0148 uint mCR; // count of CR chars 0149 uint mLF; // count of LF chars 0150 uint mCRLF; // count of LFs, preceded by CRs 0151 uint mPrintable; // count of printable US-ASCII chars (SPC..~) 0152 uint mEightBit; // count of other latin1 chars (those with 8th bit set) 0153 uint mTotal; // count of all chars 0154 uint mLineMin; // minimum line length 0155 uint mLineMax; // maximum line length 0156 bool mTrailingWS = false; // does the buffer contain trailing whitespace? 0157 bool mLeadingFrom = false; // does the buffer contain lines starting with "From "? 0158 //@endcond 0159 0160 /** 0161 Performs the character frequency counts on the data. 0162 0163 @param buf is a pointer to a character string containing the data. 0164 @param len is the length of @p buf, in characters. 0165 */ 0166 void count(const char *buf, size_t len); 0167 }; 0168 0169 } // namespace KMime 0170