File indexing completed on 2024-05-05 13:52:23
0001 /* -*- c++ -*- 0002 kmime_charfreq.h 0003 0004 KMime, the KDE Internet mail/usenet news message library. 0005 SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org> 0006 0007 SPDX-License-Identifier: LGPL-2.0-or-later 0008 */ 0009 /** 0010 @file 0011 This file is part of the API for handling @ref MIME data and 0012 defines the CharFreq class. 0013 0014 @brief 0015 Defines the CharFreq class. 0016 0017 @authors Marc Mutz \<mutz@kde.org\> 0018 0019 @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit: 0020 Data that contains bytes with at least one value greater than 127, or at 0021 least one NUL byte. 0022 0023 @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary: 0024 Eight-bit data that contains a high percentage of non-ascii values, 0025 or lines longer than 998 characters, or stray CRs, or NULs. 0026 0027 @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text: 0028 Eight-bit data that contains a high percentage of ascii values, 0029 no lines longer than 998 characters, no NULs, and either only LFs or 0030 only CRLFs. 0031 0032 @glossary @anchor Seven-Bit @anchor seven-bit @b 7-Bit: 0033 Data that contains bytes with all values less than 128, and no NULs. 0034 0035 @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary: 0036 Seven-bit data that contains a high percentage of non-ascii values, 0037 or lines longer than 998 characters, or stray CRs. 0038 0039 @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text: 0040 Seven-bit data that contains a high percentage of ascii values, 0041 no lines longer than 998 characters, and either only LFs, or only CRLFs. 0042 */ 0043 0044 #pragma once 0045 0046 #include <QByteArray> 0047 #undef None 0048 0049 #include <limits> 0050 0051 namespace KMime 0052 { 0053 0054 /** 0055 @brief 0056 A class for performing basic data typing using frequency count heuristics. 0057 0058 This class performs character frequency counts on the provided data which 0059 are used in heuristics to determine a basic data type. The data types are: 0060 0061 - @ref Eight-Bit-Binary 0062 - @ref Eight-Bit-Text 0063 - @ref Seven-Bit-Binary 0064 - @ref Seven-Bit-Text 0065 */ 0066 class CharFreq 0067 { 0068 public: 0069 /** 0070 Constructs a Character Frequency instance for a buffer @p buf of 0071 QByteArray data. 0072 0073 @param buf is a QByteArray containing the data. 0074 */ 0075 explicit CharFreq(QByteArrayView buf); 0076 0077 /** 0078 The different types of data. 0079 */ 0080 enum Type { 0081 None = 0, /**< Unknown */ 0082 EightBitData, /**< 8bit binary */ 0083 Binary = EightBitData, /**< 8bit binary */ 0084 SevenBitData, /**< 7bit binary */ 0085 EightBitText, /**< 8bit text */ 0086 SevenBitText /**< 7bit text */ 0087 }; 0088 0089 /** 0090 Returns the data #Type as derived from the class heuristics. 0091 */ 0092 [[nodiscard]] Type type() const; 0093 0094 /** 0095 Returns true if the data #Type is EightBitData; false otherwise. 0096 */ 0097 [[nodiscard]] bool isEightBitData() const; 0098 0099 /** 0100 Returns true if the data #Type is EightBitText; false otherwise. 0101 */ 0102 [[nodiscard]] bool isEightBitText() const; 0103 0104 /** 0105 Returns true if the data #Type is SevenBitData; false otherwise. 0106 */ 0107 [[nodiscard]] bool isSevenBitData() const; 0108 0109 /** 0110 Returns true if the data #Type is SevenBitText; false otherwise. 0111 */ 0112 [[nodiscard]] bool isSevenBitText() const; 0113 0114 /** 0115 Returns true if the data contains trailing whitespace. i.e., 0116 if any line ends with space (' ') or tab ('\\t'). 0117 */ 0118 [[nodiscard]] bool hasTrailingWhitespace() const; 0119 0120 /** 0121 Returns true if the data contains a line that starts with "From ". 0122 */ 0123 [[nodiscard]] bool hasLeadingFrom() const; 0124 0125 /** 0126 Returns the percentage of printable characters in the data. 0127 The result is undefined if the number of data characters is zero. 0128 */ 0129 [[nodiscard]] float printableRatio() const; 0130 0131 /** 0132 Returns the percentage of control code characters (CTLs) in the data. 0133 The result is undefined if the number of data characters is zero. 0134 */ 0135 [[nodiscard]] float controlCodesRatio() const; 0136 0137 private: 0138 uint mNUL = 0; // count of NUL chars 0139 uint mCTL = 0; // count of CTLs (incl. DEL, excl. CR, LF, HT) 0140 uint mCR = 0; // count of CR chars 0141 uint mLF = 0; // count of LF chars 0142 uint mCRLF = 0; // count of LFs, preceded by CRs 0143 uint mPrintable = 0; // count of printable US-ASCII chars (SPC..~) 0144 uint mEightBit = 0; // count of other latin1 chars (those with 8th bit set) 0145 uint mTotal = 0; // count of all chars 0146 uint mLineMin = std::numeric_limits<uint>::max(); // minimum line length 0147 uint mLineMax = 0; // maximum line length 0148 bool mTrailingWS = false; // does the buffer contain trailing whitespace? 0149 bool mLeadingFrom = false; // does the buffer contain lines starting with "From "? 0150 0151 /** 0152 Performs the character frequency counts on the data. 0153 0154 @param buf is a pointer to a character string containing the data. 0155 @param len is the length of @p buf, in characters. 0156 */ 0157 void count(const char *buf, size_t len); 0158 }; 0159 0160 } // namespace KMime 0161