File indexing completed on 2024-04-21 05:18:10

0001 /*  -*- c++ -*-
0002     kmime_charfreq.h
0003 
0004     KMime, the KDE Internet mail/usenet news message library.
0005     SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
0006 
0007     SPDX-License-Identifier: LGPL-2.0-or-later
0008 */
0009 /**
0010   @file
0011   This file is part of the API for handling @ref MIME data and
0012   defines the CharFreq class.
0013 
0014   @brief
0015   Defines the CharFreq class.
0016 
0017   @authors Marc Mutz \<mutz@kde.org\>
0018 
0019   @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit:
0020   Data that contains bytes with at least one value greater than 127, or at
0021   least one NUL byte.
0022 
0023   @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary:
0024   Eight-bit data that contains a high percentage of non-ascii values,
0025   or lines longer than 998 characters, or stray CRs, or NULs.
0026 
0027   @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text:
0028   Eight-bit data that contains a high percentage of ascii values,
0029   no lines longer than 998 characters, no NULs, and either only LFs or
0030   only CRLFs.
0031 
0032   @glossary @anchor Seven-Bit @anchor seven-bit @b 7-Bit:
0033   Data that contains bytes with all values less than 128, and no NULs.
0034 
0035   @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary:
  Seven-bit data that contains a high percentage of control characters
  (CTLs), or lines longer than 998 characters, or stray CRs.
0038 
0039   @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text:
  Seven-bit data that contains a high percentage of printable characters,
  no lines longer than 998 characters, and either only LFs, or only CRLFs.
0042 */
0043 
0044 #pragma once
0045 
0046 #include <QByteArray>
0047 #undef None
0048 
0049 #include <limits>
0050 
0051 namespace KMime
0052 {
0053 
0054 /**
0055   @brief
0056   A class for performing basic data typing using frequency count heuristics.
0057 
0058   This class performs character frequency counts on the provided data which
0059   are used in heuristics to determine a basic data type.  The data types are:
0060 
0061   - @ref Eight-Bit-Binary
0062   - @ref Eight-Bit-Text
0063   - @ref Seven-Bit-Binary
0064   - @ref Seven-Bit-Text
0065 */
0066 class CharFreq
0067 {
0068 public:
0069     /**
0070       Constructs a Character Frequency instance for a buffer @p buf of
0071       QByteArray data.
0072 
0073       @param buf is a QByteArray containing the data.
0074     */
0075     explicit CharFreq(QByteArrayView buf);
0076 
0077     /**
0078       The different types of data.
0079     */
0080     enum Type {
0081         None = 0,              /**< Unknown */
0082         EightBitData,          /**< 8bit binary */
0083         Binary = EightBitData, /**< 8bit binary */
0084         SevenBitData,          /**< 7bit binary */
0085         EightBitText,          /**< 8bit text */
0086         SevenBitText           /**< 7bit text */
0087     };
0088 
0089     /**
0090       Returns the data #Type as derived from the class heuristics.
0091     */
0092     [[nodiscard]] Type type() const;
0093 
0094     /**
0095       Returns true if the data #Type is EightBitData; false otherwise.
0096     */
0097     [[nodiscard]] bool isEightBitData() const;
0098 
0099     /**
0100       Returns true if the data #Type is EightBitText; false otherwise.
0101     */
0102     [[nodiscard]] bool isEightBitText() const;
0103 
0104     /**
0105       Returns true if the data #Type is SevenBitData; false otherwise.
0106     */
0107     [[nodiscard]] bool isSevenBitData() const;
0108 
0109     /**
0110       Returns true if the data #Type is SevenBitText; false otherwise.
0111     */
0112     [[nodiscard]] bool isSevenBitText() const;
0113 
0114     /**
0115       Returns true if the data contains trailing whitespace. i.e.,
0116       if any line ends with space (' ') or tab ('\\t').
0117     */
0118     [[nodiscard]] bool hasTrailingWhitespace() const;
0119 
0120     /**
0121       Returns true if the data contains a line that starts with "From ".
0122     */
0123     [[nodiscard]] bool hasLeadingFrom() const;
0124 
0125     /**
0126       Returns the percentage of printable characters in the data.
0127       The result is undefined if the number of data characters is zero.
0128     */
0129     [[nodiscard]] float printableRatio() const;
0130 
0131     /**
0132       Returns the percentage of control code characters (CTLs) in the data.
0133       The result is undefined if the number of data characters is zero.
0134     */
0135     [[nodiscard]] float controlCodesRatio() const;
0136 
0137   private:
0138     uint mNUL = 0;       // count of NUL chars
0139     uint mCTL = 0;       // count of CTLs (incl. DEL, excl. CR, LF, HT)
0140     uint mCR = 0;        // count of CR chars
0141     uint mLF = 0;        // count of LF chars
0142     uint mCRLF = 0;      // count of LFs, preceded by CRs
0143     uint mPrintable = 0; // count of printable US-ASCII chars (SPC..~)
0144     uint mEightBit = 0;  // count of other latin1 chars (those with 8th bit set)
0145     uint mTotal = 0;     // count of all chars
0146     uint mLineMin = std::numeric_limits<uint>::max(); // minimum line length
0147     uint mLineMax = 0;   // maximum line length
0148     bool mTrailingWS = false;  // does the buffer contain trailing whitespace?
0149     bool mLeadingFrom = false; // does the buffer contain lines starting with "From "?
0150 
0151     /**
0152       Performs the character frequency counts on the data.
0153 
0154       @param buf is a pointer to a character string containing the data.
0155       @param len is the length of @p buf, in characters.
0156     */
0157     void count(const char *buf, size_t len);
0158 };
0159 
0160 } // namespace KMime
0161