File indexing completed on 2023-09-24 09:25:00

0001 /*  -*- c++ -*-
0002     kmime_charfreq.h
0003 
0004     KMime, the KDE Internet mail/usenet news message library.
0005     SPDX-FileCopyrightText: 2001-2002 Marc Mutz <mutz@kde.org>
0006 
0007     SPDX-License-Identifier: LGPL-2.0-or-later
0008 */
0009 /**
0010   @file
0011   This file is part of the API for handling @ref MIME data and
0012   defines the CharFreq class.
0013 
0014   @brief
0015   Defines the CharFreq class.
0016 
0017   @authors Marc Mutz \<mutz@kde.org\>
0018 
0019   @glossary @anchor Eight-Bit @anchor eight-bit @b 8-bit:
0020   Data that contains bytes with at least one value greater than 127, or at
0021   least one NUL byte.
0022 
0023   @glossary @anchor Eight-Bit-Binary @anchor eight-bit-binary @b 8-bit-binary:
0024   Eight-bit data that contains a high percentage of non-ascii values,
0025   or lines longer than 998 characters, or stray CRs, or NULs.
0026 
0027   @glossary @anchor Eight-Bit-Text @anchor eight-bit-text @b 8-bit-text:
0028   Eight-bit data that contains a high percentage of ascii values,
0029   no lines longer than 998 characters, no NULs, and either only LFs or
0030   only CRLFs.
0031 
0032   @glossary @anchor Seven-Bit @anchor seven-bit @b 7-bit:
0033   Data that contains bytes with all values less than 128, and no NULs.
0034 
0035   @glossary @anchor Seven-Bit-Binary @anchor seven-bit-binary @b 7-bit-binary:
0036   Seven-bit data that contains a high percentage of non-ascii values,
0037   or lines longer than 998 characters, or stray CRs.
0038 
0039   @glossary @anchor Seven-Bit-Text @anchor seven-bit-text @b 7-bit-text:
0040   Seven-bit data that contains a high percentage of ascii values,
0041   no lines longer than 998 characters, and either only LFs, or only CRLFs.
0042 */
0043 
0044 #pragma once
0045 
0046 #include <QByteArray>
0047 #undef None
0048 
0049 namespace KMime
0050 {
0051 
0052 /**
0053   @brief
0054   A class for performing basic data typing using frequency count heuristics.
0055 
0056   This class performs character frequency counts on the provided data which
0057   are used in heuristics to determine a basic data type (see #Type).  The data types are:
0058 
0059   - @ref Eight-Bit-Binary
0060   - @ref Eight-Bit-Text
0061   - @ref Seven-Bit-Binary
0062   - @ref Seven-Bit-Text
0063 */
0064 class CharFreq
0065 {
0066 public:
0067     /**
0068       Constructs a Character Frequency instance for a buffer @p buf of
0069       QByteArray data.
0070 
0071       @param buf is a QByteArray containing the data.
0072     */
0073     explicit CharFreq(const QByteArray &buf);
0074 
0075     /**
0076       Constructs a Character Frequency instance for a buffer @p buf of
0077       chars of length @p len.
0078 
0079       @param buf is a pointer to a character string containing the data.
0080       @param len is the length of @p buf, in characters (i.e. bytes).
0081     */
0082     CharFreq(const char *buf, size_t len);
0083 
0084     /**
0085       The different types of data, as reported by type().
0086     */
0087     enum Type {
0088         None = 0,              /**< Unknown */
0089         EightBitData,          /**< 8bit binary */
0090         Binary = EightBitData, /**< Alias for EightBitData */
0091         SevenBitData,          /**< 7bit binary */
0092         EightBitText,          /**< 8bit text */
0093         SevenBitText           /**< 7bit text */
0094     };
0095 
0096     /**
0097       Returns the data #Type as derived from the class heuristics.
0098     */
0099     Q_REQUIRED_RESULT Type type() const;
0100 
0101     /**
0102       Returns true if the data #Type is EightBitData; false otherwise.
0103     */
0104     Q_REQUIRED_RESULT bool isEightBitData() const;
0105 
0106     /**
0107       Returns true if the data #Type is EightBitText; false otherwise.
0108     */
0109     Q_REQUIRED_RESULT bool isEightBitText() const;
0110 
0111     /**
0112       Returns true if the data #Type is SevenBitData; false otherwise.
0113     */
0114     Q_REQUIRED_RESULT bool isSevenBitData() const;
0115 
0116     /**
0117       Returns true if the data #Type is SevenBitText; false otherwise.
0118     */
0119     Q_REQUIRED_RESULT bool isSevenBitText() const;
0120 
0121     /**
0122       Returns true if the data contains trailing whitespace, i.e.,
0123       if any line ends with space (' ') or tab ('\\t').
0124     */
0125     Q_REQUIRED_RESULT bool hasTrailingWhitespace() const;
0126 
0127     /**
0128       Returns true if the data contains a line that starts with "From ".
0129     */
0130     Q_REQUIRED_RESULT bool hasLeadingFrom() const;
0131 
0132     /**
0133       Returns the percentage of printable characters in the data; the result is undefined
0134       if the data is empty. NOTE(review): confirm the scale (0..1 vs 0..100) in the .cpp.
0135     */
0136     Q_REQUIRED_RESULT float printableRatio() const;
0137 
0138     /**
0139       Returns the percentage of control code characters (CTLs) in the data; undefined if
0140       the data is empty. NOTE(review): confirm the scale (0..1 vs 0..100) in the .cpp.
0141     */
0142     Q_REQUIRED_RESULT float controlCodesRatio() const;
0143 
0144 private:
0145     //@cond PRIVATE
0146     uint mNUL;         // count of NUL chars
0147     uint mCTL;         // count of CTLs (incl. DEL, excl. CR, LF, HT)
0148     uint mCR;          // count of CR chars
0149     uint mLF;          // count of LF chars
0150     uint mCRLF;        // count of LFs, preceded by CRs
0151     uint mPrintable;   // count of printable US-ASCII chars (SPC..~)
0152     uint mEightBit;    // count of other latin1 chars (those with 8th bit set)
0153     uint mTotal;       // count of all chars
0154     uint mLineMin;     // minimum line length
0155     uint mLineMax;     // maximum line length
0156     bool mTrailingWS = false;  // does the buffer contain trailing whitespace?
0157     bool mLeadingFrom = false; // does the buffer contain lines starting with "From "?
0158     //@endcond
0159 
0160     /**
0161       Performs the character frequency counts on the data.
0162 
0163       @param buf is a pointer to a character string containing the data.
0164       @param len is the length of @p buf, in characters (i.e. bytes).
0165     */
0166     void count(const char *buf, size_t len);
0167 };
0168 
0169 } // namespace KMime
0170