File indexing completed on 2024-10-06 06:39:18

0001 /*
0002     This file is part of the KDE libraries
0003 
0004     SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
0005 
0006     SPDX-License-Identifier: LGPL-2.0-or-later
0007 */
0008 #ifndef KENCODINGPROBER_H
0009 #define KENCODINGPROBER_H
0010 
0011 // enable debug of private probers
0012 // #define DEBUG_PROBE
0013 
0014 #include <kcodecs_export.h>
0015 
0016 #ifdef DEBUG_PROBE
0017 #include <QDebug>
0018 #endif
0019 
0020 #include <QCoreApplication>
0021 #include <QString>
0022 #include <memory>
0023 
0024 class KEncodingProberPrivate;
0025 
0026 /**
0027  * @class KEncodingProber kencodingprober.h KEncodingProber
0028  *
0029  * @short Provides encoding detection(probe) capabilities.
0030  *
0031  * Probe the encoding of raw data only.
0032  * If it cannot determine the encoding with certainty, it returns the most likely encoding it has guessed.
0033  *
0034  * A Unicode probe is always performed, regardless of the ProberType.
0035  *
0036  * Feed data to it several times with feed() until ProberState changes to FoundIt/NotMe,
0037  * or confidence() returns a value you find acceptable.
0038  *
0039  * Intended lifetime of the object: one instance per ProberType.
0040  *
0041  * Typical use:
0042  * \code
0043  * QByteArray data, moredata;
0044  * ...
0045  * KEncodingProber prober(KEncodingProber::ChineseSimplified);
0046  * prober.feed(data);
0047  * prober.feed(moredata);
0048  * if (prober.confidence() > 0.6)
0049  *    encoding  = prober.encoding();
0050  * \endcode
0051  *
0052  * At least 256 characters are needed to change the ProberState from Probing to FoundIt.
0053  * If you do not have that many characters to probe,
0054  * decide for yourself whether to accept the encoding guessed so far, based on confidence().
0055  *
0056  * @short Guess encoding of char array
0057  *
0058  */
0059 class KCODECS_EXPORT KEncodingProber
0060 {
0061     Q_DECLARE_TR_FUNCTIONS(KEncodingProber) // Qt macro: gives this non-QObject class its own tr() translation context
0062 
0063 public:
0064     enum ProberState {
0065         FoundIt, /**< The encoding has been identified with certainty */
0066         NotMe, /**< The data is certainly not in any of the encodings supported by the current ProberType */
0067         Probing, /**< More data is needed to make a decision */
0068     };
0069 
0070     enum ProberType {
0071         None,
0072         Universal, /**< Detect all possible encodings; the constructor's default */
0073         Arabic,
0074         Baltic,
0075         CentralEuropean,
0076         ChineseSimplified,
0077         ChineseTraditional,
0078         Cyrillic,
0079         Greek,
0080         Hebrew,
0081         Japanese,
0082         Korean,
0083         NorthernSaami,
0084         Other,
0085         SouthEasternEurope,
0086         Thai,
0087         Turkish,
0088         Unicode,
0089         WesternEuropean,
0090     };
0091 
0092     /**
0093      * Constructs a prober for @p proberType; the default is Universal (detect all possible encodings).
0094      */
0095     KEncodingProber(ProberType proberType = Universal);
0096 
0097     ~KEncodingProber();
0098 
0099     KEncodingProber(const KEncodingProber &) = delete; // copy construction is disabled
0100     KEncodingProber &operator=(const KEncodingProber &) = delete; // copy assignment is disabled
0101 
0102     /**
0103      * Resets the prober's internal state and data.
0104      */
0105     void reset();
0106 
0107     /**
0108      * The main method of the class.
0109      *
0110      * Feeds @p data to the prober. It may be called several times with successive chunks of data.
0111      *
0112      * @returns the ProberState after probing the fed data.
0113      */
0114     ProberState feed(QByteArrayView data);
0115     // Overload kept for API compatibility: wraps (data, len) and forwards to feed(QByteArrayView).
0116     inline ProberState feed(const char *data, qsizetype len)
0117     {
0118         return feed({data, len});
0119     }
0120 
0121     /**
0122      * @returns the prober's current ProberState
0123      *
0124      */
0125     ProberState state() const;
0126 
0127     /**
0128      * @returns a QByteArray with the name of the best encoding guessed so far
0129      * @since 4.2.2
0130      */
0131     QByteArray encoding() const;
0132 
0133     /**
0134      * @returns the confidence (sureness) of the encoding guessed so far (0.0 ~ 0.99); not very reliable for single-byte encodings
0135      */
0136     float confidence() const;
0137 
0138     ProberType proberType() const; ///< @returns the prober's current ProberType
0139 
0140     /**
0141      * Changes the prober's current ProberType to @p proberType and resets the prober.
0142      */
0143     void setProberType(ProberType proberType);
0144 
0145     /**
0146      * @return the ProberType for @p lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified)
0147      */
0148     static ProberType proberTypeForName(const QString &lang);
0149 
0150     /**
0151      * Maps @p proberType to its language string.
0152      */
0153     static QString nameForProberType(ProberType proberType);
0154 
0155 private:
0156     std::unique_ptr<KEncodingProberPrivate> const d; // owning pointer to the forward-declared private class; the pointer itself is const
0157 };
0158 
0159 #endif
0158 
0159 #endif