File indexing completed on 2024-09-15 11:55:02

0001 /*
0002     This file is part of the KDE libraries
0003 
0004     SPDX-FileCopyrightText: 2008 Wang Hoi <zealot.hoi@gmail.com>
0005 
0006     SPDX-License-Identifier: LGPL-2.0-or-later
0007 */
0008 #ifndef KENCODINGPROBER_H
0009 #define KENCODINGPROBER_H
0010 
0011 // enable debug of private probers
0012 // #define DEBUG_PROBE
0013 
0014 #include <kcodecs_export.h>
0015 
0016 #ifdef DEBUG_PROBE
0017 #include <QDebug>
0018 #endif
0019 
0020 #include <QCoreApplication>
0021 #include <QString>
0022 #include <memory>
0023 
0024 class KEncodingProberPrivate;
0025 
0026 /**
0027  * @class KEncodingProber kencodingprober.h KEncodingProber
0028  *
0029  * @short Provides encoding detection (probing) capabilities: guesses the encoding of a char array.
0030  *
0031  * Probes the encoding of raw data only.
0032  * If it cannot determine the encoding for sure, it reports the most likely encoding it has guessed.
0033  *
0034  * A Unicode probe is always performed, regardless of the ProberType.
0035  *
0036  * Feed data to it several times with feed() until the ProberState changes to FoundIt or NotMe,
0037  * or until confidence() returns a value you find acceptable.
0038  *
0039  * Intended lifetime of the object: one instance per ProberType.
0040  *
0041  * Typical use:
0042  * \code
0043  * QByteArray data, moredata;
0044  * ...
0045  * KEncodingProber prober(KEncodingProber::ChineseSimplified);
0046  * prober.feed(data);
0047  * prober.feed(moredata);
0048  * if (prober.confidence() > 0.6)
0049  *    encoding  = prober.encoding();
0050  * \endcode
0051  *
0052  * At least 256 characters are needed to change the ProberState from Probing to FoundIt.
0053  * If you do not have that many characters to probe,
0054  * decide for yourself, based on confidence(), whether to accept the encoding guessed so far.
0055  *
0056  * Instances cannot be copied: the copy constructor and copy assignment operator are deleted.
0057  *
0058  */
0059 class KCODECS_EXPORT KEncodingProber
0060 {
0061     Q_DECLARE_TR_FUNCTIONS(KEncodingProber) // Qt macro: gives this class tr() translation functions
0062     /** Outcome of the detection so far, as returned by feed() and state(). */
0063 public:
0064     enum ProberState {
0065         FoundIt, /**< The encoding has been identified with certainty */
0066         NotMe, /**< The data is certainly not in any of the encodings supported by the current ProberType */
0067         Probing, /**< More data is needed to make a decision */
0068     };
0069     /** Selects the group of encodings the prober considers; see the constructor and setProberType(). */
0070     enum ProberType {
0071         None,
0072         Universal, /**< Detect all possible encodings (the constructor's default) */
0073         Arabic,
0074         Baltic,
0075         CentralEuropean,
0076         ChineseSimplified,
0077         ChineseTraditional,
0078         Cyrillic,
0079         Greek,
0080         Hebrew,
0081         Japanese,
0082         Korean,
0083         NorthernSaami,
0084         Other,
0085         SouthEasternEurope,
0086         Thai,
0087         Turkish,
0088         Unicode,
0089         WesternEuropean,
0090     };
0091 
0092     /**
0093      * Constructs a prober for @p proberType. The default ProberType is Universal (detect all possible encodings).
0094      */
0095     KEncodingProber(ProberType proberType = Universal);
0096 
0097     ~KEncodingProber();
0098     // Not copyable: both copy operations are explicitly deleted.
0099     KEncodingProber(const KEncodingProber &) = delete;
0100     KEncodingProber &operator=(const KEncodingProber &) = delete;
0101 
0102     /**
0103      * Resets the prober's internal state and data.
0104      */
0105     void reset();
0106 
0107     /**
0108      * The main class method: feeds a chunk of raw data to the prober.
0109      *
0110      * May be called several times with successive chunks of data (see the class description).
0111      * @param data the raw bytes to probe
0112      * @returns the ProberState after probing the fed data.
0113      */
0114     ProberState feed(const QByteArray &data);
0115     ProberState feed(const char *data, int len); /**< Overload taking a raw buffer; @p len is presumably the number of bytes in @p data (confirm in the implementation). */
0116 
0117     /**
0118      * @returns the prober's current ProberState
0119      *
0120      */
0121     ProberState state() const;
0122 
0123     /**
0124      * @returns a QByteArray with the name of the best encoding guessed so far
0125      * @since 4.2.2
0126      */
0127     QByteArray encoding() const;
0128 
0129     /**
0130      * @returns the confidence (sureness) of the encoding guessed so far, from 0.0 to 0.99; not very reliable for single-byte encodings
0131      */
0132     float confidence() const;
0133     /** @returns the prober's ProberType (presumably the one given to the constructor or to setProberType()). */
0134     ProberType proberType() const;
0135 
0136     /**
0137      * Changes the current prober's ProberType to @p proberType and resets the prober.
0138      */
0139     void setProberType(ProberType proberType);
0140 
0141     /**
0142      * @return the ProberType for @p lang (e.g. proberTypeForName("Chinese Simplified") will return KEncodingProber::ChineseSimplified)
0143      */
0144     static ProberType proberTypeForName(const QString &lang);
0145 
0146     /**
0147      * Maps a ProberType to its language string. @param proberType the type to map @returns the language string
0148      */
0149     static QString nameForProberType(ProberType proberType);
0150 
0151 private:
0152     std::unique_ptr<KEncodingProberPrivate> const d; // private implementation; KEncodingProberPrivate is only forward-declared in this header
0153 };
0154 
0155 #endif