core/charcodecs/textcharcodec.cpp

0001 /*
0002     This file is part of the Okteta Core library, made within the KDE community.
0003
0004     SPDX-FileCopyrightText: 2004, 2011 Friedrich W. H. Kossebau <kossebau@kde.org>
0005
0006     SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
0007 */
0008
0009 #include "textcharcodec.hpp"
0010
0011 // lib
0012 #include <character.hpp>
0013 #include <logging.hpp>
0014 // Qt
0015 #include <QTextCodec>
0016
0017 namespace Okteta {
0018
0019 // static const char QTextCodecWhiteSpace = 63;
0020
0021 static constexpr struct EncodingData
0022 {
0023     CharCoding encodingId;
0024     const char* name;
0025 }
0026 encodingDataList[] =
0027 {
0028     { ISO8859_1Encoding, "ISO-8859-1" },
0029     { ISO8859_2Encoding, "ISO-8859-2" },
0030     { ISO8859_3Encoding, "ISO-8859-3" },
0031     { ISO8859_4Encoding, "ISO-8859-4" },
0032     { ISO8859_5Encoding, "ISO-8859-5" },
0033     { ISO8859_6Encoding, "ISO-8859-6" },
0034     { ISO8859_7Encoding, "ISO-8859-7" },
0035     { ISO8859_8Encoding, "ISO-8859-8" },
0036 //     { ISO8859_8_IEncoding, "ISO-8859-8-I" }, Qt (at least 5.15) delivers this variant already for ISO-8859-8
0037     { ISO8859_9Encoding, "ISO-8859-9" },
0038     { ISO8859_10Encoding, "ISO-8859-10" },
0039     { ISO8859_11Encoding, "TIS-620" }, // was: ISO-8859-11
0040     { ISO8859_13Encoding, "ISO-8859-13" },
0041     { ISO8859_14Encoding, "ISO-8859-14" },
0042     { ISO8859_15Encoding, "ISO-8859-15" },
0043     { ISO8859_16Encoding, "ISO-8859-16" },
0044     { CP1250Encoding, "windows-1250" },
0045     { CP1251Encoding, "windows-1251" },
0046     { CP1252Encoding, "windows-1252" },
0047     { CP1253Encoding, "windows-1253" },
0048     { CP1254Encoding, "windows-1254" },
0049     { CP1255Encoding, "windows-1255" },
0050     { CP1256Encoding, "windows-1256" },
0051     { CP1257Encoding, "windows-1257" },
0052     { CP1258Encoding, "windows-1258" },
0053     { IBM850Encoding, "IBM850" },
0054     { IBM866Encoding, "IBM866" },
0055 //     { IBM874Encoding, "IBM874" }, using our own IBM874CharCodec, see docs there
0056     { KOI8_REncoding, "KOI8-R" },
0057     { KOI8_UEncoding, "KOI8-U" }
0058 };
0059 // TODO: WS2
0060
0061 static bool is8Bit(QTextCodec* codec)
0062 {
0063     bool result = false;
0064
0065     const QByteArray& codecName = codec->name();
0066     for (const auto& encodingData : encodingDataList) {
0067         if (codecName.compare(encodingData.name) == 0) {
0068             result = true;
0069             break;
0070         }
0071     }
0072
0073     return result;
0074 }
0075
0076 static QTextCodec* createLatin1()
0077 {
0078     return QTextCodec::codecForName(encodingDataList[0].name);
0079 }
0080
0081 /* heuristic seems to be doomed :(
0082 static bool is8Bit( QTextCodec *Codec )
0083 {
0084     bool Result = true;
0085
0086     // first test different for 0
0087     unsigned char c[4];
0088     c[0] = 0;
0089     c[1] = c[2] = c[3] = 230;
0090     QString S = Codec->toUnicode( (const char*)&c,4 );
0091     int Length = 1;
0092     QCString CS = Codec->fromUnicode( S, Length );
0093     //qCDebug(LOG_OKTETA_CORE) << Codec->name() << " "<<Length ;
0094     if( Length > 0 )
0095         Result = false;
0096     // test if all chars survive the recoding
0097     else
0098     do {
0099         ++c[0];
0100         S = Codec->toUnicode( (const char*)&c,4 );
0101         Length = 1;
0102         CS = Codec->fromUnicode( S, Length );
0103         //qCDebug(LOG_OKTETA_CORE) << Codec->name() << " "<<c[0]<<"->"<<CS[0]<<":"<<Length ;
0104         if( Length != 1 || (CS[0] != (char)c[0] && CS[0] != QTextCodecWhiteSpace) ) {
0105             Result = false;
0106             break;
0107         }
0108     } while( c[0] < 255 );
0109     return Result;
0110 }
0111 const QStringList &TextCharCodec::codecNames()
0112     {
0113     // first call?
0114     if( CodecNames.isEmpty() ) {
0115         const QStringList &CharSets = KCharsets::charsets()->availableEncodingNames();
0116
0117         for( QStringList::ConstIterator it = CharSets.begin(); it != CharSets.end(); ++it ) {
0118             bool Found = true;
0119             QTextCodec* Codec = KCharsets::charsets()->codecForName( *it, Found );
0120             if( Found && is8Bit(Codec) )
0121                 CodecNames.append( QString::fromLatin1(Codec->name()) );
0122         }
0123     }
0124
0125     return CodecNames;
0126 }
0127
0128 QString TextCharCodec::nameOfEncoding( CharCoding _char )
0129 {
0130     TextCharCodec *Codec = 0;
0131
0132     const char* N = 0;
0133     for( unsigned int i=0; i<NoOfEncodings; ++i )
0134     {
0135         if( EncodingNames[i].Encoding == _char )
0136         {
0137             N = EncodingNames[i].Name;
0138             break;
0139         }
0140     }
0141
0142     if( N != 0 )
0143     {
0144         QString CodeName = QString::fromLatin1( N );
0145     }
0146     return Codec;
0147 }
0148  */
0149
0150 TextCharCodec* TextCharCodec::createLocalCodec()
0151 {
0152     QTextCodec* codec = QTextCodec::codecForLocale();
0153     if (!is8Bit(codec)) {
0154         codec = createLatin1();
0155     }
0156     return new TextCharCodec(codec);
0157 }
0158
0159 TextCharCodec* TextCharCodec::createCodec(const QString& codecName)
0160 {
0161     bool isOk = false;
0162     QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1());
0163     if (codec) {
0164         isOk = is8Bit(codec);
0165     }
0166     return isOk ? new TextCharCodec(codec) : nullptr;
0167 }
0168
0169 const QStringList& TextCharCodec::codecNames()
0170 {
0171     static QStringList textCodecNames;
0172
0173     // first call?
0174     if (textCodecNames.isEmpty()) {
0175         for (const auto& encodingData : encodingDataList) {
0176             QTextCodec* codec = QTextCodec::codecForName((encodingData.name));
0177             if (codec) {
0178                 textCodecNames.append(QString::fromLatin1(codec->name()));
0179             }
0180         }
0181     }
0182
0183     return textCodecNames;
0184 }
0185
0186 TextCharCodec::TextCharCodec(QTextCodec* textCodec)
0187     : mCodec(textCodec)
0188     , mDecoder(textCodec->makeDecoder())
0189     , mEncoder(textCodec->makeEncoder())
0190 {
0191 }
0192
0193 TextCharCodec::~TextCharCodec()
0194 {
0195     delete mDecoder;
0196     delete mEncoder;
0197 }
0198
0199 bool TextCharCodec::canEncode(QChar _char) const
0200 {
0201     return mCodec->canEncode(_char);
0202 }
0203
0204 bool TextCharCodec::encode(Byte* byte, QChar _char) const
0205 {
0206     if (!mCodec->canEncode(_char)) { // TODO: do we really need the codec?
0207         return false;
0208     }
0209
0210     const QByteArray encoded = mEncoder->fromUnicode(QString(_char));
0211     if (encoded.size() > 0) {
0212         *byte = encoded.at(0);
0213         return true;
0214     }
0215
0216     return false;
0217 }
0218
0219 Character TextCharCodec::decode(Byte byte) const
0220 {
0221     // QTextCodecs "use this codepoint when input data cannot be represented in Unicode." (Qt docs)
0222     constexpr QChar replacementChar = QChar(QChar::ReplacementCharacter);
0223     const QString string =
0224         mDecoder->toUnicode(reinterpret_cast<const char*>(&byte), 1);
0225     const QChar qchar = string.at(0);
0226     const bool isDecoded = (qchar != replacementChar);
0227     return {qchar, !isDecoded};
0228 }
0229
0230 QString TextCharCodec::name() const
0231 {
0232     if (mName.isNull()) {
0233         mName = QString::fromLatin1(mCodec->name());
0234     }
0235
0236     return mName;
0237 }
0238
0239 }