core/codecs/textcharcodec.cpp

0001 /*
0002     This file is part of the Okteta Core library, made within the KDE community.
0003
0004     SPDX-FileCopyrightText: 2004, 2011 Friedrich W. H. Kossebau <kossebau@kde.org>
0005
0006     SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
0007 */
0008
0009 #include "textcharcodec.hpp"
0010
0011 // lib
0012 #include <character.hpp>
0013 // KF
0014 #include <KCharsets>
0015 // Qt
0016 #include <QTextCodec>
0017
0018 namespace Okteta {
0019
0020 // static const char QTextCodecWhiteSpace = 63;
0021
0022 static constexpr struct EncodingData
0023 {
0024     CharCoding encodingId;
0025     const char* name;
0026 }
0027 encodingDataList[] =
0028 {
0029     { ISO8859_1Encoding, "ISO-8859-1" },
0030     { ISO8859_2Encoding, "ISO-8859-2" },
0031     { ISO8859_3Encoding, "ISO-8859-3" },
0032     { ISO8859_4Encoding, "ISO-8859-4" },
0033     { ISO8859_5Encoding, "ISO-8859-5" },
0034     { ISO8859_6Encoding, "ISO-8859-6" },
0035     { ISO8859_7Encoding, "ISO-8859-7" },
0036     { ISO8859_8Encoding, "ISO-8859-8" },
0037 //     { ISO8859_8_IEncoding, "ISO-8859-8-I" }, Qt (at least 5.15) delivers this variant already for ISO-8859-8
0038     { ISO8859_9Encoding, "ISO-8859-9" },
0039     { ISO8859_10Encoding, "ISO-8859-10" },
0040     { ISO8859_11Encoding, "TIS-620" }, // was: ISO-8859-11
0041     { ISO8859_13Encoding, "ISO-8859-13" },
0042     { ISO8859_14Encoding, "ISO-8859-14" },
0043     { ISO8859_15Encoding, "ISO-8859-15" },
0044     { ISO8859_16Encoding, "ISO-8859-16" },
0045     { CP1250Encoding, "windows-1250" },
0046     { CP1251Encoding, "windows-1251" },
0047     { CP1252Encoding, "windows-1252" },
0048     { CP1253Encoding, "windows-1253" },
0049     { CP1254Encoding, "windows-1254" },
0050     { CP1255Encoding, "windows-1255" },
0051     { CP1256Encoding, "windows-1256" },
0052     { CP1257Encoding, "windows-1257" },
0053     { CP1258Encoding, "windows-1258" },
0054     { IBM850Encoding, "IBM850" },
0055     { IBM866Encoding, "IBM866" },
0056 //     { IBM874Encoding, "IBM874" }, using our own IBM874CharCodec, see docs there
0057     { KOI8_REncoding, "KOI8-R" },
0058     { KOI8_UEncoding, "KOI8-U" }
0059 };
0060 // TODO: WS2
0061
0062 static bool is8Bit(QTextCodec* codec)
0063 {
0064     bool result = false;
0065
0066     const QByteArray& codecName = codec->name();
0067     for (auto& encodingData : encodingDataList) {
0068         if (qstrcmp(codecName, encodingData.name) == 0) {
0069             result = true;
0070             break;
0071         }
0072     }
0073
0074     return result;
0075 }
0076
0077 static QTextCodec* createLatin1()
0078 {
0079     // silence deprecation warning
0080     // porting away would need a too big rework for a bugfix branch
0081     QT_WARNING_PUSH
0082     QT_WARNING_DISABLE_CLANG("-Wdeprecated-declarations")
0083     QT_WARNING_DISABLE_GCC("-Wdeprecated-declarations")
0084     return KCharsets::charsets()->codecForName(QLatin1String(encodingDataList[0].name));
0085     QT_WARNING_POP
0086 }
0087
0088 /* heuristic seems to be doomed :(
0089 static bool is8Bit( QTextCodec *Codec )
0090 {
0091     bool Result = true;
0092
0093     // first test different for 0
0094     unsigned char c[4];
0095     c[0] = 0;
0096     c[1] = c[2] = c[3] = 230;
0097     QString S = Codec->toUnicode( (const char*)&c,4 );
0098     int Length = 1;
0099     QCString CS = Codec->fromUnicode( S, Length );
0100     //qCDebug(LOG_OKTETA_CORE) << Codec->name() << " "<<Length ;
0101     if( Length > 0 )
0102         Result = false;
0103     // test if all chars survive the recoding
0104     else
0105     do {
0106         ++c[0];
0107         S = Codec->toUnicode( (const char*)&c,4 );
0108         Length = 1;
0109         CS = Codec->fromUnicode( S, Length );
0110         //qCDebug(LOG_OKTETA_CORE) << Codec->name() << " "<<c[0]<<"->"<<CS[0]<<":"<<Length ;
0111         if( Length != 1 || (CS[0] != (char)c[0] && CS[0] != QTextCodecWhiteSpace) ) {
0112             Result = false;
0113             break;
0114         }
0115     } while( c[0] < 255 );
0116     return Result;
0117 }
0118 const QStringList &TextCharCodec::codecNames()
0119     {
0120     // first call?
0121     if( CodecNames.isEmpty() ) {
0122         const QStringList &CharSets = KCharsets::charsets()->availableEncodingNames();
0123
0124         for( QStringList::ConstIterator it = CharSets.begin(); it != CharSets.end(); ++it ) {
0125             bool Found = true;
0126             QTextCodec* Codec = KCharsets::charsets()->codecForName( *it, Found );
0127             if( Found && is8Bit(Codec) )
0128                 CodecNames.append( QString::fromLatin1(Codec->name()) );
0129         }
0130     }
0131
0132     return CodecNames;
0133 }
0134
0135 QString TextCharCodec::nameOfEncoding( CharCoding _char )
0136 {
0137     TextCharCodec *Codec = 0;
0138
0139     const char* N = 0;
0140     for( unsigned int i=0; i<NoOfEncodings; ++i )
0141     {
0142         if( EncodingNames[i].Encoding == _char )
0143         {
0144             N = EncodingNames[i].Name;
0145             break;
0146         }
0147     }
0148
0149     if( N != 0 )
0150     {
0151         QString CodeName = QString::fromLatin1( N );
0152     }
0153     return Codec;
0154 }
0155  */
0156
0157 TextCharCodec* TextCharCodec::createLocalCodec()
0158 {
0159     QTextCodec* codec = QTextCodec::codecForLocale();
0160     if (!is8Bit(codec)) {
0161         codec = createLatin1();
0162     }
0163     return new TextCharCodec(codec);
0164 }
0165
0166 TextCharCodec* TextCharCodec::createCodec(const QString& codecName)
0167 {
0168     bool isOk = false;
0169     // silence deprecation warning
0170     // porting away would need a too big rework for a bugfix branch
0171     QT_WARNING_PUSH
0172     QT_WARNING_DISABLE_CLANG("-Wdeprecated-declarations")
0173     QT_WARNING_DISABLE_GCC("-Wdeprecated-declarations")
0174     QTextCodec* codec = KCharsets::charsets()->codecForName(codecName, isOk);
0175     QT_WARNING_POP
0176     if (isOk) {
0177         isOk = is8Bit(codec);
0178     }
0179     return isOk ? new TextCharCodec(codec) : nullptr;
0180 }
0181
0182 const QStringList& TextCharCodec::codecNames()
0183 {
0184     static QStringList textCodecNames;
0185
0186     // first call?
0187     if (textCodecNames.isEmpty()) {
0188         KCharsets* charsets = KCharsets::charsets();
0189         for (auto& encodingData : encodingDataList) {
0190             bool isCodecFound = false;
0191             const QString codecName = QString::fromLatin1(encodingData.name);
0192             // silence deprecation warning
0193             // porting away would need a too big rework for a bugfix branch
0194             QT_WARNING_PUSH
0195             QT_WARNING_DISABLE_CLANG("-Wdeprecated-declarations")
0196             QT_WARNING_DISABLE_GCC("-Wdeprecated-declarations")
0197             QTextCodec* codec = charsets->codecForName(codecName, isCodecFound);
0198             QT_WARNING_POP
0199             if (isCodecFound) {
0200                 textCodecNames.append(QString::fromLatin1(codec->name()));
0201             }
0202         }
0203     }
0204
0205     return textCodecNames;
0206 }
0207
0208 TextCharCodec::TextCharCodec(QTextCodec* textCodec)
0209     : mCodec(textCodec)
0210     , mDecoder(textCodec->makeDecoder())
0211     , mEncoder(textCodec->makeEncoder())
0212 {
0213 }
0214
0215 TextCharCodec::~TextCharCodec()
0216 {
0217     delete mDecoder;
0218     delete mEncoder;
0219 }
0220
0221 bool TextCharCodec::canEncode(const QChar& _char) const
0222 {
0223     return mCodec->canEncode(_char);
0224 }
0225
0226 bool TextCharCodec::encode(Byte* byte, const QChar& _char) const
0227 {
0228     if (!mCodec->canEncode(_char)) { // TODO: do we really need the codec?
0229         return false;
0230     }
0231
0232     const QByteArray encoded = mEncoder->fromUnicode(QString(_char));
0233     if (encoded.size() > 0) {
0234         *byte = encoded.at(0);
0235         return true;
0236     }
0237
0238     return false;
0239 }
0240
0241 Character TextCharCodec::decode(Byte byte) const
0242 {
0243     // QTextCodecs "use this codepoint when input data cannot be represented in Unicode." (Qt docs)
0244     constexpr QChar replacementChar = QChar(QChar::ReplacementCharacter);
0245     const QString string =
0246         mDecoder->toUnicode(reinterpret_cast<const char*>(&byte), 1);
0247     const QChar qchar = string.at(0);
0248     const bool isDecoded = (qchar != replacementChar);
0249     return {qchar, !isDecoded};
0250 }
0251
0252 const QString& TextCharCodec::name() const
0253 {
0254     if (mName.isNull()) {
0255         mName = QString::fromLatin1(mCodec->name());
0256     }
0257
0258     return mName;
0259 }
0260
0261 }