File indexing completed on 2024-05-05 17:57:59
0001 /* 0002 This file is part of the Okteta Core library, made within the KDE community. 0003 0004 SPDX-FileCopyrightText: 2004, 2011 Friedrich W. H. Kossebau <kossebau@kde.org> 0005 0006 SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL 0007 */ 0008 0009 #include "textcharcodec.hpp" 0010 0011 // lib 0012 #include <character.hpp> 0013 // KF 0014 #include <KCharsets> 0015 // Qt 0016 #include <QTextCodec> 0017 0018 namespace Okteta { 0019 0020 // static const char QTextCodecWhiteSpace = 63; 0021 0022 static constexpr struct EncodingData 0023 { 0024 CharCoding encodingId; 0025 const char* name; 0026 } 0027 encodingDataList[] = 0028 { 0029 { ISO8859_1Encoding, "ISO-8859-1" }, 0030 { ISO8859_2Encoding, "ISO-8859-2" }, 0031 { ISO8859_3Encoding, "ISO-8859-3" }, 0032 { ISO8859_4Encoding, "ISO-8859-4" }, 0033 { ISO8859_5Encoding, "ISO-8859-5" }, 0034 { ISO8859_6Encoding, "ISO-8859-6" }, 0035 { ISO8859_7Encoding, "ISO-8859-7" }, 0036 { ISO8859_8Encoding, "ISO-8859-8" }, 0037 // { ISO8859_8_IEncoding, "ISO-8859-8-I" }, Qt (at least 5.15) delivers this variant already for ISO-8859-8 0038 { ISO8859_9Encoding, "ISO-8859-9" }, 0039 { ISO8859_10Encoding, "ISO-8859-10" }, 0040 { ISO8859_11Encoding, "TIS-620" }, // was: ISO-8859-11 0041 { ISO8859_13Encoding, "ISO-8859-13" }, 0042 { ISO8859_14Encoding, "ISO-8859-14" }, 0043 { ISO8859_15Encoding, "ISO-8859-15" }, 0044 { ISO8859_16Encoding, "ISO-8859-16" }, 0045 { CP1250Encoding, "windows-1250" }, 0046 { CP1251Encoding, "windows-1251" }, 0047 { CP1252Encoding, "windows-1252" }, 0048 { CP1253Encoding, "windows-1253" }, 0049 { CP1254Encoding, "windows-1254" }, 0050 { CP1255Encoding, "windows-1255" }, 0051 { CP1256Encoding, "windows-1256" }, 0052 { CP1257Encoding, "windows-1257" }, 0053 { CP1258Encoding, "windows-1258" }, 0054 { IBM850Encoding, "IBM850" }, 0055 { IBM866Encoding, "IBM866" }, 0056 // { IBM874Encoding, "IBM874" }, using our own IBM874CharCodec, see docs there 0057 { KOI8_REncoding, "KOI8-R" }, 0058 { KOI8_UEncoding, "KOI8-U" } 0059 }; 0060 // TODO: WS2 0061 0062 static bool is8Bit(QTextCodec* codec) 0063 { 0064 bool result = false; 0065 0066 const QByteArray& codecName = codec->name(); 0067 for (auto& encodingData : encodingDataList) { 0068 if (qstrcmp(codecName, encodingData.name) == 0) { 0069 result = true; 0070 break; 0071 } 0072 } 0073 0074 return result; 0075 } 0076 0077 static QTextCodec* createLatin1() 0078 { 0079 // silence deprecation warning 0080 // porting away would need a too big rework for a bugfix branch 0081 QT_WARNING_PUSH 0082 QT_WARNING_DISABLE_CLANG("-Wdeprecated-declarations") 0083 QT_WARNING_DISABLE_GCC("-Wdeprecated-declarations") 0084 return KCharsets::charsets()->codecForName(QLatin1String(encodingDataList[0].name)); 0085 QT_WARNING_POP 0086 } 0087 0088 /* heuristic seems to be doomed :( 0089 static bool is8Bit( QTextCodec *Codec ) 0090 { 0091 bool Result = true; 0092 0093 // first test different for 0 0094 unsigned char c[4]; 0095 c[0] = 0; 0096 c[1] = c[2] = c[3] = 230; 0097 QString S = Codec->toUnicode( (const char*)&c,4 ); 0098 int Length = 1; 0099 QCString CS = Codec->fromUnicode( S, Length ); 0100 //qCDebug(LOG_OKTETA_CORE) << Codec->name() << " "<<Length ; 0101 if( Length > 0 ) 0102 Result = false; 0103 // test if all chars survive the recoding 0104 else 0105 do { 0106 ++c[0]; 0107 S = Codec->toUnicode( (const char*)&c,4 ); 0108 Length = 1; 0109 CS = Codec->fromUnicode( S, Length ); 0110 //qCDebug(LOG_OKTETA_CORE) << Codec->name() << " "<<c[0]<<"->"<<CS[0]<<":"<<Length ; 0111 if( Length != 1 || (CS[0] != (char)c[0] && CS[0] != QTextCodecWhiteSpace) ) { 0112 Result = false; 0113 break; 0114 } 0115 } while( c[0] < 255 ); 0116 return Result; 0117 } 0118 const QStringList &TextCharCodec::codecNames() 0119 { 0120 // first call? 0121 if( CodecNames.isEmpty() ) { 0122 const QStringList &CharSets = KCharsets::charsets()->availableEncodingNames(); 0123 0124 for( QStringList::ConstIterator it = CharSets.begin(); it != CharSets.end(); ++it ) { 0125 bool Found = true; 0126 QTextCodec* Codec = KCharsets::charsets()->codecForName( *it, Found ); 0127 if( Found && is8Bit(Codec) ) 0128 CodecNames.append( QString::fromLatin1(Codec->name()) ); 0129 } 0130 } 0131 0132 return CodecNames; 0133 } 0134 0135 QString TextCharCodec::nameOfEncoding( CharCoding _char ) 0136 { 0137 TextCharCodec *Codec = 0; 0138 0139 const char* N = 0; 0140 for( unsigned int i=0; i<NoOfEncodings; ++i ) 0141 { 0142 if( EncodingNames[i].Encoding == _char ) 0143 { 0144 N = EncodingNames[i].Name; 0145 break; 0146 } 0147 } 0148 0149 if( N != 0 ) 0150 { 0151 QString CodeName = QString::fromLatin1( N ); 0152 } 0153 return Codec; 0154 } 0155 */ 0156 0157 TextCharCodec* TextCharCodec::createLocalCodec() 0158 { 0159 QTextCodec* codec = QTextCodec::codecForLocale(); 0160 if (!is8Bit(codec)) { 0161 codec = createLatin1(); 0162 } 0163 return new TextCharCodec(codec); 0164 } 0165 0166 TextCharCodec* TextCharCodec::createCodec(const QString& codecName) 0167 { 0168 bool isOk = false; 0169 // silence deprecation warning 0170 // porting away would need a too big rework for a bugfix branch 0171 QT_WARNING_PUSH 0172 QT_WARNING_DISABLE_CLANG("-Wdeprecated-declarations") 0173 QT_WARNING_DISABLE_GCC("-Wdeprecated-declarations") 0174 QTextCodec* codec = KCharsets::charsets()->codecForName(codecName, isOk); 0175 QT_WARNING_POP 0176 if (isOk) { 0177 isOk = is8Bit(codec); 0178 } 0179 return isOk ? new TextCharCodec(codec) : nullptr; 0180 } 0181 0182 const QStringList& TextCharCodec::codecNames() 0183 { 0184 static QStringList textCodecNames; 0185 0186 // first call? 0187 if (textCodecNames.isEmpty()) { 0188 KCharsets* charsets = KCharsets::charsets(); 0189 for (auto& encodingData : encodingDataList) { 0190 bool isCodecFound = false; 0191 const QString codecName = QString::fromLatin1(encodingData.name); 0192 // silence deprecation warning 0193 // porting away would need a too big rework for a bugfix branch 0194 QT_WARNING_PUSH 0195 QT_WARNING_DISABLE_CLANG("-Wdeprecated-declarations") 0196 QT_WARNING_DISABLE_GCC("-Wdeprecated-declarations") 0197 QTextCodec* codec = charsets->codecForName(codecName, isCodecFound); 0198 QT_WARNING_POP 0199 if (isCodecFound) { 0200 textCodecNames.append(QString::fromLatin1(codec->name())); 0201 } 0202 } 0203 } 0204 0205 return textCodecNames; 0206 } 0207 0208 TextCharCodec::TextCharCodec(QTextCodec* textCodec) 0209 : mCodec(textCodec) 0210 , mDecoder(textCodec->makeDecoder()) 0211 , mEncoder(textCodec->makeEncoder()) 0212 { 0213 } 0214 0215 TextCharCodec::~TextCharCodec() 0216 { 0217 delete mDecoder; 0218 delete mEncoder; 0219 } 0220 0221 bool TextCharCodec::canEncode(const QChar& _char) const 0222 { 0223 return mCodec->canEncode(_char); 0224 } 0225 0226 bool TextCharCodec::encode(Byte* byte, const QChar& _char) const 0227 { 0228 if (!mCodec->canEncode(_char)) { // TODO: do we really need the codec? 0229 return false; 0230 } 0231 0232 const QByteArray encoded = mEncoder->fromUnicode(QString(_char)); 0233 if (encoded.size() > 0) { 0234 *byte = encoded.at(0); 0235 return true; 0236 } 0237 0238 return false; 0239 } 0240 0241 Character TextCharCodec::decode(Byte byte) const 0242 { 0243 // QTextCodecs "use this codepoint when input data cannot be represented in Unicode." (Qt docs) 0244 constexpr QChar replacementChar = QChar(QChar::ReplacementCharacter); 0245 const QString string = 0246 mDecoder->toUnicode(reinterpret_cast<const char*>(&byte), 1); 0247 const QChar qchar = string.at(0); 0248 const bool isDecoded = (qchar != replacementChar); 0249 return {qchar, !isDecoded}; 0250 } 0251 0252 const QString& TextCharCodec::name() const 0253 { 0254 if (mName.isNull()) { 0255 mName = QString::fromLatin1(mCodec->name()); 0256 } 0257 0258 return mName; 0259 } 0260 0261 }