File indexing completed on 2024-05-19 05:56:03
0001 /* 0002 This file is part of the Okteta Core library, made within the KDE community. 0003 0004 SPDX-FileCopyrightText: 2004, 2011 Friedrich W. H. Kossebau <kossebau@kde.org> 0005 0006 SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL 0007 */ 0008 0009 #include "textcharcodec.hpp" 0010 0011 // lib 0012 #include <character.hpp> 0013 #include <logging.hpp> 0014 // Qt 0015 #include <QTextCodec> 0016 0017 namespace Okteta { 0018 0019 // static const char QTextCodecWhiteSpace = 63; 0020 0021 static constexpr struct EncodingData 0022 { 0023 CharCoding encodingId; 0024 const char* name; 0025 } 0026 encodingDataList[] = 0027 { 0028 { ISO8859_1Encoding, "ISO-8859-1" }, 0029 { ISO8859_2Encoding, "ISO-8859-2" }, 0030 { ISO8859_3Encoding, "ISO-8859-3" }, 0031 { ISO8859_4Encoding, "ISO-8859-4" }, 0032 { ISO8859_5Encoding, "ISO-8859-5" }, 0033 { ISO8859_6Encoding, "ISO-8859-6" }, 0034 { ISO8859_7Encoding, "ISO-8859-7" }, 0035 { ISO8859_8Encoding, "ISO-8859-8" }, 0036 // { ISO8859_8_IEncoding, "ISO-8859-8-I" }, Qt (at least 5.15) delivers this variant already for ISO-8859-8 0037 { ISO8859_9Encoding, "ISO-8859-9" }, 0038 { ISO8859_10Encoding, "ISO-8859-10" }, 0039 { ISO8859_11Encoding, "TIS-620" }, // was: ISO-8859-11 0040 { ISO8859_13Encoding, "ISO-8859-13" }, 0041 { ISO8859_14Encoding, "ISO-8859-14" }, 0042 { ISO8859_15Encoding, "ISO-8859-15" }, 0043 { ISO8859_16Encoding, "ISO-8859-16" }, 0044 { CP1250Encoding, "windows-1250" }, 0045 { CP1251Encoding, "windows-1251" }, 0046 { CP1252Encoding, "windows-1252" }, 0047 { CP1253Encoding, "windows-1253" }, 0048 { CP1254Encoding, "windows-1254" }, 0049 { CP1255Encoding, "windows-1255" }, 0050 { CP1256Encoding, "windows-1256" }, 0051 { CP1257Encoding, "windows-1257" }, 0052 { CP1258Encoding, "windows-1258" }, 0053 { IBM850Encoding, "IBM850" }, 0054 { IBM866Encoding, "IBM866" }, 0055 // { IBM874Encoding, "IBM874" }, using our own IBM874CharCodec, see docs there 0056 { KOI8_REncoding, "KOI8-R" }, 0057 { KOI8_UEncoding, "KOI8-U" } 0058 }; 0059 // TODO: WS2 0060 0061 static bool is8Bit(QTextCodec* codec) 0062 { 0063 bool result = false; 0064 0065 const QByteArray& codecName = codec->name(); 0066 for (const auto& encodingData : encodingDataList) { 0067 if (codecName.compare(encodingData.name) == 0) { 0068 result = true; 0069 break; 0070 } 0071 } 0072 0073 return result; 0074 } 0075 0076 static QTextCodec* createLatin1() 0077 { 0078 return QTextCodec::codecForName(encodingDataList[0].name); 0079 } 0080 0081 /* heuristic seems to be doomed :( 0082 static bool is8Bit( QTextCodec *Codec ) 0083 { 0084 bool Result = true; 0085 0086 // first test different for 0 0087 unsigned char c[4]; 0088 c[0] = 0; 0089 c[1] = c[2] = c[3] = 230; 0090 QString S = Codec->toUnicode( (const char*)&c,4 ); 0091 int Length = 1; 0092 QCString CS = Codec->fromUnicode( S, Length ); 0093 //qCDebug(LOG_OKTETA_CORE) << Codec->name() << " "<<Length ; 0094 if( Length > 0 ) 0095 Result = false; 0096 // test if all chars survive the recoding 0097 else 0098 do { 0099 ++c[0]; 0100 S = Codec->toUnicode( (const char*)&c,4 ); 0101 Length = 1; 0102 CS = Codec->fromUnicode( S, Length ); 0103 //qCDebug(LOG_OKTETA_CORE) << Codec->name() << " "<<c[0]<<"->"<<CS[0]<<":"<<Length ; 0104 if( Length != 1 || (CS[0] != (char)c[0] && CS[0] != QTextCodecWhiteSpace) ) { 0105 Result = false; 0106 break; 0107 } 0108 } while( c[0] < 255 ); 0109 return Result; 0110 } 0111 const QStringList &TextCharCodec::codecNames() 0112 { 0113 // first call? 0114 if( CodecNames.isEmpty() ) { 0115 const QStringList &CharSets = KCharsets::charsets()->availableEncodingNames(); 0116 0117 for( QStringList::ConstIterator it = CharSets.begin(); it != CharSets.end(); ++it ) { 0118 bool Found = true; 0119 QTextCodec* Codec = KCharsets::charsets()->codecForName( *it, Found ); 0120 if( Found && is8Bit(Codec) ) 0121 CodecNames.append( QString::fromLatin1(Codec->name()) ); 0122 } 0123 } 0124 0125 return CodecNames; 0126 } 0127 0128 QString TextCharCodec::nameOfEncoding( CharCoding _char ) 0129 { 0130 TextCharCodec *Codec = 0; 0131 0132 const char* N = 0; 0133 for( unsigned int i=0; i<NoOfEncodings; ++i ) 0134 { 0135 if( EncodingNames[i].Encoding == _char ) 0136 { 0137 N = EncodingNames[i].Name; 0138 break; 0139 } 0140 } 0141 0142 if( N != 0 ) 0143 { 0144 QString CodeName = QString::fromLatin1( N ); 0145 } 0146 return Codec; 0147 } 0148 */ 0149 0150 TextCharCodec* TextCharCodec::createLocalCodec() 0151 { 0152 QTextCodec* codec = QTextCodec::codecForLocale(); 0153 if (!is8Bit(codec)) { 0154 codec = createLatin1(); 0155 } 0156 return new TextCharCodec(codec); 0157 } 0158 0159 TextCharCodec* TextCharCodec::createCodec(const QString& codecName) 0160 { 0161 bool isOk = false; 0162 QTextCodec* codec = QTextCodec::codecForName(codecName.toLatin1()); 0163 if (codec) { 0164 isOk = is8Bit(codec); 0165 } 0166 return isOk ? new TextCharCodec(codec) : nullptr; 0167 } 0168 0169 const QStringList& TextCharCodec::codecNames() 0170 { 0171 static QStringList textCodecNames; 0172 0173 // first call? 0174 if (textCodecNames.isEmpty()) { 0175 for (const auto& encodingData : encodingDataList) { 0176 QTextCodec* codec = QTextCodec::codecForName((encodingData.name)); 0177 if (codec) { 0178 textCodecNames.append(QString::fromLatin1(codec->name())); 0179 } 0180 } 0181 } 0182 0183 return textCodecNames; 0184 } 0185 0186 TextCharCodec::TextCharCodec(QTextCodec* textCodec) 0187 : mCodec(textCodec) 0188 , mDecoder(textCodec->makeDecoder()) 0189 , mEncoder(textCodec->makeEncoder()) 0190 { 0191 } 0192 0193 TextCharCodec::~TextCharCodec() 0194 { 0195 delete mDecoder; 0196 delete mEncoder; 0197 } 0198 0199 bool TextCharCodec::canEncode(QChar _char) const 0200 { 0201 return mCodec->canEncode(_char); 0202 } 0203 0204 bool TextCharCodec::encode(Byte* byte, QChar _char) const 0205 { 0206 if (!mCodec->canEncode(_char)) { // TODO: do we really need the codec? 0207 return false; 0208 } 0209 0210 const QByteArray encoded = mEncoder->fromUnicode(QString(_char)); 0211 if (encoded.size() > 0) { 0212 *byte = encoded.at(0); 0213 return true; 0214 } 0215 0216 return false; 0217 } 0218 0219 Character TextCharCodec::decode(Byte byte) const 0220 { 0221 // QTextCodecs "use this codepoint when input data cannot be represented in Unicode." (Qt docs) 0222 constexpr QChar replacementChar = QChar(QChar::ReplacementCharacter); 0223 const QString string = 0224 mDecoder->toUnicode(reinterpret_cast<const char*>(&byte), 1); 0225 const QChar qchar = string.at(0); 0226 const bool isDecoded = (qchar != replacementChar); 0227 return {qchar, !isDecoded}; 0228 } 0229 0230 QString TextCharCodec::name() const 0231 { 0232 if (mName.isNull()) { 0233 mName = QString::fromLatin1(mCodec->name()); 0234 } 0235 0236 return mName; 0237 } 0238 0239 }