// File indexing completed on 2024-09-01 13:20:52
0001 /* 0002 This file is part of the KDE libraries 0003 0004 SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org> 0005 SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org> 0006 SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net> 0007 0008 SPDX-License-Identifier: LGPL-2.0-or-later 0009 */ 0010 #include "kcharsets.h" 0011 #include "kcharsets_p.h" 0012 #include "kcodecs_debug.h" 0013 0014 #include "kusasciitextcodec.h" 0015 #include <kentities.h> 0016 0017 #include <QHash> 0018 #include <QTextCodec> 0019 0020 #include <algorithm> 0021 #include <assert.h> 0022 0023 /* 0024 * The encoding names (like "ISO 8859-1") in this list are user-visible, 0025 * and should be mostly uppercase. 0026 * Generate with generate_string_table.pl (located in kde-dev-scripts), 0027 * input data: 0028 ISO 8859-1 0029 i18n:Western European 0030 ISO 8859-15 0031 i18n:Western European 0032 ISO 8859-14 0033 i18n:Western European 0034 cp 1252 0035 i18n:Western European 0036 IBM850 0037 i18n:Western European 0038 ISO 8859-2 0039 i18n:Central European 0040 ISO 8859-3 0041 i18n:Central European 0042 ISO 8859-4 0043 i18n:Baltic 0044 ISO 8859-13 0045 i18n:Baltic 0046 ISO 8859-16 0047 i18n:South-Eastern Europe 0048 cp 1250 0049 i18n:Central European 0050 cp 1254 0051 i18n:Turkish 0052 cp 1257 0053 i18n:Baltic 0054 KOI8-R 0055 i18n:Cyrillic 0056 ISO 8859-5 0057 i18n:Cyrillic 0058 cp 1251 0059 i18n:Cyrillic 0060 KOI8-U 0061 i18n:Cyrillic 0062 IBM866 0063 i18n:Cyrillic 0064 Big5 0065 i18n:Chinese Traditional 0066 Big5-HKSCS 0067 i18n:Chinese Traditional 0068 GB18030 0069 i18n:Chinese Simplified 0070 GBK 0071 i18n:Chinese Simplified 0072 GB2312 0073 i18n:Chinese Simplified 0074 EUC-KR 0075 i18n:Korean 0076 windows-949 0077 i18n:Korean 0078 sjis 0079 i18n:Japanese 0080 ISO-2022-JP 0081 i18n:Japanese 0082 EUC-JP 0083 i18n:Japanese 0084 ISO 8859-7 0085 i18n:Greek 0086 cp 1253 0087 i18n:Greek 0088 ISO 8859-6 0089 i18n:Arabic 0090 cp 1256 0091 i18n:Arabic 0092 
ISO 8859-8 0093 i18n:Hebrew 0094 ISO 8859-8-I 0095 i18n:Hebrew 0096 cp 1255 0097 i18n:Hebrew 0098 ISO 8859-9 0099 i18n:Turkish 0100 TIS620 0101 i18n:Thai 0102 ISO 8859-11 0103 i18n:Thai 0104 UTF-8 0105 i18n:Unicode 0106 UTF-16 0107 i18n:Unicode 0108 utf7 0109 i18n:Unicode 0110 ucs2 0111 i18n:Unicode 0112 ISO 10646-UCS-2 0113 i18n:Unicode 0114 windows-1258 0115 i18n:Other 0116 IBM874 0117 i18n:Other 0118 TSCII 0119 i18n:Other 0120 */ 0121 /* 0122 * Notes about the table: 0123 * 0124 * - The following entries were disabled and removed from the table: 0125 ibm852 0126 i18n:Central European 0127 pt 154 0128 i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt 0129 * 0130 * - ISO 8559-11 is the deprecated name of TIS-620 0131 * - utf7 is not in Qt 0132 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2" 0133 * - windows-1258: TODO 0134 * - IBM874: TODO 0135 * - TSCII: TODO 0136 */ 0137 0138 /* 0139 * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that 0140 * statically initialised text should be translated so that it expands to just 0141 * the string that should be translated, making it possible to use it in the 0142 * single string construct below. 0143 */ 0144 #undef QT_TRANSLATE_NOOP3 0145 #define QT_TRANSLATE_NOOP3(a, b, c) b 0146 0147 /* 0148 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND. 0149 * The script used was generate_string_table.pl which can be found in kde-dev-scripts. 0150 * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP. 
0151 */ 0152 0153 static const char language_for_encoding_string[] = 0154 "ISO 8859-1\0" 0155 QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0" 0156 "ISO 8859-15\0" 0157 "ISO 8859-14\0" 0158 "cp 1252\0" 0159 "IBM850\0" 0160 "ISO 8859-2\0" 0161 QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0" 0162 "ISO 8859-3\0" 0163 "ISO 8859-4\0" 0164 QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0" 0165 "ISO 8859-13\0" 0166 "ISO 8859-16\0" 0167 QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0" 0168 "cp 1250\0" 0169 "cp 1254\0" 0170 QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0" 0171 "cp 1257\0" 0172 "KOI8-R\0" 0173 QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0" 0174 "ISO 8859-5\0" 0175 "cp 1251\0" 0176 "KOI8-U\0" 0177 "IBM866\0" 0178 "Big5\0" 0179 QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0" 0180 "Big5-HKSCS\0" 0181 "GB18030\0" 0182 QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0" 0183 "GBK\0" 0184 "GB2312\0" 0185 "EUC-KR\0" 0186 QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0" 0187 "windows-949\0" 0188 "sjis\0" 0189 QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0" 0190 "ISO-2022-JP\0" 0191 "EUC-JP\0" 0192 "ISO 8859-7\0" 0193 QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0" 0194 "cp 1253\0" 0195 "ISO 8859-6\0" 0196 QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0" 0197 "cp 1256\0" 0198 "ISO 8859-8\0" 0199 QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0" 0200 "ISO 8859-8-I\0" 0201 "cp 1255\0" 0202 "ISO 8859-9\0" 0203 "TIS620\0" 0204 QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0" 0205 "ISO 8859-11\0" 0206 "UTF-8\0" 0207 QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item 
Text character set")"\0" 0208 "UTF-16\0" 0209 "utf7\0" 0210 "ucs2\0" 0211 "ISO 10646-UCS-2\0" 0212 "windows-1258\0" 0213 QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0" 0214 "IBM874\0" 0215 "TSCII\0" 0216 "\0"; 0217 0218 static const int language_for_encoding_indices[] = { 0219 0, 11, 28, 11, 40, 11, 52, 11, 60, 11, 67, 78, 95, 78, 106, 117, 124, 117, 136, 148, 169, 78, 177, 185, 193, 117, 201, 208, 217, 208, 228, 0220 208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419, 0221 426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1}; 0222 0223 /* 0224 * GENERATED CODE ENDS HERE 0225 */ 0226 0227 /* 0228 * defines some different names for codecs that are built into Qt. 0229 * The names in this list must be lower-case. 0230 * input data for generate_string_table.pl: 0231 iso-ir-111 0232 koi8-r 0233 koi unified 0234 koi8-r 0235 us-ascii 0236 iso 8859-1 0237 usascii 0238 iso 8859-1 0239 ascii 0240 iso 8859-1 0241 unicode-1-1-utf-7 0242 utf-7 0243 ucs2 0244 iso-10646-ucs-2 0245 iso10646-1 0246 iso-10646-ucs-2 0247 gb18030.2000-1 0248 gb18030 0249 gb18030.2000-0 0250 gb18030 0251 gbk-0 0252 gbk 0253 gb2312 0254 gbk 0255 gb2312.1980-0 0256 gbk 0257 big5-0 0258 big5 0259 euc-kr 0260 euckr 0261 cp 949 0262 windows-949 0263 euc-jp 0264 eucjp 0265 jisx0201.1976-0 0266 eucjp 0267 jisx0208.1983-0 0268 eucjp 0269 jisx0208.1990-0 0270 eucjp 0271 jisx0208.1997-0 0272 eucjp 0273 jisx0212.1990-0 0274 eucjp 0275 jisx0213.2000-1 0276 eucjp 0277 jisx0213.2000-2 0278 eucjp 0279 shift_jis 0280 sjis 0281 shift-jis 0282 sjis 0283 sjis 0284 sjis 0285 iso-2022-jp 0286 jis7 0287 windows850 0288 ibm850 0289 windows866 0290 ibm866 0291 windows-850 0292 ibm850 0293 windows-866 0294 ibm866 0295 cp-10000 0296 apple roman 0297 thai-tis620 0298 iso 8859-11 0299 windows-874 0300 
ibm874 0301 windows874 0302 ibm874 0303 cp-874 0304 ibm874 0305 ksc5601.1987-0 0306 euckr 0307 ks_c_5601-1987 0308 euckr 0309 mac-roman 0310 apple roman 0311 macintosh 0312 apple roman 0313 mac 0314 apple roman 0315 csiso2022jp 0316 iso-2022-jp 0317 */ 0318 /* 0319 * Notes about the table: 0320 * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set) 0321 * - utf7 is not in Qt 0322 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2" 0323 * - sjis: appears on the table for x-sjis 0324 * - jis7: ISO-2022-JP is now the default name in Qt4 0325 * - cp-874: is it really needed? 0326 * - mac-roman: appears on the table for x-mac-roman 0327 * - csiso2022jp: See bug #77243 0328 */ 0329 0330 /* 0331 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND. 0332 * The script used was generate_string_table.pl which can be found in kde-dev-scripts. 0333 */ 0334 0335 static const char builtin_string[] = 0336 "iso-ir-111\0" 0337 "koi8-r\0" 0338 "koi unified\0" 0339 "us-ascii\0" 0340 "iso 8859-1\0" 0341 "usascii\0" 0342 "ascii\0" 0343 "unicode-1-1-utf-7\0" 0344 "utf-7\0" 0345 "ucs2\0" 0346 "iso-10646-ucs-2\0" 0347 "iso10646-1\0" 0348 "gb18030.2000-1\0" 0349 "gb18030\0" 0350 "gb18030.2000-0\0" 0351 "gbk-0\0" 0352 "gbk\0" 0353 "gb2312\0" 0354 "gb2312.1980-0\0" 0355 "big5-0\0" 0356 "big5\0" 0357 "euc-kr\0" 0358 "euckr\0" 0359 "cp 949\0" 0360 "windows-949\0" 0361 "euc-jp\0" 0362 "eucjp\0" 0363 "jisx0201.1976-0\0" 0364 "jisx0208.1983-0\0" 0365 "jisx0208.1990-0\0" 0366 "jisx0208.1997-0\0" 0367 "jisx0212.1990-0\0" 0368 "jisx0213.2000-1\0" 0369 "jisx0213.2000-2\0" 0370 "shift_jis\0" 0371 "sjis\0" 0372 "shift-jis\0" 0373 "iso-2022-jp\0" 0374 "jis7\0" 0375 "windows850\0" 0376 "ibm850\0" 0377 "windows866\0" 0378 "ibm866\0" 0379 "windows-850\0" 0380 "windows-866\0" 0381 "cp-10000\0" 0382 "apple roman\0" 0383 "thai-tis620\0" 0384 "iso 8859-11\0" 0385 "windows-874\0" 0386 "ibm874\0" 0387 "windows874\0" 0388 "cp-874\0" 0389 
"ksc5601.1987-0\0" 0390 "ks_c_5601-1987\0" 0391 "mac-roman\0" 0392 "macintosh\0" 0393 "mac\0" 0394 "csiso2022jp\0" 0395 "\0"; 0396 0397 static const int builtin_indices[] = {0, 11, 18, 11, 30, 39, 50, 39, 58, 39, 64, 82, 88, 93, 109, 93, 120, 135, 143, 135, 158, 164, 0398 168, 164, 175, 164, 189, 196, 201, 208, 214, 221, 233, 240, 246, 240, 262, 240, 278, 240, 294, 240, 310, 240, 0399 326, 240, 342, 240, 358, 368, 373, 368, 368, 368, 383, 395, 400, 411, 418, 429, 436, 411, 448, 429, 460, 469, 0400 481, 493, 505, 517, 524, 517, 535, 517, 542, 208, 557, 208, 572, 469, 582, 469, 592, 469, 596, 383, -1}; 0401 0402 /* 0403 * GENERATED CODE ENDS HERE 0404 */ 0405 0406 /* 0407 * some last resort hints in case the charmap file couldn't be found. 0408 * This gives at least a partial conversion and helps making things readable. 0409 * 0410 * the name used as input here is already converted to the more canonical 0411 * name as defined in the aliases array. 0412 * 0413 * Input data: 0414 cp1250 0415 iso-8859-2 0416 koi8-r 0417 iso-8859-5 0418 koi8-u 0419 koi8-r 0420 pt 154 0421 windows-1251 0422 paratype-154 0423 windows-1251 0424 pt-154 0425 windows-1251 0426 */ 0427 /* Notes: 0428 * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback. 0429 */ 0430 0431 /* 0432 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND. 0433 * The script used was generate_string_table.pl which can be found in kde-dev-scripts. 
0434 */ 0435 0436 static const char conversion_hints_string[] = 0437 "cp1250\0" 0438 "iso-8859-2\0" 0439 "koi8-r\0" 0440 "iso-8859-5\0" 0441 "koi8-u\0" 0442 "pt 154\0" 0443 "windows-1251\0" 0444 "paratype-154\0" 0445 "pt-154\0" 0446 "\0"; 0447 0448 static const int conversion_hints_indices[] = {0, 7, 18, 25, 36, 18, 43, 50, 63, 50, 76, 50, -1}; 0449 0450 /* 0451 * GENERATED CODE ENDS HERE 0452 */ 0453 0454 struct KCharsetsSingletonPrivate { 0455 KCharsets instance; 0456 }; 0457 0458 Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets) 0459 0460 // search an array of items index/data, find first matching index 0461 // and return data, or return 0 0462 static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry) 0463 { 0464 for (int i = 0; indices[i] != -1; i += 2) { 0465 if (qstrcmp(start + indices[i], entry) == 0) { 0466 return start + indices[i + 1]; 0467 } 0468 } 0469 return nullptr; 0470 } 0471 0472 bool KCharsetsPrivate::isUsAsciiTextCodecRequest(const QByteArray &name) const 0473 { 0474 if (usAsciiTextCodec->name().compare(name, Qt::CaseInsensitive) == 0) { 0475 return true; 0476 } 0477 const QList<QByteArray> aliases = usAsciiTextCodec->aliases(); 0478 return std::any_of(aliases.constBegin(), aliases.constEnd(), [name](const QByteArray &aliasName) { 0479 return (aliasName.compare(name, Qt::CaseInsensitive) == 0); 0480 }); 0481 } 0482 0483 // -------------------------------------------------------------------------- 0484 0485 KCharsets::KCharsets() 0486 : d(new KCharsetsPrivate(this)) 0487 { 0488 } 0489 0490 KCharsets::~KCharsets() = default; 0491 0492 QChar KCharsets::fromEntity(const QString &str) 0493 { 0494 QChar res = QChar::Null; 0495 0496 if (str.isEmpty()) { 0497 return QChar::Null; 0498 } 0499 0500 int pos = 0; 0501 if (str[pos] == QLatin1Char('&')) { 0502 pos++; 0503 } 0504 0505 // Check for '�' or '�' sequence 0506 if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) { 0507 bool ok; 0508 
pos++; 0509 if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) { 0510 pos++; 0511 // '�', hexadecimal character reference 0512 const QString tmp(str.mid(pos)); 0513 res = QChar(tmp.toInt(&ok, 16)); 0514 } else { 0515 // '�', decimal character reference 0516 const QString tmp(str.mid(pos)); 0517 res = QChar(tmp.toInt(&ok, 10)); 0518 } 0519 if (ok) { 0520 return res; 0521 } else { 0522 return QChar::Null; 0523 } 0524 } 0525 0526 const QByteArray raw(str.toLatin1()); 0527 const entity *e = KCodecsEntities::kde_findEntity(raw.data(), raw.length()); 0528 0529 if (!e) { 0530 // qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length(); 0531 return QChar::Null; 0532 } 0533 // qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code; 0534 0535 return QChar(e->code); 0536 } 0537 0538 QChar KCharsets::fromEntity(const QString &str, int &len) 0539 { 0540 // entities are never longer than 8 chars... we start from 0541 // that length and work backwards... 0542 len = 8; 0543 while (len > 0) { 0544 QString tmp = str.left(len); 0545 QChar res = fromEntity(tmp); 0546 if (res != QChar::Null) { 0547 return res; 0548 } 0549 len--; 0550 } 0551 return QChar::Null; 0552 } 0553 0554 QString KCharsets::toEntity(const QChar &ch) 0555 { 0556 return QString::asprintf("�x%x;", ch.unicode()); 0557 } 0558 0559 QString KCharsets::resolveEntities(const QString &input) 0560 { 0561 QString text = input; 0562 const QChar *p = text.unicode(); 0563 const QChar *end = p + text.length(); 0564 const QChar *ampersand = nullptr; 0565 bool scanForSemicolon = false; 0566 0567 for (; p < end; ++p) { 0568 const QChar ch = *p; 0569 0570 if (ch == QLatin1Char('&')) { 0571 ampersand = p; 0572 scanForSemicolon = true; 0573 continue; 0574 } 0575 0576 if (ch != QLatin1Char(';') || scanForSemicolon == false) { 0577 continue; 0578 } 0579 0580 assert(ampersand); 0581 0582 scanForSemicolon = false; 0583 0584 const QChar *entityBegin = ampersand + 1; 0585 0586 const uint 
entityLength = p - entityBegin; 0587 if (entityLength == 0) { 0588 continue; 0589 } 0590 0591 const QChar entityValue = KCharsets::fromEntity(QString(entityBegin, entityLength)); 0592 if (entityValue.isNull()) { 0593 continue; 0594 } 0595 0596 const uint ampersandPos = ampersand - text.unicode(); 0597 0598 text[(int)ampersandPos] = entityValue; 0599 text.remove(ampersandPos + 1, entityLength + 1); 0600 p = text.unicode() + ampersandPos; 0601 end = text.unicode() + text.length(); 0602 ampersand = nullptr; 0603 } 0604 0605 return text; 0606 } 0607 0608 QStringList KCharsets::availableEncodingNames() const 0609 { 0610 QStringList available; 0611 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { 0612 available.append(QString::fromUtf8(language_for_encoding_string + *p)); 0613 } 0614 available.sort(); 0615 return available; 0616 } 0617 0618 QString KCharsets::descriptionForEncoding(const QString &encoding) const 0619 { 0620 const char *lang = kcharsets_array_search(language_for_encoding_string, language_for_encoding_indices, encoding.toUtf8().data()); 0621 if (lang) { 0622 return tr("%1 ( %2 )", "@item %1 character set, %2 encoding").arg(tr(lang, "@item Text character set"), encoding); 0623 } else { 0624 return tr("Other encoding (%1)", "@item").arg(encoding); 0625 } 0626 } 0627 0628 QString KCharsets::encodingForName(const QString &descriptiveName) const 0629 { 0630 const int left = descriptiveName.lastIndexOf(QLatin1Char('(')); 0631 0632 if (left < 0) { // No parenthesis, so assume it is a normal encoding name 0633 return descriptiveName.trimmed(); 0634 } 0635 0636 QString name(descriptiveName.mid(left + 1)); 0637 0638 const int right = name.lastIndexOf(QLatin1Char(')')); 0639 0640 if (right < 0) { 0641 return name; 0642 } 0643 0644 return name.left(right).trimmed(); 0645 } 0646 0647 QStringList KCharsets::descriptiveEncodingNames() const 0648 { 0649 QStringList encodings; 0650 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { 
0651 const QString name = QString::fromUtf8(language_for_encoding_string + p[0]); 0652 const QString description = tr(language_for_encoding_string + p[1], "@item Text character set"); 0653 encodings.append(tr("%1 ( %2 )", "@item Text encoding: %1 character set, %2 encoding").arg(description, name)); 0654 } 0655 encodings.sort(); 0656 return encodings; 0657 } 0658 0659 QList<QStringList> KCharsets::encodingsByScript() const 0660 { 0661 if (!d->encodingsByScript.isEmpty()) { 0662 return d->encodingsByScript; 0663 } 0664 int i; 0665 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { 0666 const QString name = QString::fromUtf8(language_for_encoding_string + p[0]); 0667 const QString description = tr(language_for_encoding_string + p[1], "@item Text character set"); 0668 0669 for (i = 0; i < d->encodingsByScript.size(); ++i) { 0670 if (d->encodingsByScript.at(i).at(0) == description) { 0671 d->encodingsByScript[i].append(name); 0672 break; 0673 } 0674 } 0675 0676 if (i == d->encodingsByScript.size()) { 0677 d->encodingsByScript.append(QStringList() << description << name); 0678 } 0679 } 0680 return d->encodingsByScript; 0681 } 0682 0683 #if KCODECS_BUILD_DEPRECATED_SINCE(5, 101) 0684 QTextCodec *KCharsets::codecForName(const QString &n) const 0685 { 0686 return d->codecForName(n); 0687 } 0688 #endif 0689 0690 QTextCodec *KCharsetsPrivate::codecForName(const QString &n) 0691 { 0692 if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) { 0693 return QTextCodec::codecForName("gb18030"); 0694 } 0695 const QByteArray name(n.toLatin1()); 0696 QTextCodec *codec = codecForNameOrNull(name); 0697 if (codec) { 0698 return codec; 0699 } else { 0700 return QTextCodec::codecForName("iso-8859-1"); 0701 } 0702 } 0703 0704 #if KCODECS_BUILD_DEPRECATED_SINCE(5, 101) 0705 QTextCodec *KCharsets::codecForName(const QString &n, bool &ok) const 0706 { 0707 return d->codecForName(n, ok); 0708 }; 0709 #endif 0710 0711 QTextCodec *KCharsetsPrivate::codecForName(const 
QString &n, bool &ok) 0712 { 0713 if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) { 0714 ok = true; 0715 return QTextCodec::codecForName("gb18030"); 0716 } 0717 const QByteArray name(n.toLatin1()); 0718 QTextCodec *codec = codecForNameOrNull(name); 0719 if (codec) { 0720 ok = true; 0721 return codec; 0722 } else { 0723 ok = false; 0724 return QTextCodec::codecForName("iso-8859-1"); 0725 } 0726 } 0727 0728 #if KCODECS_BUILD_DEPRECATED_SINCE(5, 101) 0729 QTextCodec *KCharsets::codecForNameOrNull(const QByteArray &n) const 0730 { 0731 return d->codecForNameOrNull(n); 0732 } 0733 #endif 0734 0735 QTextCodec *KCharsetsPrivate::codecForNameOrNull(const QByteArray &n) 0736 { 0737 QTextCodec *codec = nullptr; 0738 0739 if (n.isEmpty()) { 0740 // TODO: Any better ideas ? 0741 // No name, assume system locale 0742 const QByteArray locale = "->locale<-"; 0743 if (codecForNameDict.contains(locale)) { 0744 return codecForNameDict.value(locale); 0745 } 0746 codec = QTextCodec::codecForLocale(); 0747 codecForNameDict.insert("->locale<-", codec); 0748 return codec; 0749 } 0750 // For a non-empty name, lookup the "dictionary", in a case-sensitive way. 0751 else if (codecForNameDict.contains(n)) { 0752 return codecForNameDict.value(n); 0753 } 0754 0755 // If the name is not in the hash table, 0756 // first check ourselves if our fixed variant of a US-ASCII codec should be returned: 0757 // API docs of QTextCodec do not specify the handling of custom codec instances 0758 // on look-up when there are multiple codecs supporting the same name. 0759 // The code of Qt 5.15 prepends custom instances to the internal list, 0760 // so they would be preferred initially. 0761 // But the code also has a look-up cache which does not get updated on new instances, 0762 // so if somewhere a US-ASCII codec was requested by some other code before 0763 // our KUsAsciiTextCodec instance gets created, the Qt-built-in will be always 0764 // picked instead, at least for the used name. 
0765 // So we cannot rely on the internal mechanisms, but have to prefer our codec ourselves. 0766 if (isUsAsciiTextCodecRequest(n)) { 0767 codec = usAsciiTextCodec; 0768 } else { 0769 // call directly QTextCodec::codecForName. 0770 // We assume that QTextCodec is smarter and more maintained than this code. 0771 codec = QTextCodec::codecForName(n); 0772 } 0773 0774 if (codec) { 0775 codecForNameDict.insert(n, codec); 0776 return codec; 0777 } 0778 0779 // We have had no luck with QTextCodec::codecForName, so we must now process the name, so that QTextCodec::codecForName could work with it. 0780 0781 QByteArray name = n.toLower(); 0782 bool changed = false; 0783 if (name.endsWith("_charset")) { // krazy:exclude=strings 0784 name.chop(8); 0785 changed = true; 0786 } 0787 if (name.startsWith("x-")) { // krazy:exclude=strings 0788 name.remove(0, 2); // remove x- at start 0789 changed = true; 0790 } 0791 0792 if (name.isEmpty()) { 0793 // We have no name anymore, therefore the name is invalid. 0794 return nullptr; 0795 } 0796 0797 // We only need to check changed names. 0798 if (changed) { 0799 codec = QTextCodec::codecForName(name); 0800 if (codec) { 0801 codecForNameDict.insert(n, codec); 0802 return codec; 0803 } 0804 } 0805 0806 // these codecs are built into Qt, but the name given for the codec is different, 0807 // so QTextCodec did not recognize it. 0808 QByteArray cname = kcharsets_array_search(builtin_string, builtin_indices, name.data()); 0809 0810 if (!cname.isEmpty()) { 0811 codec = QTextCodec::codecForName(cname); 0812 } 0813 0814 if (codec) { 0815 codecForNameDict.insert(n, codec); 0816 return codec; 0817 } 0818 0819 // this also failed, the last resort is now to take some compatibility charmap 0820 // ### TODO: while emergency conversions might be useful at read, it is not sure if they should be done if the application plans to write. 
0821 cname = kcharsets_array_search(conversion_hints_string, conversion_hints_indices, name.data()); 0822 0823 if (!cname.isEmpty()) { 0824 codec = QTextCodec::codecForName(cname); 0825 if (codec) { 0826 codecForNameDict.insert(n, codec); 0827 return codec; 0828 } 0829 } 0830 0831 // we could not assign a codec, therefore return NULL 0832 return nullptr; 0833 } 0834 0835 KCharsets *KCharsets::charsets() 0836 { 0837 return &globalCharsets()->instance; 0838 }