kcodecs/src/kcharsets.cpp

0001 /*
0002     This file is part of the KDE libraries
0003
0004     SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org>
0005     SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
0006     SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net>
0007
0008     SPDX-License-Identifier: LGPL-2.0-or-later
0009 */
0010 #include "kcharsets.h"
0011 #include "kcharsets_p.h"
0012 #include "kcodecs_debug.h"
0013
0014 #include "kusasciitextcodec.h"
0015 #include <kentities.h>
0016
0017 #include <QHash>
0018 #include <QTextCodec>
0019
0020 #include <algorithm>
0021 #include <assert.h>
0022
0023 /*
0024  * The encoding names (like "ISO 8859-1") in this list are user-visible,
0025  * and should be mostly uppercase.
0026  * Generate with generate_string_table.pl (located in kde-dev-scripts),
0027  * input data:
0028 ISO 8859-1
0029 i18n:Western European
0030 ISO 8859-15
0031 i18n:Western European
0032 ISO 8859-14
0033 i18n:Western European
0034 cp 1252
0035 i18n:Western European
0036 IBM850
0037 i18n:Western European
0038 ISO 8859-2
0039 i18n:Central European
0040 ISO 8859-3
0041 i18n:Central European
0042 ISO 8859-4
0043 i18n:Baltic
0044 ISO 8859-13
0045 i18n:Baltic
0046 ISO 8859-16
0047 i18n:South-Eastern Europe
0048 cp 1250
0049 i18n:Central European
0050 cp 1254
0051 i18n:Turkish
0052 cp 1257
0053 i18n:Baltic
0054 KOI8-R
0055 i18n:Cyrillic
0056 ISO 8859-5
0057 i18n:Cyrillic
0058 cp 1251
0059 i18n:Cyrillic
0060 KOI8-U
0061 i18n:Cyrillic
0062 IBM866
0063 i18n:Cyrillic
0064 Big5
0065 i18n:Chinese Traditional
0066 Big5-HKSCS
0067 i18n:Chinese Traditional
0068 GB18030
0069 i18n:Chinese Simplified
0070 GBK
0071 i18n:Chinese Simplified
0072 GB2312
0073 i18n:Chinese Simplified
0074 EUC-KR
0075 i18n:Korean
0076 windows-949
0077 i18n:Korean
0078 sjis
0079 i18n:Japanese
0080 ISO-2022-JP
0081 i18n:Japanese
0082 EUC-JP
0083 i18n:Japanese
0084 ISO 8859-7
0085 i18n:Greek
0086 cp 1253
0087 i18n:Greek
0088 ISO 8859-6
0089 i18n:Arabic
0090 cp 1256
0091 i18n:Arabic
0092 ISO 8859-8
0093 i18n:Hebrew
0094 ISO 8859-8-I
0095 i18n:Hebrew
0096 cp 1255
0097 i18n:Hebrew
0098 ISO 8859-9
0099 i18n:Turkish
0100 TIS620
0101 i18n:Thai
0102 ISO 8859-11
0103 i18n:Thai
0104 UTF-8
0105 i18n:Unicode
0106 UTF-16
0107 i18n:Unicode
0108 utf7
0109 i18n:Unicode
0110 ucs2
0111 i18n:Unicode
0112 ISO 10646-UCS-2
0113 i18n:Unicode
0114 windows-1258
0115 i18n:Other
0116 IBM874
0117 i18n:Other
0118 TSCII
0119 i18n:Other
0120  */
0121 /*
0122  * Notes about the table:
0123  *
0124  * - The following entries were disabled and removed from the table:
0125 ibm852
0126 i18n:Central European
0127 pt 154
0128 i18n:Cyrillic              // ### TODO "PT 154" seems to have been removed from Qt
0129  *
 * - ISO 8859-11 is the deprecated name of TIS-620
0131  * - utf7 is not in Qt
0132  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
0133  * - windows-1258: TODO
0134  * - IBM874: TODO
0135  * - TSCII: TODO
0136  */
0137
0138 /*
0139  * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that
0140  * statically initialised text should be translated so that it expands to just
0141  * the string that should be translated, making it possible to use it in the
0142  * single string construct below.
0143  */
// Drop Qt's definition and keep only the second argument (the source text),
// so that each use below contributes a plain string literal that can be
// concatenated with the neighbouring "\0"-terminated literals.
#undef QT_TRANSLATE_NOOP3
#define QT_TRANSLATE_NOOP3(a, b, c) b
0146
0147 /*
0148  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
0149  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
0150  * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP.
0151  */
0152
// Blob of NUL-terminated strings: user-visible encoding names interleaved with
// the script/language descriptions they belong to. A description is stored only
// once (after the first encoding that uses it) and shared through the index table.
static const char language_for_encoding_string[] =
    "ISO 8859-1\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0"
    "ISO 8859-15\0"
    "ISO 8859-14\0"
    "cp 1252\0"
    "IBM850\0"
    "ISO 8859-2\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0"
    "ISO 8859-3\0"
    "ISO 8859-4\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0"
    "ISO 8859-13\0"
    "ISO 8859-16\0"
    QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0"
    "cp 1250\0"
    "cp 1254\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0"
    "cp 1257\0"
    "KOI8-R\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0"
    "ISO 8859-5\0"
    "cp 1251\0"
    "KOI8-U\0"
    "IBM866\0"
    "Big5\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0"
    "Big5-HKSCS\0"
    "GB18030\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0"
    "GBK\0"
    "GB2312\0"
    "EUC-KR\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0"
    "windows-949\0"
    "sjis\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0"
    "ISO-2022-JP\0"
    "EUC-JP\0"
    "ISO 8859-7\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0"
    "cp 1253\0"
    "ISO 8859-6\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0"
    "cp 1256\0"
    "ISO 8859-8\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0"
    "ISO 8859-8-I\0"
    "cp 1255\0"
    "ISO 8859-9\0"
    "TIS620\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0"
    "ISO 8859-11\0"
    "UTF-8\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item Text character set")"\0"
    "UTF-16\0"
    "utf7\0"
    "ucs2\0"
    "ISO 10646-UCS-2\0"
    "windows-1258\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0"
    "IBM874\0"
    "TSCII\0"
    "\0";

// Pairs of byte offsets into language_for_encoding_string:
// even slot = encoding name, odd slot = its description. The list ends with -1.
static const int language_for_encoding_indices[] = {
    0,   11,  28,  11,  40,  11,  52,  11,  60,  11,  67,  78,  95,  78,  106, 117, 124, 117, 136, 148, 169, 78,  177, 185, 193, 117, 201, 208, 217, 208, 228,
    208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419,
    426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1};
0222
0223 /*
0224  * GENERATED CODE ENDS HERE
0225  */
0226
0227 /*
0228  * defines some different names for codecs that are built into Qt.
0229  * The names in this list must be lower-case.
0230  * input data for generate_string_table.pl:
0231 iso-ir-111
0232 koi8-r
0233 koi unified
0234 koi8-r
0235 us-ascii
0236 iso 8859-1
0237 usascii
0238 iso 8859-1
0239 ascii
0240 iso 8859-1
0241 unicode-1-1-utf-7
0242 utf-7
0243 ucs2
0244 iso-10646-ucs-2
0245 iso10646-1
0246 iso-10646-ucs-2
0247 gb18030.2000-1
0248 gb18030
0249 gb18030.2000-0
0250 gb18030
0251 gbk-0
0252 gbk
0253 gb2312
0254 gbk
0255 gb2312.1980-0
0256 gbk
0257 big5-0
0258 big5
0259 euc-kr
0260 euckr
0261 cp 949
0262 windows-949
0263 euc-jp
0264 eucjp
0265 jisx0201.1976-0
0266 eucjp
0267 jisx0208.1983-0
0268 eucjp
0269 jisx0208.1990-0
0270 eucjp
0271 jisx0208.1997-0
0272 eucjp
0273 jisx0212.1990-0
0274 eucjp
0275 jisx0213.2000-1
0276 eucjp
0277 jisx0213.2000-2
0278 eucjp
0279 shift_jis
0280 sjis
0281 shift-jis
0282 sjis
0283 sjis
0284 sjis
0285 iso-2022-jp
0286 jis7
0287 windows850
0288 ibm850
0289 windows866
0290 ibm866
0291 windows-850
0292 ibm850
0293 windows-866
0294 ibm866
0295 cp-10000
0296 apple roman
0297 thai-tis620
0298 iso 8859-11
0299 windows-874
0300 ibm874
0301 windows874
0302 ibm874
0303 cp-874
0304 ibm874
0305 ksc5601.1987-0
0306 euckr
0307 ks_c_5601-1987
0308 euckr
0309 mac-roman
0310 apple roman
0311 macintosh
0312 apple roman
0313 mac
0314 apple roman
0315 csiso2022jp
0316 iso-2022-jp
0317 */
0318 /*
0319  * Notes about the table:
0320  * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
0321  * - utf7 is not in Qt
0322  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
0323  * - sjis: appears on the table for x-sjis
0324  * - jis7: ISO-2022-JP is now the default name in Qt4
0325  * - cp-874: is it really needed?
0326  * - mac-roman: appears on the table for x-mac-roman
0327  * - csiso2022jp: See bug #77243
0328  */
0329
0330 /*
0331  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
0332  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
0333  */
0334
// Blob of NUL-terminated, lower-case strings: alternative codec names together
// with the names they map to. Each distinct string is stored once and shared
// through the index table.
static const char builtin_string[] =
    "iso-ir-111\0"
    "koi8-r\0"
    "koi unified\0"
    "us-ascii\0"
    "iso 8859-1\0"
    "usascii\0"
    "ascii\0"
    "unicode-1-1-utf-7\0"
    "utf-7\0"
    "ucs2\0"
    "iso-10646-ucs-2\0"
    "iso10646-1\0"
    "gb18030.2000-1\0"
    "gb18030\0"
    "gb18030.2000-0\0"
    "gbk-0\0"
    "gbk\0"
    "gb2312\0"
    "gb2312.1980-0\0"
    "big5-0\0"
    "big5\0"
    "euc-kr\0"
    "euckr\0"
    "cp 949\0"
    "windows-949\0"
    "euc-jp\0"
    "eucjp\0"
    "jisx0201.1976-0\0"
    "jisx0208.1983-0\0"
    "jisx0208.1990-0\0"
    "jisx0208.1997-0\0"
    "jisx0212.1990-0\0"
    "jisx0213.2000-1\0"
    "jisx0213.2000-2\0"
    "shift_jis\0"
    "sjis\0"
    "shift-jis\0"
    "iso-2022-jp\0"
    "jis7\0"
    "windows850\0"
    "ibm850\0"
    "windows866\0"
    "ibm866\0"
    "windows-850\0"
    "windows-866\0"
    "cp-10000\0"
    "apple roman\0"
    "thai-tis620\0"
    "iso 8859-11\0"
    "windows-874\0"
    "ibm874\0"
    "windows874\0"
    "cp-874\0"
    "ksc5601.1987-0\0"
    "ks_c_5601-1987\0"
    "mac-roman\0"
    "macintosh\0"
    "mac\0"
    "csiso2022jp\0"
    "\0";

// Pairs of byte offsets into builtin_string:
// even slot = name to look up, odd slot = replacement codec name. The list ends with -1.
static const int builtin_indices[] = {0,   11,  18,  11,  30,  39,  50,  39,  58,  39,  64,  82,  88,  93,  109, 93,  120, 135, 143, 135, 158, 164,
                                      168, 164, 175, 164, 189, 196, 201, 208, 214, 221, 233, 240, 246, 240, 262, 240, 278, 240, 294, 240, 310, 240,
                                      326, 240, 342, 240, 358, 368, 373, 368, 368, 368, 383, 395, 400, 411, 418, 429, 436, 411, 448, 429, 460, 469,
                                      481, 493, 505, 517, 524, 517, 535, 517, 542, 208, 557, 208, 572, 469, 582, 469, 592, 469, 596, 383, -1};
0401
0402 /*
0403  * GENERATED CODE ENDS HERE
0404  */
0405
0406 /*
0407  * some last resort hints in case the charmap file couldn't be found.
0408  * This gives at least a partial conversion and helps making things readable.
0409  *
0410  * the name used as input here is already converted to the more canonical
0411  * name as defined in the aliases array.
0412  *
0413  * Input data:
0414 cp1250
0415 iso-8859-2
0416 koi8-r
0417 iso-8859-5
0418 koi8-u
0419 koi8-r
0420 pt 154
0421 windows-1251
0422 paratype-154
0423 windows-1251
0424 pt-154
0425 windows-1251
0426  */
0427 /* Notes:
0428  * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback.
0429  */
0430
0431 /*
0432  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
0433  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
0434  */
0435
// Blob of NUL-terminated strings: codec names and the fallback codec names
// used as a last resort for them. Each distinct string is stored once.
static const char conversion_hints_string[] =
    "cp1250\0"
    "iso-8859-2\0"
    "koi8-r\0"
    "iso-8859-5\0"
    "koi8-u\0"
    "pt 154\0"
    "windows-1251\0"
    "paratype-154\0"
    "pt-154\0"
    "\0";

// Pairs of byte offsets into conversion_hints_string:
// even slot = name to look up, odd slot = fallback codec name. The list ends with -1.
static const int conversion_hints_indices[] = {0, 7, 18, 25, 36, 18, 43, 50, 63, 50, 76, 50, -1};
0449
0450 /*
0451  * GENERATED CODE ENDS HERE
0452  */
0453
// Holder for the single KCharsets object that KCharsets::charsets() hands out.
// NOTE(review): presumably a wrapper struct is needed because Q_GLOBAL_STATIC
// cannot construct KCharsets directly - TODO confirm against kcharsets.h.
struct KCharsetsSingletonPrivate {
    KCharsets instance;
};

// Global accessor globalCharsets() for the holder above.
Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets)
0459
0460 // search an array of items index/data, find first matching index
0461 // and return data, or return 0
0462 static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
0463 {
0464     for (int i = 0; indices[i] != -1; i += 2) {
0465         if (qstrcmp(start + indices[i], entry) == 0) {
0466             return start + indices[i + 1];
0467         }
0468     }
0469     return nullptr;
0470 }
0471
0472 bool KCharsetsPrivate::isUsAsciiTextCodecRequest(const QByteArray &name) const
0473 {
0474     if (usAsciiTextCodec->name().compare(name, Qt::CaseInsensitive) == 0) {
0475         return true;
0476     }
0477     const QList<QByteArray> aliases = usAsciiTextCodec->aliases();
0478     return std::any_of(aliases.constBegin(), aliases.constEnd(), [name](const QByteArray &aliasName) {
0479         return (aliasName.compare(name, Qt::CaseInsensitive) == 0);
0480     });
0481 }
0482
0483 // --------------------------------------------------------------------------
0484
// Creates the private data object and hands it a pointer back to this instance.
KCharsets::KCharsets()
    : d(new KCharsetsPrivate(this))
{
}
0489
// Defaulted destructor, defined in this translation unit.
KCharsets::~KCharsets() = default;
0491
0492 QChar KCharsets::fromEntity(const QString &str)
0493 {
0494     QChar res = QChar::Null;
0495
0496     if (str.isEmpty()) {
0497         return QChar::Null;
0498     }
0499
0500     int pos = 0;
0501     if (str[pos] == QLatin1Char('&')) {
0502         pos++;
0503     }
0504
0505     // Check for '&#000' or '&#x0000' sequence
0506     if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) {
0507         bool ok;
0508         pos++;
0509         if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) {
0510             pos++;
0511             // '&#x0000', hexadecimal character reference
0512             const QString tmp(str.mid(pos));
0513             res = QChar(tmp.toInt(&ok, 16));
0514         } else {
0515             //  '&#0000', decimal character reference
0516             const QString tmp(str.mid(pos));
0517             res = QChar(tmp.toInt(&ok, 10));
0518         }
0519         if (ok) {
0520             return res;
0521         } else {
0522             return QChar::Null;
0523         }
0524     }
0525
0526     const QByteArray raw(str.toLatin1());
0527     const entity *e = KCodecsEntities::kde_findEntity(raw.data(), raw.length());
0528
0529     if (!e) {
0530         // qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length();
0531         return QChar::Null;
0532     }
0533     // qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code;
0534
0535     return QChar(e->code);
0536 }
0537
0538 QChar KCharsets::fromEntity(const QString &str, int &len)
0539 {
0540     // entities are never longer than 8 chars... we start from
0541     // that length and work backwards...
0542     len = 8;
0543     while (len > 0) {
0544         QString tmp = str.left(len);
0545         QChar res = fromEntity(tmp);
0546         if (res != QChar::Null) {
0547             return res;
0548         }
0549         len--;
0550     }
0551     return QChar::Null;
0552 }
0553
0554 QString KCharsets::toEntity(const QChar &ch)
0555 {
0556     return QString::asprintf("&#0x%x;", ch.unicode());
0557 }
0558
0559 QString KCharsets::resolveEntities(const QString &input)
0560 {
0561     QString text = input;
0562     const QChar *p = text.unicode();
0563     const QChar *end = p + text.length();
0564     const QChar *ampersand = nullptr;
0565     bool scanForSemicolon = false;
0566
0567     for (; p < end; ++p) {
0568         const QChar ch = *p;
0569
0570         if (ch == QLatin1Char('&')) {
0571             ampersand = p;
0572             scanForSemicolon = true;
0573             continue;
0574         }
0575
0576         if (ch != QLatin1Char(';') || scanForSemicolon == false) {
0577             continue;
0578         }
0579
0580         assert(ampersand);
0581
0582         scanForSemicolon = false;
0583
0584         const QChar *entityBegin = ampersand + 1;
0585
0586         const uint entityLength = p - entityBegin;
0587         if (entityLength == 0) {
0588             continue;
0589         }
0590
0591         const QChar entityValue = KCharsets::fromEntity(QString(entityBegin, entityLength));
0592         if (entityValue.isNull()) {
0593             continue;
0594         }
0595
0596         const uint ampersandPos = ampersand - text.unicode();
0597
0598         text[(int)ampersandPos] = entityValue;
0599         text.remove(ampersandPos + 1, entityLength + 1);
0600         p = text.unicode() + ampersandPos;
0601         end = text.unicode() + text.length();
0602         ampersand = nullptr;
0603     }
0604
0605     return text;
0606 }
0607
0608 QStringList KCharsets::availableEncodingNames() const
0609 {
0610     QStringList available;
0611     for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
0612         available.append(QString::fromUtf8(language_for_encoding_string + *p));
0613     }
0614     available.sort();
0615     return available;
0616 }
0617
0618 QString KCharsets::descriptionForEncoding(const QString &encoding) const
0619 {
0620     const char *lang = kcharsets_array_search(language_for_encoding_string, language_for_encoding_indices, encoding.toUtf8().data());
0621     if (lang) {
0622         return tr("%1 ( %2 )", "@item %1 character set, %2 encoding").arg(tr(lang, "@item Text character set"), encoding);
0623     } else {
0624         return tr("Other encoding (%1)", "@item").arg(encoding);
0625     }
0626 }
0627
0628 QString KCharsets::encodingForName(const QString &descriptiveName) const
0629 {
0630     const int left = descriptiveName.lastIndexOf(QLatin1Char('('));
0631
0632     if (left < 0) { // No parenthesis, so assume it is a normal encoding name
0633         return descriptiveName.trimmed();
0634     }
0635
0636     QString name(descriptiveName.mid(left + 1));
0637
0638     const int right = name.lastIndexOf(QLatin1Char(')'));
0639
0640     if (right < 0) {
0641         return name;
0642     }
0643
0644     return name.left(right).trimmed();
0645 }
0646
0647 QStringList KCharsets::descriptiveEncodingNames() const
0648 {
0649     QStringList encodings;
0650     for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
0651         const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
0652         const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
0653         encodings.append(tr("%1 ( %2 )", "@item Text encoding: %1 character set, %2 encoding").arg(description, name));
0654     }
0655     encodings.sort();
0656     return encodings;
0657 }
0658
0659 QList<QStringList> KCharsets::encodingsByScript() const
0660 {
0661     if (!d->encodingsByScript.isEmpty()) {
0662         return d->encodingsByScript;
0663     }
0664     int i;
0665     for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
0666         const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
0667         const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
0668
0669         for (i = 0; i < d->encodingsByScript.size(); ++i) {
0670             if (d->encodingsByScript.at(i).at(0) == description) {
0671                 d->encodingsByScript[i].append(name);
0672                 break;
0673             }
0674         }
0675
0676         if (i == d->encodingsByScript.size()) {
0677             d->encodingsByScript.append(QStringList() << description << name);
0678         }
0679     }
0680     return d->encodingsByScript;
0681 }
0682
#if KCODECS_BUILD_DEPRECATED_SINCE(5, 101)
// Deprecated public entry point (only built while the 5.101 deprecation is
// enabled); forwards to KCharsetsPrivate::codecForName(const QString &).
QTextCodec *KCharsets::codecForName(const QString &n) const
{
    return d->codecForName(n);
}
#endif
0689
0690 QTextCodec *KCharsetsPrivate::codecForName(const QString &n)
0691 {
0692     if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) {
0693         return QTextCodec::codecForName("gb18030");
0694     }
0695     const QByteArray name(n.toLatin1());
0696     QTextCodec *codec = codecForNameOrNull(name);
0697     if (codec) {
0698         return codec;
0699     } else {
0700         return QTextCodec::codecForName("iso-8859-1");
0701     }
0702 }
0703
0704 #if KCODECS_BUILD_DEPRECATED_SINCE(5, 101)
0705 QTextCodec *KCharsets::codecForName(const QString &n, bool &ok) const
0706 {
0707     return d->codecForName(n, ok);
0708 };
0709 #endif
0710
0711 QTextCodec *KCharsetsPrivate::codecForName(const QString &n, bool &ok)
0712 {
0713     if (n == QLatin1String("gb2312") || n == QLatin1String("gbk")) {
0714         ok = true;
0715         return QTextCodec::codecForName("gb18030");
0716     }
0717     const QByteArray name(n.toLatin1());
0718     QTextCodec *codec = codecForNameOrNull(name);
0719     if (codec) {
0720         ok = true;
0721         return codec;
0722     } else {
0723         ok = false;
0724         return QTextCodec::codecForName("iso-8859-1");
0725     }
0726 }
0727
#if KCODECS_BUILD_DEPRECATED_SINCE(5, 101)
// Deprecated public entry point (only built while the 5.101 deprecation is
// enabled); forwards to KCharsetsPrivate::codecForNameOrNull().
QTextCodec *KCharsets::codecForNameOrNull(const QByteArray &n) const
{
    return d->codecForNameOrNull(n);
}
#endif
0734
0735 QTextCodec *KCharsetsPrivate::codecForNameOrNull(const QByteArray &n)
0736 {
0737     QTextCodec *codec = nullptr;
0738
0739     if (n.isEmpty()) {
0740         // TODO: Any better ideas ?
0741         // No name, assume system locale
0742         const QByteArray locale = "->locale<-";
0743         if (codecForNameDict.contains(locale)) {
0744             return codecForNameDict.value(locale);
0745         }
0746         codec = QTextCodec::codecForLocale();
0747         codecForNameDict.insert("->locale<-", codec);
0748         return codec;
0749     }
0750     // For a non-empty name, lookup the "dictionary", in a case-sensitive way.
0751     else if (codecForNameDict.contains(n)) {
0752         return codecForNameDict.value(n);
0753     }
0754
0755     // If the name is not in the hash table,
0756     // first check ourselves if our fixed variant of a US-ASCII codec should be returned:
0757     // API docs of QTextCodec do not specify the handling of custom codec instances
0758     // on look-up when there are multiple codecs supporting the same name.
0759     // The code of Qt 5.15 prepends custom instances to the internal list,
0760     // so they would be preferred initially.
0761     // But the code also has a look-up cache which does not get updated on new instances,
0762     // so if somewhere a US-ASCII codec was requested by some other code before
0763     // our KUsAsciiTextCodec instance gets created, the Qt-built-in will be always
0764     // picked instead, at least for the used name.
0765     // So we cannot rely on the internal mechanisms, but have to prefer our codec ourselves.
0766     if (isUsAsciiTextCodecRequest(n)) {
0767         codec = usAsciiTextCodec;
0768     } else {
0769         // call directly QTextCodec::codecForName.
0770         // We assume that QTextCodec is smarter and more maintained than this code.
0771         codec = QTextCodec::codecForName(n);
0772     }
0773
0774     if (codec) {
0775         codecForNameDict.insert(n, codec);
0776         return codec;
0777     }
0778
0779     // We have had no luck with QTextCodec::codecForName, so we must now process the name, so that QTextCodec::codecForName could work with it.
0780
0781     QByteArray name = n.toLower();
0782     bool changed = false;
0783     if (name.endsWith("_charset")) { // krazy:exclude=strings
0784         name.chop(8);
0785         changed = true;
0786     }
0787     if (name.startsWith("x-")) { // krazy:exclude=strings
0788         name.remove(0, 2); // remove x- at start
0789         changed = true;
0790     }
0791
0792     if (name.isEmpty()) {
0793         // We have no name anymore, therefore the name is invalid.
0794         return nullptr;
0795     }
0796
0797     // We only need to check changed names.
0798     if (changed) {
0799         codec = QTextCodec::codecForName(name);
0800         if (codec) {
0801             codecForNameDict.insert(n, codec);
0802             return codec;
0803         }
0804     }
0805
0806     // these codecs are built into Qt, but the name given for the codec is different,
0807     // so QTextCodec did not recognize it.
0808     QByteArray cname = kcharsets_array_search(builtin_string, builtin_indices, name.data());
0809
0810     if (!cname.isEmpty()) {
0811         codec = QTextCodec::codecForName(cname);
0812     }
0813
0814     if (codec) {
0815         codecForNameDict.insert(n, codec);
0816         return codec;
0817     }
0818
0819     // this also failed, the last resort is now to take some compatibility charmap
0820     // ### TODO: while emergency conversions might be useful at read, it is not sure if they should be done if the application plans to write.
0821     cname = kcharsets_array_search(conversion_hints_string, conversion_hints_indices, name.data());
0822
0823     if (!cname.isEmpty()) {
0824         codec = QTextCodec::codecForName(cname);
0825         if (codec) {
0826             codecForNameDict.insert(n, codec);
0827             return codec;
0828         }
0829     }
0830
0831     // we could not assign a codec, therefore return NULL
0832     return nullptr;
0833 }
0834
// Returns a pointer to the KCharsets instance held by the globalCharsets
// Q_GLOBAL_STATIC object.
KCharsets *KCharsets::charsets()
{
    return &globalCharsets()->instance;
}