// File indexing completed on 2024-04-28 07:41:30
0001 /* 0002 This file is part of the KDE libraries 0003 0004 SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org> 0005 SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org> 0006 SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net> 0007 0008 SPDX-License-Identifier: LGPL-2.0-or-later 0009 */ 0010 #include "kcharsets.h" 0011 #include "kcharsets_p.h" 0012 #include "kcodecs_debug.h" 0013 0014 #include <kentities.h> 0015 0016 #include <QHash> 0017 0018 #include <algorithm> 0019 #include <assert.h> 0020 0021 /* 0022 * The encoding names (like "ISO 8859-1") in this list are user-visible, 0023 * and should be mostly uppercase. 0024 * Generate with generate_string_table.pl (located in kde-dev-scripts), 0025 * input data: 0026 ISO 8859-1 0027 i18n:Western European 0028 ISO 8859-15 0029 i18n:Western European 0030 ISO 8859-14 0031 i18n:Western European 0032 cp 1252 0033 i18n:Western European 0034 IBM850 0035 i18n:Western European 0036 ISO 8859-2 0037 i18n:Central European 0038 ISO 8859-3 0039 i18n:Central European 0040 ISO 8859-4 0041 i18n:Baltic 0042 ISO 8859-13 0043 i18n:Baltic 0044 ISO 8859-16 0045 i18n:South-Eastern Europe 0046 cp 1250 0047 i18n:Central European 0048 cp 1254 0049 i18n:Turkish 0050 cp 1257 0051 i18n:Baltic 0052 KOI8-R 0053 i18n:Cyrillic 0054 ISO 8859-5 0055 i18n:Cyrillic 0056 cp 1251 0057 i18n:Cyrillic 0058 KOI8-U 0059 i18n:Cyrillic 0060 IBM866 0061 i18n:Cyrillic 0062 Big5 0063 i18n:Chinese Traditional 0064 Big5-HKSCS 0065 i18n:Chinese Traditional 0066 GB18030 0067 i18n:Chinese Simplified 0068 GBK 0069 i18n:Chinese Simplified 0070 GB2312 0071 i18n:Chinese Simplified 0072 EUC-KR 0073 i18n:Korean 0074 windows-949 0075 i18n:Korean 0076 sjis 0077 i18n:Japanese 0078 ISO-2022-JP 0079 i18n:Japanese 0080 EUC-JP 0081 i18n:Japanese 0082 ISO 8859-7 0083 i18n:Greek 0084 cp 1253 0085 i18n:Greek 0086 ISO 8859-6 0087 i18n:Arabic 0088 cp 1256 0089 i18n:Arabic 0090 ISO 8859-8 0091 i18n:Hebrew 0092 ISO 8859-8-I 0093 i18n:Hebrew 
0094 cp 1255 0095 i18n:Hebrew 0096 ISO 8859-9 0097 i18n:Turkish 0098 TIS620 0099 i18n:Thai 0100 ISO 8859-11 0101 i18n:Thai 0102 UTF-8 0103 i18n:Unicode 0104 UTF-16 0105 i18n:Unicode 0106 utf7 0107 i18n:Unicode 0108 ucs2 0109 i18n:Unicode 0110 ISO 10646-UCS-2 0111 i18n:Unicode 0112 windows-1258 0113 i18n:Other 0114 IBM874 0115 i18n:Other 0116 TSCII 0117 i18n:Other 0118 */ 0119 /* 0120 * Notes about the table: 0121 * 0122 * - The following entries were disabled and removed from the table: 0123 ibm852 0124 i18n:Central European 0125 pt 154 0126 i18n:Cyrillic // ### TODO "PT 154" seems to have been removed from Qt 0127 * 0128 * - ISO 8559-11 is the deprecated name of TIS-620 0129 * - utf7 is not in Qt 0130 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2" 0131 * - windows-1258: TODO 0132 * - IBM874: TODO 0133 * - TSCII: TODO 0134 */ 0135 0136 /* 0137 * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that 0138 * statically initialised text should be translated so that it expands to just 0139 * the string that should be translated, making it possible to use it in the 0140 * single string construct below. 0141 */ 0142 #undef QT_TRANSLATE_NOOP3 0143 #define QT_TRANSLATE_NOOP3(a, b, c) b 0144 0145 /* 0146 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND. 0147 * The script used was generate_string_table.pl which can be found in kde-dev-scripts. 0148 * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP. 
0149 */ 0150 0151 static const char language_for_encoding_string[] = 0152 "ISO 8859-1\0" 0153 QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0" 0154 "ISO 8859-15\0" 0155 "ISO 8859-14\0" 0156 "cp 1252\0" 0157 "IBM850\0" 0158 "ISO 8859-2\0" 0159 QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0" 0160 "ISO 8859-3\0" 0161 "ISO 8859-4\0" 0162 QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0" 0163 "ISO 8859-13\0" 0164 "ISO 8859-16\0" 0165 QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0" 0166 "cp 1250\0" 0167 "cp 1254\0" 0168 QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0" 0169 "cp 1257\0" 0170 "KOI8-R\0" 0171 QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0" 0172 "ISO 8859-5\0" 0173 "cp 1251\0" 0174 "KOI8-U\0" 0175 "IBM866\0" 0176 "Big5\0" 0177 QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0" 0178 "Big5-HKSCS\0" 0179 "GB18030\0" 0180 QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0" 0181 "GBK\0" 0182 "GB2312\0" 0183 "EUC-KR\0" 0184 QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0" 0185 "windows-949\0" 0186 "sjis\0" 0187 QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0" 0188 "ISO-2022-JP\0" 0189 "EUC-JP\0" 0190 "ISO 8859-7\0" 0191 QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0" 0192 "cp 1253\0" 0193 "ISO 8859-6\0" 0194 QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0" 0195 "cp 1256\0" 0196 "ISO 8859-8\0" 0197 QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0" 0198 "ISO 8859-8-I\0" 0199 "cp 1255\0" 0200 "ISO 8859-9\0" 0201 "TIS620\0" 0202 QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0" 0203 "ISO 8859-11\0" 0204 "UTF-8\0" 0205 QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item 
Text character set")"\0" 0206 "UTF-16\0" 0207 "utf7\0" 0208 "ucs2\0" 0209 "ISO 10646-UCS-2\0" 0210 "windows-1258\0" 0211 QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0" 0212 "IBM874\0" 0213 "TSCII\0" 0214 "\0"; 0215 0216 static const int language_for_encoding_indices[] = { 0217 0, 11, 28, 11, 40, 11, 52, 11, 60, 11, 67, 78, 95, 78, 106, 117, 124, 117, 136, 148, 169, 78, 177, 185, 193, 117, 201, 208, 217, 208, 228, 0218 208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419, 0219 426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1}; 0220 0221 /* 0222 * GENERATED CODE ENDS HERE 0223 */ 0224 0225 /* 0226 * defines some different names for codecs that are built into Qt. 0227 * The names in this list must be lower-case. 0228 * input data for generate_string_table.pl: 0229 iso-ir-111 0230 koi8-r 0231 koi unified 0232 koi8-r 0233 us-ascii 0234 iso 8859-1 0235 usascii 0236 iso 8859-1 0237 ascii 0238 iso 8859-1 0239 unicode-1-1-utf-7 0240 utf-7 0241 ucs2 0242 iso-10646-ucs-2 0243 iso10646-1 0244 iso-10646-ucs-2 0245 gb18030.2000-1 0246 gb18030 0247 gb18030.2000-0 0248 gb18030 0249 gbk-0 0250 gbk 0251 gb2312 0252 gbk 0253 gb2312.1980-0 0254 gbk 0255 big5-0 0256 big5 0257 euc-kr 0258 euckr 0259 cp 949 0260 windows-949 0261 euc-jp 0262 eucjp 0263 jisx0201.1976-0 0264 eucjp 0265 jisx0208.1983-0 0266 eucjp 0267 jisx0208.1990-0 0268 eucjp 0269 jisx0208.1997-0 0270 eucjp 0271 jisx0212.1990-0 0272 eucjp 0273 jisx0213.2000-1 0274 eucjp 0275 jisx0213.2000-2 0276 eucjp 0277 shift_jis 0278 sjis 0279 shift-jis 0280 sjis 0281 sjis 0282 sjis 0283 iso-2022-jp 0284 jis7 0285 windows850 0286 ibm850 0287 windows866 0288 ibm866 0289 windows-850 0290 ibm850 0291 windows-866 0292 ibm866 0293 cp-10000 0294 apple roman 0295 thai-tis620 0296 iso 8859-11 0297 windows-874 0298 
ibm874 0299 windows874 0300 ibm874 0301 cp-874 0302 ibm874 0303 ksc5601.1987-0 0304 euckr 0305 ks_c_5601-1987 0306 euckr 0307 mac-roman 0308 apple roman 0309 macintosh 0310 apple roman 0311 mac 0312 apple roman 0313 csiso2022jp 0314 iso-2022-jp 0315 */ 0316 /* 0317 * Notes about the table: 0318 * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set) 0319 * - utf7 is not in Qt 0320 * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2" 0321 * - sjis: appears on the table for x-sjis 0322 * - jis7: ISO-2022-JP is now the default name in Qt4 0323 * - cp-874: is it really needed? 0324 * - mac-roman: appears on the table for x-mac-roman 0325 * - csiso2022jp: See bug #77243 0326 */ 0327 0328 /* 0329 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND. 0330 * The script used was generate_string_table.pl which can be found in kde-dev-scripts. 0331 */ 0332 0333 static const char builtin_string[] = 0334 "iso-ir-111\0" 0335 "koi8-r\0" 0336 "koi unified\0" 0337 "us-ascii\0" 0338 "iso 8859-1\0" 0339 "usascii\0" 0340 "ascii\0" 0341 "unicode-1-1-utf-7\0" 0342 "utf-7\0" 0343 "ucs2\0" 0344 "iso-10646-ucs-2\0" 0345 "iso10646-1\0" 0346 "gb18030.2000-1\0" 0347 "gb18030\0" 0348 "gb18030.2000-0\0" 0349 "gbk-0\0" 0350 "gbk\0" 0351 "gb2312\0" 0352 "gb2312.1980-0\0" 0353 "big5-0\0" 0354 "big5\0" 0355 "euc-kr\0" 0356 "euckr\0" 0357 "cp 949\0" 0358 "windows-949\0" 0359 "euc-jp\0" 0360 "eucjp\0" 0361 "jisx0201.1976-0\0" 0362 "jisx0208.1983-0\0" 0363 "jisx0208.1990-0\0" 0364 "jisx0208.1997-0\0" 0365 "jisx0212.1990-0\0" 0366 "jisx0213.2000-1\0" 0367 "jisx0213.2000-2\0" 0368 "shift_jis\0" 0369 "sjis\0" 0370 "shift-jis\0" 0371 "iso-2022-jp\0" 0372 "jis7\0" 0373 "windows850\0" 0374 "ibm850\0" 0375 "windows866\0" 0376 "ibm866\0" 0377 "windows-850\0" 0378 "windows-866\0" 0379 "cp-10000\0" 0380 "apple roman\0" 0381 "thai-tis620\0" 0382 "iso 8859-11\0" 0383 "windows-874\0" 0384 "ibm874\0" 0385 "windows874\0" 0386 "cp-874\0" 0387 
"ksc5601.1987-0\0" 0388 "ks_c_5601-1987\0" 0389 "mac-roman\0" 0390 "macintosh\0" 0391 "mac\0" 0392 "csiso2022jp\0" 0393 "\0"; 0394 0395 static const int builtin_indices[] = {0, 11, 18, 11, 30, 39, 50, 39, 58, 39, 64, 82, 88, 93, 109, 93, 120, 135, 143, 135, 158, 164, 0396 168, 164, 175, 164, 189, 196, 201, 208, 214, 221, 233, 240, 246, 240, 262, 240, 278, 240, 294, 240, 310, 240, 0397 326, 240, 342, 240, 358, 368, 373, 368, 368, 368, 383, 395, 400, 411, 418, 429, 436, 411, 448, 429, 460, 469, 0398 481, 493, 505, 517, 524, 517, 535, 517, 542, 208, 557, 208, 572, 469, 582, 469, 592, 469, 596, 383, -1}; 0399 0400 /* 0401 * GENERATED CODE ENDS HERE 0402 */ 0403 0404 /* 0405 * some last resort hints in case the charmap file couldn't be found. 0406 * This gives at least a partial conversion and helps make things readable. 0407 * 0408 * the name used as input here is already converted to the more canonical 0409 * name as defined in the aliases array. 0410 * 0411 * Input data: 0412 cp1250 0413 iso-8859-2 0414 koi8-r 0415 iso-8859-5 0416 koi8-u 0417 koi8-r 0418 pt 154 0419 windows-1251 0420 paratype-154 0421 windows-1251 0422 pt-154 0423 windows-1251 0424 */ 0425 /* Notes: 0426 * - KDE had always "CP 1251" as best fallback to PT 154. As Qt does not offer this encoding anymore, the codepage 1251 is used as fallback. 0427 */ 0428 0429 /* 0430 * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND. 0431 * The script used was generate_string_table.pl which can be found in kde-dev-scripts. 
0432 */ 0433 0434 static const char conversion_hints_string[] = 0435 "cp1250\0" 0436 "iso-8859-2\0" 0437 "koi8-r\0" 0438 "iso-8859-5\0" 0439 "koi8-u\0" 0440 "pt 154\0" 0441 "windows-1251\0" 0442 "paratype-154\0" 0443 "pt-154\0" 0444 "\0"; 0445 0446 static const int conversion_hints_indices[] = {0, 7, 18, 25, 36, 18, 43, 50, 63, 50, 76, 50, -1}; 0447 0448 /* 0449 * GENERATED CODE ENDS HERE 0450 */ 0451 0452 struct KCharsetsSingletonPrivate { 0453 KCharsets instance; 0454 }; 0455 0456 Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets) 0457 0458 // search an array of items index/data, find first matching index 0459 // and return data, or return 0 0460 static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry) 0461 { 0462 for (int i = 0; indices[i] != -1; i += 2) { 0463 if (qstrcmp(start + indices[i], entry) == 0) { 0464 return start + indices[i + 1]; 0465 } 0466 } 0467 return nullptr; 0468 } 0469 0470 // -------------------------------------------------------------------------- 0471 0472 KCharsets::KCharsets() 0473 : d(new KCharsetsPrivate) 0474 { 0475 } 0476 0477 KCharsets::~KCharsets() = default; 0478 0479 QChar KCharsets::fromEntity(QStringView str) 0480 { 0481 QChar res = QChar::Null; 0482 0483 if (str.isEmpty()) { 0484 return QChar::Null; 0485 } 0486 0487 int pos = 0; 0488 if (str[pos] == QLatin1Char('&')) { 0489 pos++; 0490 } 0491 0492 // Check for '�' or '�' sequence 0493 if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) { 0494 bool ok; 0495 pos++; 0496 if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) { 0497 pos++; 0498 // '�', hexadecimal character reference 0499 const auto tmp = str.mid(pos); 0500 res = QChar(tmp.toInt(&ok, 16)); 0501 } else { 0502 // '�', decimal character reference 0503 const auto tmp = str.mid(pos); 0504 res = QChar(tmp.toInt(&ok, 10)); 0505 } 0506 if (ok) { 0507 return res; 0508 } else { 0509 return QChar::Null; 0510 } 0511 } 0512 0513 const QByteArray 
raw(str.toLatin1()); 0514 const entity *e = KCodecsEntities::kde_findEntity(raw.data(), raw.length()); 0515 0516 if (!e) { 0517 // qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length(); 0518 return QChar::Null; 0519 } 0520 // qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code; 0521 0522 return QChar(e->code); 0523 } 0524 0525 QChar KCharsets::fromEntity(QStringView str, int &len) 0526 { 0527 // entities are never longer than 8 chars... we start from 0528 // that length and work backwards... 0529 len = 8; 0530 while (len > 0) { 0531 const auto tmp = str.left(len); 0532 QChar res = fromEntity(tmp); 0533 if (res != QChar::Null) { 0534 return res; 0535 } 0536 len--; 0537 } 0538 return QChar::Null; 0539 } 0540 0541 QString KCharsets::toEntity(const QChar &ch) 0542 { 0543 return QString::asprintf("�x%x;", ch.unicode()); 0544 } 0545 0546 QString KCharsets::resolveEntities(const QString &input) 0547 { 0548 QString text = input; 0549 const QChar *p = text.unicode(); 0550 const QChar *end = p + text.length(); 0551 const QChar *ampersand = nullptr; 0552 bool scanForSemicolon = false; 0553 0554 for (; p < end; ++p) { 0555 const QChar ch = *p; 0556 0557 if (ch == QLatin1Char('&')) { 0558 ampersand = p; 0559 scanForSemicolon = true; 0560 continue; 0561 } 0562 0563 if (ch != QLatin1Char(';') || scanForSemicolon == false) { 0564 continue; 0565 } 0566 0567 assert(ampersand); 0568 0569 scanForSemicolon = false; 0570 0571 const QChar *entityBegin = ampersand + 1; 0572 0573 const uint entityLength = p - entityBegin; 0574 if (entityLength == 0) { 0575 continue; 0576 } 0577 0578 const QChar entityValue = KCharsets::fromEntity(QStringView(entityBegin, entityLength)); 0579 if (entityValue.isNull()) { 0580 continue; 0581 } 0582 0583 const uint ampersandPos = ampersand - text.unicode(); 0584 0585 text[(int)ampersandPos] = entityValue; 0586 text.remove(ampersandPos + 1, entityLength + 1); 0587 p = text.unicode() + ampersandPos; 0588 end = text.unicode() 
+ text.length(); 0589 ampersand = nullptr; 0590 } 0591 0592 return text; 0593 } 0594 0595 QStringList KCharsets::availableEncodingNames() const 0596 { 0597 QStringList available; 0598 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { 0599 available.append(QString::fromUtf8(language_for_encoding_string + *p)); 0600 } 0601 available.sort(); 0602 return available; 0603 } 0604 0605 QString KCharsets::descriptionForEncoding(QStringView encoding) const 0606 { 0607 const char *lang = kcharsets_array_search(language_for_encoding_string, language_for_encoding_indices, encoding.toUtf8().data()); 0608 if (lang) { 0609 return tr("%1 ( %2 )", "@item %1 character set, %2 encoding").arg(tr(lang, "@item Text character set"), encoding); 0610 } else { 0611 return tr("Other encoding (%1)", "@item").arg(encoding); 0612 } 0613 } 0614 0615 QString KCharsets::encodingForName(const QString &descriptiveName) const 0616 { 0617 const int left = descriptiveName.lastIndexOf(QLatin1Char('(')); 0618 0619 if (left < 0) { // No parenthesis, so assume it is a normal encoding name 0620 return descriptiveName.trimmed(); 0621 } 0622 0623 QString name(descriptiveName.mid(left + 1)); 0624 0625 const int right = name.lastIndexOf(QLatin1Char(')')); 0626 0627 if (right < 0) { 0628 return name; 0629 } 0630 0631 return name.left(right).trimmed(); 0632 } 0633 0634 QStringList KCharsets::descriptiveEncodingNames() const 0635 { 0636 QStringList encodings; 0637 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { 0638 const QString name = QString::fromUtf8(language_for_encoding_string + p[0]); 0639 const QString description = tr(language_for_encoding_string + p[1], "@item Text character set"); 0640 encodings.append(tr("%1 ( %2 )", "@item Text encoding: %1 character set, %2 encoding").arg(description, name)); 0641 } 0642 encodings.sort(); 0643 return encodings; 0644 } 0645 0646 QList<QStringList> KCharsets::encodingsByScript() const 0647 { 0648 if 
(!d->encodingsByScript.isEmpty()) { 0649 return d->encodingsByScript; 0650 } 0651 int i; 0652 for (const int *p = language_for_encoding_indices; *p != -1; p += 2) { 0653 const QString name = QString::fromUtf8(language_for_encoding_string + p[0]); 0654 const QString description = tr(language_for_encoding_string + p[1], "@item Text character set"); 0655 0656 for (i = 0; i < d->encodingsByScript.size(); ++i) { 0657 if (d->encodingsByScript.at(i).at(0) == description) { 0658 d->encodingsByScript[i].append(name); 0659 break; 0660 } 0661 } 0662 0663 if (i == d->encodingsByScript.size()) { 0664 d->encodingsByScript.append(QStringList() << description << name); 0665 } 0666 } 0667 return d->encodingsByScript; 0668 } 0669 0670 KCharsets *KCharsets::charsets() 0671 { 0672 return &globalCharsets()->instance; 0673 }