kcodecs/src/kcharsets.cpp

0001 /*
0002     This file is part of the KDE libraries
0003
0004     SPDX-FileCopyrightText: 1999 Lars Knoll <knoll@kde.org>
0005     SPDX-FileCopyrightText: 2001, 2003, 2004, 2005, 2006 Nicolas GOUTTE <goutte@kde.org>
0006     SPDX-FileCopyrightText: 2007 Nick Shaforostoff <shafff@ukr.net>
0007
0008     SPDX-License-Identifier: LGPL-2.0-or-later
0009 */
0010 #include "kcharsets.h"
0011 #include "kcharsets_p.h"
0012 #include "kcodecs_debug.h"
0013
0014 #include <kentities.h>
0015
0016 #include <QHash>
0017
0018 #include <algorithm>
0019 #include <assert.h>
0020
0021 /*
0022  * The encoding names (like "ISO 8859-1") in this list are user-visible,
0023  * and should be mostly uppercase.
0024  * Generate with generate_string_table.pl (located in kde-dev-scripts),
0025  * input data:
0026 ISO 8859-1
0027 i18n:Western European
0028 ISO 8859-15
0029 i18n:Western European
0030 ISO 8859-14
0031 i18n:Western European
0032 cp 1252
0033 i18n:Western European
0034 IBM850
0035 i18n:Western European
0036 ISO 8859-2
0037 i18n:Central European
0038 ISO 8859-3
0039 i18n:Central European
0040 ISO 8859-4
0041 i18n:Baltic
0042 ISO 8859-13
0043 i18n:Baltic
0044 ISO 8859-16
0045 i18n:South-Eastern Europe
0046 cp 1250
0047 i18n:Central European
0048 cp 1254
0049 i18n:Turkish
0050 cp 1257
0051 i18n:Baltic
0052 KOI8-R
0053 i18n:Cyrillic
0054 ISO 8859-5
0055 i18n:Cyrillic
0056 cp 1251
0057 i18n:Cyrillic
0058 KOI8-U
0059 i18n:Cyrillic
0060 IBM866
0061 i18n:Cyrillic
0062 Big5
0063 i18n:Chinese Traditional
0064 Big5-HKSCS
0065 i18n:Chinese Traditional
0066 GB18030
0067 i18n:Chinese Simplified
0068 GBK
0069 i18n:Chinese Simplified
0070 GB2312
0071 i18n:Chinese Simplified
0072 EUC-KR
0073 i18n:Korean
0074 windows-949
0075 i18n:Korean
0076 sjis
0077 i18n:Japanese
0078 ISO-2022-JP
0079 i18n:Japanese
0080 EUC-JP
0081 i18n:Japanese
0082 ISO 8859-7
0083 i18n:Greek
0084 cp 1253
0085 i18n:Greek
0086 ISO 8859-6
0087 i18n:Arabic
0088 cp 1256
0089 i18n:Arabic
0090 ISO 8859-8
0091 i18n:Hebrew
0092 ISO 8859-8-I
0093 i18n:Hebrew
0094 cp 1255
0095 i18n:Hebrew
0096 ISO 8859-9
0097 i18n:Turkish
0098 TIS620
0099 i18n:Thai
0100 ISO 8859-11
0101 i18n:Thai
0102 UTF-8
0103 i18n:Unicode
0104 UTF-16
0105 i18n:Unicode
0106 utf7
0107 i18n:Unicode
0108 ucs2
0109 i18n:Unicode
0110 ISO 10646-UCS-2
0111 i18n:Unicode
0112 windows-1258
0113 i18n:Other
0114 IBM874
0115 i18n:Other
0116 TSCII
0117 i18n:Other
0118  */
0119 /*
0120  * Notes about the table:
0121  *
0122  * - The following entries were disabled and removed from the table:
0123 ibm852
0124 i18n:Central European
0125 pt 154
0126 i18n:Cyrillic              // ### TODO "PT 154" seems to have been removed from Qt
0127  *
0128  * - ISO 8859-11 is the deprecated name of TIS-620
0129  * - utf7 is not in Qt
0130  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
0131  * - windows-1258: TODO
0132  * - IBM874: TODO
0133  * - TSCII: TODO
0134  */
0135
0136 /*
0137  * This redefines the QT_TRANSLATE_NOOP3 macro provided by Qt to indicate that
0138  * statically initialised text should be translated so that it expands to just
0139  * the string that should be translated, making it possible to use it in the
0140  * single string construct below.
0141  */
0142 #undef QT_TRANSLATE_NOOP3
0143 #define QT_TRANSLATE_NOOP3(a, b, c) b
0144
0145 /*
0146  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
0147  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
0148  * It was then edited to use QT_TRANSLATE_NOOP3 instead of I18N_NOOP.
0149  */
0150
/*
 * Packed table of NUL-terminated strings: the user-visible encoding names
 * interleaved with the script/language group each one belongs to. Every group
 * name is stored once, at its first use, and later encodings refer back to it
 * through language_for_encoding_indices. Entries are addressed by byte offset,
 * so the exact bytes of this literal and the offsets in the index array must
 * stay in sync (hence "generated, do not edit by hand").
 * QT_TRANSLATE_NOOP3 is redefined earlier in this file to expand to its second
 * argument, so each group name ends up as a plain string piece here.
 */
static const char language_for_encoding_string[] =
    "ISO 8859-1\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Western European", "@item Text character set")"\0"
    "ISO 8859-15\0"
    "ISO 8859-14\0"
    "cp 1252\0"
    "IBM850\0"
    "ISO 8859-2\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Central European", "@item Text character set")"\0"
    "ISO 8859-3\0"
    "ISO 8859-4\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Baltic", "@item Text character set")"\0"
    "ISO 8859-13\0"
    "ISO 8859-16\0"
    QT_TRANSLATE_NOOP3("KCharsets", "South-Eastern Europe", "@item Text character set")"\0"
    "cp 1250\0"
    "cp 1254\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Turkish", "@item Text character set")"\0"
    "cp 1257\0"
    "KOI8-R\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Cyrillic", "@item Text character set")"\0"
    "ISO 8859-5\0"
    "cp 1251\0"
    "KOI8-U\0"
    "IBM866\0"
    "Big5\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Chinese Traditional", "@item Text character set")"\0"
    "Big5-HKSCS\0"
    "GB18030\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Chinese Simplified", "@item Text character set")"\0"
    "GBK\0"
    "GB2312\0"
    "EUC-KR\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Korean", "@item Text character set")"\0"
    "windows-949\0"
    "sjis\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Japanese", "@item Text character set")"\0"
    "ISO-2022-JP\0"
    "EUC-JP\0"
    "ISO 8859-7\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Greek", "@item Text character set")"\0"
    "cp 1253\0"
    "ISO 8859-6\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Arabic", "@item Text character set")"\0"
    "cp 1256\0"
    "ISO 8859-8\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Hebrew", "@item Text character set")"\0"
    "ISO 8859-8-I\0"
    "cp 1255\0"
    "ISO 8859-9\0"
    "TIS620\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Thai", "@item Text character set")"\0"
    "ISO 8859-11\0"
    "UTF-8\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Unicode", "@item Text character set")"\0"
    "UTF-16\0"
    "utf7\0"
    "ucs2\0"
    "ISO 10646-UCS-2\0"
    "windows-1258\0"
    QT_TRANSLATE_NOOP3("KCharsets", "Other", "@item Text character set")"\0"
    "IBM874\0"
    "TSCII\0"
    "\0";
0215
/*
 * Byte offsets into language_for_encoding_string, stored as consecutive pairs:
 * first the offset of an encoding name, then the offset of the group name it
 * belongs to. A single -1 in the "encoding name" slot terminates the list.
 */
static const int language_for_encoding_indices[] = {
    0,   11,  28,  11,  40,  11,  52,  11,  60,  11,  67,  78,  95,  78,  106, 117, 124, 117, 136, 148, 169, 78,  177, 185, 193, 117, 201, 208, 217, 208, 228,
    208, 236, 208, 243, 208, 250, 255, 275, 255, 286, 294, 313, 294, 317, 294, 324, 331, 338, 331, 350, 355, 364, 355, 376, 355, 383, 394, 400, 394, 408, 419,
    426, 419, 434, 445, 452, 445, 465, 445, 473, 185, 484, 491, 496, 491, 508, 514, 522, 514, 529, 514, 534, 514, 539, 514, 555, 568, 574, 568, 581, 568, -1};
0220
0221 /*
0222  * GENERATED CODE ENDS HERE
0223  */
0224
0225 /*
0226  * defines some different names for codecs that are built into Qt.
0227  * The names in this list must be lower-case.
0228  * input data for generate_string_table.pl:
0229 iso-ir-111
0230 koi8-r
0231 koi unified
0232 koi8-r
0233 us-ascii
0234 iso 8859-1
0235 usascii
0236 iso 8859-1
0237 ascii
0238 iso 8859-1
0239 unicode-1-1-utf-7
0240 utf-7
0241 ucs2
0242 iso-10646-ucs-2
0243 iso10646-1
0244 iso-10646-ucs-2
0245 gb18030.2000-1
0246 gb18030
0247 gb18030.2000-0
0248 gb18030
0249 gbk-0
0250 gbk
0251 gb2312
0252 gbk
0253 gb2312.1980-0
0254 gbk
0255 big5-0
0256 big5
0257 euc-kr
0258 euckr
0259 cp 949
0260 windows-949
0261 euc-jp
0262 eucjp
0263 jisx0201.1976-0
0264 eucjp
0265 jisx0208.1983-0
0266 eucjp
0267 jisx0208.1990-0
0268 eucjp
0269 jisx0208.1997-0
0270 eucjp
0271 jisx0212.1990-0
0272 eucjp
0273 jisx0213.2000-1
0274 eucjp
0275 jisx0213.2000-2
0276 eucjp
0277 shift_jis
0278 sjis
0279 shift-jis
0280 sjis
0281 sjis
0282 sjis
0283 iso-2022-jp
0284 jis7
0285 windows850
0286 ibm850
0287 windows866
0288 ibm866
0289 windows-850
0290 ibm850
0291 windows-866
0292 ibm866
0293 cp-10000
0294 apple roman
0295 thai-tis620
0296 iso 8859-11
0297 windows-874
0298 ibm874
0299 windows874
0300 ibm874
0301 cp-874
0302 ibm874
0303 ksc5601.1987-0
0304 euckr
0305 ks_c_5601-1987
0306 euckr
0307 mac-roman
0308 apple roman
0309 macintosh
0310 apple roman
0311 mac
0312 apple roman
0313 csiso2022jp
0314 iso-2022-jp
0315 */
0316 /*
0317  * Notes about the table:
0318  * - using ISO-8859-1 for ASCII is only an approximation (as you cannot test if a character is part of the set)
0319  * - utf7 is not in Qt
0320  * - UTF-16 is duplicated as "ucs2" and "ISO 10646-UCS-2"
0321  * - sjis: appears on the table for x-sjis
0322  * - jis7: ISO-2022-JP is now the default name in Qt4
0323  * - cp-874: is it really needed?
0324  * - mac-roman: appears on the table for x-mac-roman
0325  * - csiso2022jp: See bug #77243
0326  */
0327
0328 /*
0329  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
0330  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
0331  */
0332
/*
 * Packed table of NUL-terminated, lower-case codec names, addressed by byte
 * offset through builtin_indices. Each distinct name is stored once; the exact
 * bytes of this literal and the offsets in the index array must stay in sync.
 * NOTE(review): nothing in the visible part of this file reads this table --
 * confirm whether it is still needed.
 */
static const char builtin_string[] =
    "iso-ir-111\0"
    "koi8-r\0"
    "koi unified\0"
    "us-ascii\0"
    "iso 8859-1\0"
    "usascii\0"
    "ascii\0"
    "unicode-1-1-utf-7\0"
    "utf-7\0"
    "ucs2\0"
    "iso-10646-ucs-2\0"
    "iso10646-1\0"
    "gb18030.2000-1\0"
    "gb18030\0"
    "gb18030.2000-0\0"
    "gbk-0\0"
    "gbk\0"
    "gb2312\0"
    "gb2312.1980-0\0"
    "big5-0\0"
    "big5\0"
    "euc-kr\0"
    "euckr\0"
    "cp 949\0"
    "windows-949\0"
    "euc-jp\0"
    "eucjp\0"
    "jisx0201.1976-0\0"
    "jisx0208.1983-0\0"
    "jisx0208.1990-0\0"
    "jisx0208.1997-0\0"
    "jisx0212.1990-0\0"
    "jisx0213.2000-1\0"
    "jisx0213.2000-2\0"
    "shift_jis\0"
    "sjis\0"
    "shift-jis\0"
    "iso-2022-jp\0"
    "jis7\0"
    "windows850\0"
    "ibm850\0"
    "windows866\0"
    "ibm866\0"
    "windows-850\0"
    "windows-866\0"
    "cp-10000\0"
    "apple roman\0"
    "thai-tis620\0"
    "iso 8859-11\0"
    "windows-874\0"
    "ibm874\0"
    "windows874\0"
    "cp-874\0"
    "ksc5601.1987-0\0"
    "ks_c_5601-1987\0"
    "mac-roman\0"
    "macintosh\0"
    "mac\0"
    "csiso2022jp\0"
    "\0";
0394
/*
 * Byte offsets into builtin_string, stored as consecutive pairs and terminated
 * by -1. Following the input listing this table was generated from, the first
 * offset of a pair is an alternative codec name and the second is the name it
 * maps to.
 */
static const int builtin_indices[] = {0,   11,  18,  11,  30,  39,  50,  39,  58,  39,  64,  82,  88,  93,  109, 93,  120, 135, 143, 135, 158, 164,
                                      168, 164, 175, 164, 189, 196, 201, 208, 214, 221, 233, 240, 246, 240, 262, 240, 278, 240, 294, 240, 310, 240,
                                      326, 240, 342, 240, 358, 368, 373, 368, 368, 368, 383, 395, 400, 411, 418, 429, 436, 411, 448, 429, 460, 469,
                                      481, 493, 505, 517, 524, 517, 535, 517, 542, 208, 557, 208, 572, 469, 582, 469, 592, 469, 596, 383, -1};
0399
0400 /*
0401  * GENERATED CODE ENDS HERE
0402  */
0403
0404 /*
0405  * some last resort hints in case the charmap file couldn't be found.
0406  * This gives at least a partial conversion and helps make things readable.
0407  *
0408  * the name used as input here is already converted to the more canonical
0409  * name as defined in the aliases array.
0410  *
0411  * Input data:
0412 cp1250
0413 iso-8859-2
0414 koi8-r
0415 iso-8859-5
0416 koi8-u
0417 koi8-r
0418 pt 154
0419 windows-1251
0420 paratype-154
0421 windows-1251
0422 pt-154
0423 windows-1251
0424  */
0425 /* Notes:
0426  * - KDE has always used "CP 1251" as the best fallback for PT 154. As Qt no longer offers this encoding, code page 1251 is used as the fallback.
0427  */
0428
0429 /*
0430  * THE FOLLOWING CODE IS GENERATED. PLEASE DO NOT EDIT BY HAND.
0431  * The script used was generate_string_table.pl which can be found in kde-dev-scripts.
0432  */
0433
/*
 * Packed table of NUL-terminated encoding names used for the last-resort
 * conversion hints, addressed by byte offset through conversion_hints_indices.
 * The exact bytes of this literal and those offsets must stay in sync.
 * NOTE(review): nothing in the visible part of this file reads this table --
 * confirm whether it is still needed.
 */
static const char conversion_hints_string[] =
    "cp1250\0"
    "iso-8859-2\0"
    "koi8-r\0"
    "iso-8859-5\0"
    "koi8-u\0"
    "pt 154\0"
    "windows-1251\0"
    "paratype-154\0"
    "pt-154\0"
    "\0";
0445
/*
 * Byte offsets into conversion_hints_string, stored as consecutive pairs and
 * terminated by -1: first the offset of an encoding name, then the offset of
 * the encoding listed as its fallback in the input data.
 */
static const int conversion_hints_indices[] = {0, 7, 18, 25, 36, 18, 43, 50, 63, 50, 76, 50, -1};
0447
0448 /*
0449  * GENERATED CODE ENDS HERE
0450  */
0451
0452 struct KCharsetsSingletonPrivate {
0453     KCharsets instance;
0454 };
0455
0456 Q_GLOBAL_STATIC(KCharsetsSingletonPrivate, globalCharsets)
0457
0458 // search an array of items index/data, find first matching index
0459 // and return data, or return 0
0460 static inline const char *kcharsets_array_search(const char *start, const int *indices, const char *entry)
0461 {
0462     for (int i = 0; indices[i] != -1; i += 2) {
0463         if (qstrcmp(start + indices[i], entry) == 0) {
0464             return start + indices[i + 1];
0465         }
0466     }
0467     return nullptr;
0468 }
0469
0470 // --------------------------------------------------------------------------
0471
0472 KCharsets::KCharsets()
0473     : d(new KCharsetsPrivate)
0474 {
0475 }
0476
0477 KCharsets::~KCharsets() = default;
0478
0479 QChar KCharsets::fromEntity(QStringView str)
0480 {
0481     QChar res = QChar::Null;
0482
0483     if (str.isEmpty()) {
0484         return QChar::Null;
0485     }
0486
0487     int pos = 0;
0488     if (str[pos] == QLatin1Char('&')) {
0489         pos++;
0490     }
0491
0492     // Check for '&#000' or '&#x0000' sequence
0493     if (str[pos] == QLatin1Char('#') && str.length() - pos > 1) {
0494         bool ok;
0495         pos++;
0496         if (str[pos] == QLatin1Char('x') || str[pos] == QLatin1Char('X')) {
0497             pos++;
0498             // '&#x0000', hexadecimal character reference
0499             const auto tmp = str.mid(pos);
0500             res = QChar(tmp.toInt(&ok, 16));
0501         } else {
0502             //  '&#0000', decimal character reference
0503             const auto tmp = str.mid(pos);
0504             res = QChar(tmp.toInt(&ok, 10));
0505         }
0506         if (ok) {
0507             return res;
0508         } else {
0509             return QChar::Null;
0510         }
0511     }
0512
0513     const QByteArray raw(str.toLatin1());
0514     const entity *e = KCodecsEntities::kde_findEntity(raw.data(), raw.length());
0515
0516     if (!e) {
0517         // qCDebug(KCODECS_LOG) << "unknown entity " << str <<", len = " << str.length();
0518         return QChar::Null;
0519     }
0520     // qCDebug(KCODECS_LOG) << "got entity " << str << " = " << e->code;
0521
0522     return QChar(e->code);
0523 }
0524
0525 QChar KCharsets::fromEntity(QStringView str, int &len)
0526 {
0527     // entities are never longer than 8 chars... we start from
0528     // that length and work backwards...
0529     len = 8;
0530     while (len > 0) {
0531         const auto tmp = str.left(len);
0532         QChar res = fromEntity(tmp);
0533         if (res != QChar::Null) {
0534             return res;
0535         }
0536         len--;
0537     }
0538     return QChar::Null;
0539 }
0540
0541 QString KCharsets::toEntity(const QChar &ch)
0542 {
0543     return QString::asprintf("&#0x%x;", ch.unicode());
0544 }
0545
0546 QString KCharsets::resolveEntities(const QString &input)
0547 {
0548     QString text = input;
0549     const QChar *p = text.unicode();
0550     const QChar *end = p + text.length();
0551     const QChar *ampersand = nullptr;
0552     bool scanForSemicolon = false;
0553
0554     for (; p < end; ++p) {
0555         const QChar ch = *p;
0556
0557         if (ch == QLatin1Char('&')) {
0558             ampersand = p;
0559             scanForSemicolon = true;
0560             continue;
0561         }
0562
0563         if (ch != QLatin1Char(';') || scanForSemicolon == false) {
0564             continue;
0565         }
0566
0567         assert(ampersand);
0568
0569         scanForSemicolon = false;
0570
0571         const QChar *entityBegin = ampersand + 1;
0572
0573         const uint entityLength = p - entityBegin;
0574         if (entityLength == 0) {
0575             continue;
0576         }
0577
0578         const QChar entityValue = KCharsets::fromEntity(QStringView(entityBegin, entityLength));
0579         if (entityValue.isNull()) {
0580             continue;
0581         }
0582
0583         const uint ampersandPos = ampersand - text.unicode();
0584
0585         text[(int)ampersandPos] = entityValue;
0586         text.remove(ampersandPos + 1, entityLength + 1);
0587         p = text.unicode() + ampersandPos;
0588         end = text.unicode() + text.length();
0589         ampersand = nullptr;
0590     }
0591
0592     return text;
0593 }
0594
0595 QStringList KCharsets::availableEncodingNames() const
0596 {
0597     QStringList available;
0598     for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
0599         available.append(QString::fromUtf8(language_for_encoding_string + *p));
0600     }
0601     available.sort();
0602     return available;
0603 }
0604
0605 QString KCharsets::descriptionForEncoding(QStringView encoding) const
0606 {
0607     const char *lang = kcharsets_array_search(language_for_encoding_string, language_for_encoding_indices, encoding.toUtf8().data());
0608     if (lang) {
0609         return tr("%1 ( %2 )", "@item %1 character set, %2 encoding").arg(tr(lang, "@item Text character set"), encoding);
0610     } else {
0611         return tr("Other encoding (%1)", "@item").arg(encoding);
0612     }
0613 }
0614
0615 QString KCharsets::encodingForName(const QString &descriptiveName) const
0616 {
0617     const int left = descriptiveName.lastIndexOf(QLatin1Char('('));
0618
0619     if (left < 0) { // No parenthesis, so assume it is a normal encoding name
0620         return descriptiveName.trimmed();
0621     }
0622
0623     QString name(descriptiveName.mid(left + 1));
0624
0625     const int right = name.lastIndexOf(QLatin1Char(')'));
0626
0627     if (right < 0) {
0628         return name;
0629     }
0630
0631     return name.left(right).trimmed();
0632 }
0633
0634 QStringList KCharsets::descriptiveEncodingNames() const
0635 {
0636     QStringList encodings;
0637     for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
0638         const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
0639         const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
0640         encodings.append(tr("%1 ( %2 )", "@item Text encoding: %1 character set, %2 encoding").arg(description, name));
0641     }
0642     encodings.sort();
0643     return encodings;
0644 }
0645
0646 QList<QStringList> KCharsets::encodingsByScript() const
0647 {
0648     if (!d->encodingsByScript.isEmpty()) {
0649         return d->encodingsByScript;
0650     }
0651     int i;
0652     for (const int *p = language_for_encoding_indices; *p != -1; p += 2) {
0653         const QString name = QString::fromUtf8(language_for_encoding_string + p[0]);
0654         const QString description = tr(language_for_encoding_string + p[1], "@item Text character set");
0655
0656         for (i = 0; i < d->encodingsByScript.size(); ++i) {
0657             if (d->encodingsByScript.at(i).at(0) == description) {
0658                 d->encodingsByScript[i].append(name);
0659                 break;
0660             }
0661         }
0662
0663         if (i == d->encodingsByScript.size()) {
0664             d->encodingsByScript.append(QStringList() << description << name);
0665         }
0666     }
0667     return d->encodingsByScript;
0668 }
0669
0670 KCharsets *KCharsets::charsets()
0671 {
0672     return &globalCharsets()->instance;
0673 }