File indexing completed on 2024-05-12 07:54:47

0001 /*
0002     This file is part of the KDE libraries
0003     SPDX-FileCopyrightText: 2007 Daniel Laidig <>
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0008 #include "kcharselectdata_p.h"
0010 #include <QCoreApplication>
0011 #include <QFile>
0012 #include <QFutureInterface>
0013 #include <QRegularExpression>
0014 #include <QRunnable>
0015 #include <QStringList>
0016 #include <QThreadPool>
0017 #include <qendian.h>
0019 #include <../test-config.h>
0020 #include <qstandardpaths.h>
0021 #include <string.h>
/*
 * Constants for Hangul (de)composition, see UAX #15.
 *
 * These are typed constants rather than object-like macros so that they obey
 * normal scoping rules and cannot silently rewrite unrelated identifiers.
 * Names and values are unchanged.
 */
static constexpr int SBase = 0xAC00; // first precomposed Hangul syllable
static constexpr int LBase = 0x1100; // base code point for the leading jamo
static constexpr int VBase = 0x1161; // base code point for the vowel jamo
static constexpr int TBase = 0x11A7; // base for trailing jamo; index 0 means "no trailing jamo"
static constexpr int LCount = 19; // number of leading jamo
static constexpr int VCount = 21; // number of vowel jamo
static constexpr int TCount = 28; // number of trailing slots (including "none")
static constexpr int NCount = VCount * TCount; // syllables per leading jamo
static constexpr int SCount = LCount * NCount; // total number of precomposed syllables
0034 class RunIndexCreation : public QFutureInterface<Index>, public QRunnable
0035 {
0036 public:
0037     RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile)
0038         : m_data(data)
0039         , m_dataFile(dataFile)
0040     {
0041     }
0043     QFuture<Index> start()
0044     {
0045         setRunnable(this);
0046         reportStarted();
0047         QFuture<Index> f = this->future();
0048         QThreadPool::globalInstance()->start(this);
0049         return f;
0050     }
0052     void run() override
0053     {
0054         Index index = m_data->createIndex(m_dataFile);
0055         reportResult(index);
0056         reportFinished(nullptr);
0057     }
0059 private:
0060     KCharSelectData *const m_data;
0061     const QByteArray m_dataFile;
0062 };
// Short romanised jamo names used to assemble Hangul syllable names.
// Every entry is at most three letters plus the terminating NUL, hence [4].
// clang-format off

// Leading consonants, 19 entries (index 11 is intentionally empty).
static const char JAMO_L_TABLE[][4] = {
    "G",  "GG", "N",  "D",  "DD",
    "R",  "M",  "B",  "BB", "S",
    "SS", "",   "J",  "JJ", "C",
    "K",  "T",  "P",  "H"
};

// Vowels, 21 entries.
static const char JAMO_V_TABLE[][4] = {
    "A",   "AE",  "YA",  "YAE", "EO",  "E",   "YEO",
    "YE",  "O",   "WA",  "WAE", "OE",  "YO",  "U",
    "WEO", "WE",  "WI",  "YU",  "EU",  "YI",  "I"
};

// Trailing consonants, 28 entries (index 0 is empty: no trailing consonant).
static const char JAMO_T_TABLE[][4] = {
    "",   "G",  "GG", "GS", "N",  "NJ", "NH",
    "D",  "L",  "LG", "LM", "LB", "LS", "LT",
    "LP", "LH", "M",  "B",  "BS", "S",  "SS",
    "NG", "J",  "C",  "K",  "T",  "P",  "H"
};
// clang-format on
0083 bool KCharSelectData::openDataFile()
0084 {
0085     if (!dataFile.isEmpty()) {
0086         return true;
0087     } else {
0088         QFile file(QStringLiteral(":/kf6/kcharselect/kcharselect-data"));
0090         dataFile = file.readAll();
0091         file.close();
0092         if (dataFile.size() < 40) {
0093             dataFile.clear();
0094             return false;
0095         }
0096         const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0097         const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0098         const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0099         uint blocks = (offsetEnd - offsetBegin) / 4;
0100         if (blocks <= 167) { // maximum possible number of blocks in BMP
0101             // no remapping
0102             remapType = -1;
0103         } else if (blocks >= 174 && blocks <= 180) {
0104             // remapping introduced in 5.25
0105             remapType = 0;
0106         } else {
0107             // unknown remapping, abort
0108             dataFile.clear();
0109             return false;
0110         }
0111         futureIndex = (new RunIndexCreation(this, dataFile))->start();
0112         return true;
0113     }
0114 }
0116 // Temporary remapping code points <-> 16 bit database codes
0117 // See for details
0119 quint16 KCharSelectData::mapCodePointToDataBase(uint code) const
0120 {
0121     if (remapType == 0) {
0122         if (code >= 0xE000 && code <= 0xEFFF) {
0123             return 0xFFFF;
0124         }
0125         if (code >= 0xF000 && code <= 0xFFFF) {
0126             return code - 0x1000;
0127         }
0128         if (code >= 0x1F000 && code <= 0x1FFFF) {
0129             return code - 0x10000;
0130         }
0131     }
0132     if (code >= 0x10000) {
0133         return 0xFFFF;
0134     }
0135     return code;
0136 }
0138 uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const
0139 {
0140     if (remapType == 0) {
0141         if (code >= 0xE000 && code <= 0xEFFF) {
0142             return code + 0x1000;
0143         }
0144         if (code >= 0xF000) {
0145             return code + 0x10000;
0146         }
0147     }
0148     return code;
0149 }
0151 quint32 KCharSelectData::getDetailIndex(uint c) const
0152 {
0153     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0154     // Convert from little-endian, so that this code works on PPC too.
0155     //
0156     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 12);
0157     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 16);
0159     int min = 0;
0160     int mid;
0161     int max = ((offsetEnd - offsetBegin) / 27) - 1;
0163     quint16 unicode = mapCodePointToDataBase(c);
0164     if (unicode == 0xFFFF) {
0165         return 0;
0166     }
0168     static quint16 most_recent_searched;
0169     static quint32 most_recent_result;
0171     if (unicode == most_recent_searched) {
0172         return most_recent_result;
0173     }
0175     most_recent_searched = unicode;
0177     while (max >= min) {
0178         mid = (min + max) / 2;
0179         const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 27);
0180         if (unicode > midUnicode) {
0181             min = mid + 1;
0182         } else if (unicode < midUnicode) {
0183             max = mid - 1;
0184         } else {
0185             most_recent_result = offsetBegin + mid * 27;
0187             return most_recent_result;
0188         }
0189     }
0191     most_recent_result = 0;
0192     return 0;
0193 }
0195 QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base)
0196 {
0197     QString s = QString::number(code, base).toUpper();
0198     while (s.size() < length) {
0199         s.prepend(QLatin1Char('0'));
0200     }
0201     s.prepend(prefix);
0202     return s;
0203 }
0205 QList<uint> KCharSelectData::blockContents(int block)
0206 {
0207     if (!openDataFile()) {
0208         return QList<uint>();
0209     }
0211     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0212     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0213     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0215     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0217     QList<uint> res;
0219     if (block > max) {
0220         return res;
0221     }
0223     quint16 unicodeBegin = qFromLittleEndian<quint16>(data + offsetBegin + block * 4);
0224     quint16 unicodeEnd = qFromLittleEndian<quint16>(data + offsetBegin + block * 4 + 2);
0226     while (unicodeBegin < unicodeEnd) {
0227         res.append(mapDataBaseToCodePoint(unicodeBegin));
0228         unicodeBegin++;
0229     }
0230     res.append(mapDataBaseToCodePoint(unicodeBegin)); // Be careful when unicodeEnd==0xffff
0232     return res;
0233 }
0235 QList<int> KCharSelectData::sectionContents(int section)
0236 {
0237     section -= 1;
0238     if (!openDataFile()) {
0239         return QList<int>();
0240     }
0242     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0243     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28);
0244     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32);
0246     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0248     QList<int> res;
0250     if (section > max) {
0251         return res;
0252     }
0254     for (int i = 0; i <= max; i++) {
0255         const quint16 currSection = qFromLittleEndian<quint16>(data + offsetBegin + i * 4);
0256         if (currSection == section || section < 0) {
0257             res.append(qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2));
0258         }
0259     }
0261     return res;
0262 }
0264 QStringList KCharSelectData::sectionList()
0265 {
0266     if (!openDataFile()) {
0267         return QStringList();
0268     }
0270     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0271     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24);
0272     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28);
0274     const char *data = dataFile.constData();
0275     QStringList list;
0276     quint32 i = stringBegin;
0277     list.append(QCoreApplication::translate("KCharSelectData", "All", "KCharSelect section name"));
0278     while (i < stringEnd) {
0279         list.append(QCoreApplication::translate("KCharSelectData", data + i, "KCharSelect section name"));
0280         i += qstrlen(data + i) + 1;
0281     }
0283     return list;
0284 }
0286 QString KCharSelectData::block(uint c)
0287 {
0288     return blockName(blockIndex(c));
0289 }
0291 QString KCharSelectData::section(uint c)
0292 {
0293     return sectionName(sectionIndex(blockIndex(c)));
0294 }
0296 QString KCharSelectData::name(uint c)
0297 {
0298     if (!openDataFile()) {
0299         return QString();
0300     }
0302     if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) {
0303         return QCoreApplication::translate("KCharSelectData", "<noncharacter>");
0304     } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) {
0305         return QLatin1String("CJK UNIFIED IDEOGRAPH-") + formatCode(c, 4, QString());
0306     } else if (c >= 0xAC00 && c <= 0xD7AF) {
0307         /* compute hangul syllable name as per UAX #15 */
0308         int SIndex = c - SBase;
0309         int LIndex;
0310         int VIndex;
0311         int TIndex;
0313         if (SIndex < 0 || SIndex >= SCount) {
0314             return QString();
0315         }
0317         LIndex = SIndex / NCount;
0318         VIndex = (SIndex % NCount) / TCount;
0319         TIndex = SIndex % TCount;
0321         return QLatin1String("HANGUL SYLLABLE ") + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex])
0322             + QLatin1String(JAMO_T_TABLE[TIndex]);
0323     } else if (c >= 0xD800 && c <= 0xDB7F) {
0324         return QCoreApplication::translate("KCharSelectData", "<Non Private Use High Surrogate>");
0325     } else if (c >= 0xDB80 && c <= 0xDBFF) {
0326         return QCoreApplication::translate("KCharSelectData", "<Private Use High Surrogate>");
0327     } else if (c >= 0xDC00 && c <= 0xDFFF) {
0328         return QCoreApplication::translate("KCharSelectData", "<Low Surrogate>");
0329     } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) {
0330         return QCoreApplication::translate("KCharSelectData", "<Private Use>");
0331     } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) {
0332         return QLatin1String("CJK COMPATIBILITY IDEOGRAPH-") + formatCode(c, 4, QString());
0333     }
0334     quint16 unicode = mapCodePointToDataBase(c);
0335     if (unicode == 0xFFFF) {
0336         return QLatin1String("NON-BMP-CHARACTER-") + formatCode(c, 4, QString());
0337     } else {
0338         const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0339         const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4);
0340         const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8);
0342         int min = 0;
0343         int mid;
0344         int max = ((offsetEnd - offsetBegin) / 6) - 1;
0345         QString s;
0347         while (max >= min) {
0348             mid = (min + max) / 2;
0349             const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6);
0350             if (unicode > midUnicode) {
0351                 min = mid + 1;
0352             } else if (unicode < midUnicode) {
0353                 max = mid - 1;
0354             } else {
0355                 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2);
0356                 s = QString::fromUtf8(dataFile.constData() + offset + 1);
0357                 break;
0358             }
0359         }
0361         if (s.isNull()) {
0362             return QCoreApplication::translate("KCharSelectData", "<not assigned>");
0363         } else {
0364             return s;
0365         }
0366     }
0367 }
0369 int KCharSelectData::blockIndex(uint c)
0370 {
0371     if (!openDataFile()) {
0372         return 0;
0373     }
0375     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0376     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0377     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0378     const quint16 unicode = mapCodePointToDataBase(c);
0379     if (unicode == 0xFFFF) {
0380         return 0;
0381     }
0383     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0385     int i = 0;
0387     while (unicode > qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) && i < max) {
0388         i++;
0389     }
0391     return i;
0392 }
0394 int KCharSelectData::sectionIndex(int block)
0395 {
0396     if (!openDataFile()) {
0397         return 0;
0398     }
0400     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0401     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28);
0402     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32);
0404     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0406     for (int i = 0; i <= max; i++) {
0407         if (qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) == block) {
0408             return qFromLittleEndian<quint16>(data + offsetBegin + i * 4) + 1;
0409         }
0410     }
0412     return 0;
0413 }
0415 QString KCharSelectData::blockName(int index)
0416 {
0417     if (!openDataFile()) {
0418         return QString();
0419     }
0421     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0422     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 16);
0423     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 20);
0425     quint32 i = stringBegin;
0426     int currIndex = 0;
0428     const char *data = dataFile.constData();
0429     while (i < stringEnd && currIndex < index) {
0430         i += qstrlen(data + i) + 1;
0431         currIndex++;
0432     }
0434     return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode block name");
0435 }
0437 QString KCharSelectData::sectionName(int index)
0438 {
0439     if (index == 0) {
0440         return QCoreApplication::translate("KCharSelectData", "All", "KCharselect unicode section name");
0441     }
0442     if (!openDataFile()) {
0443         return QString();
0444     }
0446     index -= 1;
0448     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0449     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24);
0450     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28);
0452     quint32 i = stringBegin;
0453     int currIndex = 0;
0455     const char *data = dataFile.constData();
0456     while (i < stringEnd && currIndex < index) {
0457         i += qstrlen(data + i) + 1;
0458         currIndex++;
0459     }
0461     return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode section name");
0462 }
0464 QStringList KCharSelectData::aliases(uint c)
0465 {
0466     if (!openDataFile()) {
0467         return QStringList();
0468     }
0469     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0470     const int detailIndex = getDetailIndex(c);
0471     if (detailIndex == 0) {
0472         return QStringList();
0473     }
0475     const quint8 count = *(quint8 *)(udata + detailIndex + 6);
0476     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 2);
0478     QStringList aliases;
0479     aliases.reserve(count);
0481     const char *data = dataFile.constData();
0482     for (int i = 0; i < count; i++) {
0483         aliases.append(QString::fromUtf8(data + offset));
0484         offset += qstrlen(data + offset) + 1;
0485     }
0486     return aliases;
0487 }
0489 QStringList KCharSelectData::notes(uint c)
0490 {
0491     if (!openDataFile()) {
0492         return QStringList();
0493     }
0494     const int detailIndex = getDetailIndex(c);
0495     if (detailIndex == 0) {
0496         return QStringList();
0497     }
0499     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0500     const quint8 count = *(quint8 *)(udata + detailIndex + 11);
0501     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 7);
0503     QStringList notes;
0504     notes.reserve(count);
0506     const char *data = dataFile.constData();
0507     for (int i = 0; i < count; i++) {
0508         notes.append(QString::fromUtf8(data + offset));
0509         offset += qstrlen(data + offset) + 1;
0510     }
0512     return notes;
0513 }
0515 QList<uint> KCharSelectData::seeAlso(uint c)
0516 {
0517     if (!openDataFile()) {
0518         return QList<uint>();
0519     }
0520     const int detailIndex = getDetailIndex(c);
0521     if (detailIndex == 0) {
0522         return QList<uint>();
0523     }
0525     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0526     const quint8 count = *(quint8 *)(udata + detailIndex + 26);
0527     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 22);
0529     QList<uint> seeAlso;
0530     seeAlso.reserve(count);
0532     for (int i = 0; i < count; i++) {
0533         seeAlso.append(mapDataBaseToCodePoint(qFromLittleEndian<quint16>(udata + offset)));
0534         offset += 2;
0535     }
0537     return seeAlso;
0538 }
0540 QStringList KCharSelectData::equivalents(uint c)
0541 {
0542     if (!openDataFile()) {
0543         return QStringList();
0544     }
0545     const int detailIndex = getDetailIndex(c);
0546     if (detailIndex == 0) {
0547         return QStringList();
0548     }
0550     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0551     const quint8 count = *(quint8 *)(udata + detailIndex + 21);
0552     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 17);
0554     QStringList equivalents;
0555     equivalents.reserve(count);
0557     const char *data = dataFile.constData();
0558     for (int i = 0; i < count; i++) {
0559         equivalents.append(QString::fromUtf8(data + offset));
0560         offset += qstrlen(data + offset) + 1;
0561     }
0563     return equivalents;
0564 }
0566 QStringList KCharSelectData::approximateEquivalents(uint c)
0567 {
0568     if (!openDataFile()) {
0569         return QStringList();
0570     }
0571     const int detailIndex = getDetailIndex(c);
0572     if (detailIndex == 0) {
0573         return QStringList();
0574     }
0576     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0577     const quint8 count = *(quint8 *)(udata + detailIndex + 16);
0578     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 12);
0580     QStringList approxEquivalents;
0581     approxEquivalents.reserve(count);
0583     const char *data = dataFile.constData();
0584     for (int i = 0; i < count; i++) {
0585         approxEquivalents.append(QString::fromUtf8(data + offset));
0586         offset += qstrlen(data + offset) + 1;
0587     }
0589     return approxEquivalents;
0590 }
0592 QList<uint> KCharSelectData::decomposition(uint c)
0593 {
0594     // for now, only decompose Hangul Syllable into Hangul Jamo
0595     uint SIndex = c - SBase;
0596     if (SIndex >= SCount) {
0597         return QList<uint>();
0598     }
0600     uint L = LBase + SIndex / NCount; // Choseong
0601     uint V = VBase + (SIndex % NCount) / TCount; // Jungseong
0602     uint T = TBase + SIndex % TCount; // Jongsung
0603     QList<uint> jamoList;
0604     jamoList.append(L);
0605     jamoList.append(V);
0606     if (T != TBase) {
0607         jamoList.append(T);
0608     }
0609     return jamoList;
0610 }
0612 QStringList KCharSelectData::unihanInfo(uint c)
0613 {
0614     if (!openDataFile()) {
0615         return QStringList();
0616     }
0618     quint16 unicode = mapCodePointToDataBase(c);
0619     if (unicode == 0xFFFF) {
0620         return QStringList();
0621     }
0623     const char *data = dataFile.constData();
0624     const uchar *udata = reinterpret_cast<const uchar *>(data);
0625     const quint32 offsetBegin = qFromLittleEndian<quint32>(udata + 36);
0626     const quint32 offsetEnd = dataFile.size();
0628     int min = 0;
0629     int mid;
0630     int max = ((offsetEnd - offsetBegin) / 30) - 1;
0632     while (max >= min) {
0633         mid = (min + max) / 2;
0634         const quint16 midUnicode = qFromLittleEndian<quint16>(udata + offsetBegin + mid * 30);
0635         if (unicode > midUnicode) {
0636             min = mid + 1;
0637         } else if (unicode < midUnicode) {
0638             max = mid - 1;
0639         } else {
0640             QStringList res;
0641             res.reserve(7);
0642             for (int i = 0; i < 7; i++) {
0643                 quint32 offset = qFromLittleEndian<quint32>(udata + offsetBegin + mid * 30 + 2 + i * 4);
0644                 if (offset != 0) {
0645                     res.append(QString::fromUtf8(data + offset));
0646                 } else {
0647                     res.append(QString());
0648                 }
0649             }
0650             return res;
0651         }
0652     }
0654     return QStringList();
0655 }
0657 QChar::Category KCharSelectData::category(uint c)
0658 {
0659     if (!openDataFile()) {
0660         return QChar::category(c);
0661     }
0663     ushort unicode = mapCodePointToDataBase(c);
0664     if (unicode == 0xFFFF) {
0665         return QChar::category(c);
0666     }
0668     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0669     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4);
0670     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8);
0672     int min = 0;
0673     int mid;
0674     int max = ((offsetEnd - offsetBegin) / 6) - 1;
0676     while (max >= min) {
0677         mid = (min + max) / 2;
0678         const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6);
0679         if (unicode > midUnicode) {
0680             min = mid + 1;
0681         } else if (unicode < midUnicode) {
0682             max = mid - 1;
0683         } else {
0684             quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2);
0685             uchar categoryCode = *(data + offset);
0686             Q_ASSERT(categoryCode > 0);
0687             categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1
0688                                See QtBase commit d17c76feee9eece4 */
0689             return QChar::Category(categoryCode);
0690         }
0691     }
0693     return QChar::category(c);
0694 }
0696 bool KCharSelectData::isPrint(uint c)
0697 {
0698     QChar::Category cat = category(c);
0699     return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned);
0700 }
0702 bool KCharSelectData::isDisplayable(uint c)
0703 {
0704     // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames.
0705     // They should be seen as non-printable characters, as trying to display them leads
0706     //  to a crash caused by a Qt "noBlockInString" assertion.
0707     if (c == 0xFDD0 || c == 0xFDD1) {
0708         return false;
0709     }
0711     return !isIgnorable(c) && isPrint(c);
0712 }
0714 bool KCharSelectData::isIgnorable(uint c)
0715 {
0716     /*
0717      * According to the Unicode standard, Default Ignorable Code Points
0718      * should be ignored unless explicitly supported. For example, U+202E
0719      * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying
0720      * it gives the undesired effect of all text being turned RTL. We do not
0721      * have a way to "explicitly" support it, so we will treat it as
0722      * non-printable.
0723      *
0724      * There is a list of these on
0725      * under the
0726      * property Default_Ignorable_Code_Point.
0727      */
0729     // NOTE: not very nice to hardcode these here; is it worth it to modify
0730     //      the binary data file to hold them?
0731     // clang-format off
0732     return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 ||
0733            c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) ||
0734            (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) ||
0735            (c >= 0x2060 && c <= 0x206F) || c == 0x3164 ||
0736            (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 ||
0737            (c >= 0xFFF0 && c <= 0xFFF8);
0738     // clang-format on
0739 }
0741 bool KCharSelectData::isCombining(uint c)
0742 {
0743     return section(c) == QCoreApplication::translate("KCharSelectData", "Combining Diacritics", "KCharSelect section name");
0744     // FIXME: this is an imperfect test. There are many combining characters
0745     //       that are outside of this section. See Grapheme_Extend in
0746     //
0747 }
0749 QString KCharSelectData::display(uint c, const QFont &font)
0750 {
0751     if (!isDisplayable(c)) {
0752         return QLatin1String("<b>") + QCoreApplication::translate("KCharSelectData", "Non-printable") + QLatin1String("</b>");
0753     } else {
0754         QString s = QLatin1String("<font size=\"+4\" face=\"") + + QLatin1String("\">");
0755         if (isCombining(c)) {
0756             s += displayCombining(c);
0757         } else {
0758             s += QLatin1String("&#") + QString::number(c) + QLatin1Char(';');
0759         }
0760         s += QLatin1String("</font>");
0761         return s;
0762     }
0763 }
0765 QString KCharSelectData::displayCombining(uint c)
0766 {
0767     /*
0768      * The purpose of this is to make it easier to see how a combining
0769      * character affects the text around it.
0770      * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose,
0771      * as seen in pdfs from Unicode, but there seem to be a lot of alignment
0772      * problems with that.
0773      *
0774      * Eventually, it would be nice to determine whether the character
0775      * combines to the left or to the right, etc.
0776      */
0777     QString s = QLatin1String("&nbsp;&#") + QString::number(c) + QLatin1String(";&nbsp;") + QLatin1String(" (ab&#") + QString::number(c) + QLatin1String(";c)");
0778     return s;
0779 }
0781 QString KCharSelectData::categoryText(QChar::Category category)
0782 {
0783     switch (category) {
0784     case QChar::Other_Control:
0785         return QCoreApplication::translate("KCharSelectData", "Other, Control");
0786     case QChar::Other_Format:
0787         return QCoreApplication::translate("KCharSelectData", "Other, Format");
0788     case QChar::Other_NotAssigned:
0789         return QCoreApplication::translate("KCharSelectData", "Other, Not Assigned");
0790     case QChar::Other_PrivateUse:
0791         return QCoreApplication::translate("KCharSelectData", "Other, Private Use");
0792     case QChar::Other_Surrogate:
0793         return QCoreApplication::translate("KCharSelectData", "Other, Surrogate");
0794     case QChar::Letter_Lowercase:
0795         return QCoreApplication::translate("KCharSelectData", "Letter, Lowercase");
0796     case QChar::Letter_Modifier:
0797         return QCoreApplication::translate("KCharSelectData", "Letter, Modifier");
0798     case QChar::Letter_Other:
0799         return QCoreApplication::translate("KCharSelectData", "Letter, Other");
0800     case QChar::Letter_Titlecase:
0801         return QCoreApplication::translate("KCharSelectData", "Letter, Titlecase");
0802     case QChar::Letter_Uppercase:
0803         return QCoreApplication::translate("KCharSelectData", "Letter, Uppercase");
0804     case QChar::Mark_SpacingCombining:
0805         return QCoreApplication::translate("KCharSelectData", "Mark, Spacing Combining");
0806     case QChar::Mark_Enclosing:
0807         return QCoreApplication::translate("KCharSelectData", "Mark, Enclosing");
0808     case QChar::Mark_NonSpacing:
0809         return QCoreApplication::translate("KCharSelectData", "Mark, Non-Spacing");
0810     case QChar::Number_DecimalDigit:
0811         return QCoreApplication::translate("KCharSelectData", "Number, Decimal Digit");
0812     case QChar::Number_Letter:
0813         return QCoreApplication::translate("KCharSelectData", "Number, Letter");
0814     case QChar::Number_Other:
0815         return QCoreApplication::translate("KCharSelectData", "Number, Other");
0816     case QChar::Punctuation_Connector:
0817         return QCoreApplication::translate("KCharSelectData", "Punctuation, Connector");
0818     case QChar::Punctuation_Dash:
0819         return QCoreApplication::translate("KCharSelectData", "Punctuation, Dash");
0820     case QChar::Punctuation_Close:
0821         return QCoreApplication::translate("KCharSelectData", "Punctuation, Close");
0822     case QChar::Punctuation_FinalQuote:
0823         return QCoreApplication::translate("KCharSelectData", "Punctuation, Final Quote");
0824     case QChar::Punctuation_InitialQuote:
0825         return QCoreApplication::translate("KCharSelectData", "Punctuation, Initial Quote");
0826     case QChar::Punctuation_Other:
0827         return QCoreApplication::translate("KCharSelectData", "Punctuation, Other");
0828     case QChar::Punctuation_Open:
0829         return QCoreApplication::translate("KCharSelectData", "Punctuation, Open");
0830     case QChar::Symbol_Currency:
0831         return QCoreApplication::translate("KCharSelectData", "Symbol, Currency");
0832     case QChar::Symbol_Modifier:
0833         return QCoreApplication::translate("KCharSelectData", "Symbol, Modifier");
0834     case QChar::Symbol_Math:
0835         return QCoreApplication::translate("KCharSelectData", "Symbol, Math");
0836     case QChar::Symbol_Other:
0837         return QCoreApplication::translate("KCharSelectData", "Symbol, Other");
0838     case QChar::Separator_Line:
0839         return QCoreApplication::translate("KCharSelectData", "Separator, Line");
0840     case QChar::Separator_Paragraph:
0841         return QCoreApplication::translate("KCharSelectData", "Separator, Paragraph");
0842     case QChar::Separator_Space:
0843         return QCoreApplication::translate("KCharSelectData", "Separator, Space");
0844     default:
0845         return QCoreApplication::translate("KCharSelectData", "Unknown");
0846     }
0847 }
0849 QList<uint> KCharSelectData::find(const QString &needle)
0850 {
0851     QSet<uint> result;
0853     QList<uint> returnRes;
0854     QString simplified = needle.length() > 1 ? needle.simplified() : needle;
0855     QStringList searchStrings;
0857     static const QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$"));
0858     if (octalExp.match(simplified).hasMatch()) {
0859         // search for C octal escaped UTF-8
0860         QByteArray utf8;
0861         int byte = -1;
0862         for (int i = 0; i <= simplified.length(); ++i) {
0863             int c =;
0864             if (c >= '0' && c <= '7') {
0865                 byte = 8 * byte + c - '0';
0866             } else if (byte == -1) {
0867                 byte = 0;
0868             } else if (byte >= 0x00 && byte <= 0xFF) {
0869                 utf8.append((char)byte);
0870                 byte = 0;
0871             }
0872         }
0873         simplified = QString::fromUtf8(utf8);
0874     }
0876     if (simplified.length() <= 2) {
0877         QList<uint> ucs4 = simplified.toUcs4();
0878         if (ucs4.size() == 1) {
0879             // search for hex representation of the character
0880             searchStrings = QStringList(formatCode(;
0881         } else {
0882             searchStrings = splitString(simplified);
0883         }
0884     } else {
0885         searchStrings = splitString(simplified);
0886     }
0888     if (searchStrings.isEmpty()) {
0889         return returnRes;
0890     }
0892     static const QRegularExpression hexExp(QStringLiteral("^(?:|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$"));
0893     for (const QString &s : std::as_const(searchStrings)) {
0894         const QRegularExpressionMatch match = hexExp.match(s);
0895         if (match.hasMatch()) {
0896             const QString cap = match.captured(1);
0897             returnRes.append(cap.toInt(nullptr, 16));
0898             // search for "1234" instead of "0x1234"
0899             if (s.length() == 6 || s.length() == 7) {
0900                 searchStrings[searchStrings.indexOf(s)] = cap;
0901             }
0902         }
0903         // try to parse string as decimal number
0904         bool ok;
0905         int unicode = s.toInt(&ok);
0906         if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) {
0907             returnRes.append(unicode);
0908         }
0909     }
0911     bool firstSubString = true;
0912     for (const QString &s : std::as_const(searchStrings)) {
0913         QSet<uint> partResult = getMatchingChars(s.toLower());
0914         if (firstSubString) {
0915             result = partResult;
0916             firstSubString = false;
0917         } else {
0918             result = result.intersect(partResult);
0919         }
0920     }
0922     // remove results found by matching the code point to prevent duplicate results
0923     // while letting these characters stay at the beginning
0924     for (uint c : std::as_const(returnRes)) {
0925         result.remove(c);
0926     }
0928     QList<uint> sortedResult;
0929     sortedResult.reserve(result.count());
0930     for (auto c : std::as_const(result)) {
0931         sortedResult.append(c);
0932     }
0933     std::sort(sortedResult.begin(), sortedResult.end());
0935     returnRes += sortedResult;
0936     return returnRes;
0937 }
0939 QSet<uint> KCharSelectData::getMatchingChars(const QString &s)
0940 {
0941     if (dataFile.isEmpty()) {
0942         return QSet<uint>();
0943     }
0944     futureIndex.waitForFinished();
0945     const Index index = futureIndex.result();
0946     Index::const_iterator pos = index.lowerBound(s);
0947     QSet<uint> result;
0949     while (pos != index.constEnd() && pos.key().startsWith(s)) {
0950         for (quint16 c : pos.value()) {
0951             result.insert(mapDataBaseToCodePoint(c));
0952         }
0953         ++pos;
0954     }
0956     return result;
0957 }
0959 QStringList KCharSelectData::splitString(const QString &s)
0960 {
0961     QStringList result;
0962     int start = 0;
0963     int end = 0;
0964     int length = s.length();
0965     while (end < length) {
0966         while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
0967             end++;
0968         }
0969         if (start != end) {
0970             result.append(s.mid(start, end - start));
0971         }
0972         start = end;
0973         while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
0974             end++;
0975             start++;
0976         }
0977     }
0978     return result;
0979 }
0981 void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s)
0982 {
0983     const QStringList strings = splitString(s);
0984     for (const QString &s : strings) {
0985         (*index)[s.toLower()].append(unicode);
0986     }
0987 }
0989 Index KCharSelectData::createIndex(const QByteArray &dataFile)
0990 {
0991     Index i;
0993     // character names
0994     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0995     const char *data = dataFile.constData();
0996     const quint32 nameOffsetBegin = qFromLittleEndian<quint32>(udata + 4);
0997     const quint32 nameOffsetEnd = qFromLittleEndian<quint32>(udata + 8);
0999     int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1;
1001     for (int pos = 0; pos <= max; pos++) {
1002         const quint16 unicode = qFromLittleEndian<quint16>(udata + nameOffsetBegin + pos * 6);
1003         quint32 offset = qFromLittleEndian<quint32>(udata + nameOffsetBegin + pos * 6 + 2);
1004         appendToIndex(&i, unicode, QString::fromUtf8(data + offset + 1));
1005     }
1007     // details
1008     const quint32 detailsOffsetBegin = qFromLittleEndian<quint32>(udata + 12);
1009     const quint32 detailsOffsetEnd = qFromLittleEndian<quint32>(udata + 16);
1011     max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1;
1013     for (int pos = 0; pos <= max; pos++) {
1014         const quint16 unicode = qFromLittleEndian<quint16>(udata + detailsOffsetBegin + pos * 27);
1016         // aliases
1017         const quint8 aliasCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6);
1018         quint32 aliasOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 2);
1020         for (int j = 0; j < aliasCount; j++) {
1021             appendToIndex(&i, unicode, QString::fromUtf8(data + aliasOffset));
1022             aliasOffset += qstrlen(data + aliasOffset) + 1;
1023         }
1025         // notes
1026         const quint8 notesCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11);
1027         quint32 notesOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 7);
1029         for (int j = 0; j < notesCount; j++) {
1030             appendToIndex(&i, unicode, QString::fromUtf8(data + notesOffset));
1031             notesOffset += qstrlen(data + notesOffset) + 1;
1032         }
1034         // approximate equivalents
1035         const quint8 apprCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16);
1036         quint32 apprOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 12);
1038         for (int j = 0; j < apprCount; j++) {
1039             appendToIndex(&i, unicode, QString::fromUtf8(data + apprOffset));
1040             apprOffset += qstrlen(data + apprOffset) + 1;
1041         }
1043         // equivalents
1044         const quint8 equivCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21);
1045         quint32 equivOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 17);
1047         for (int j = 0; j < equivCount; j++) {
1048             appendToIndex(&i, unicode, QString::fromUtf8(data + equivOffset));
1049             equivOffset += qstrlen(data + equivOffset) + 1;
1050         }
1052         // see also - convert to string (hex)
1053         const quint8 seeAlsoCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26);
1054         quint32 seeAlsoOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 22);
1056         for (int j = 0; j < seeAlsoCount; j++) {
1057             quint16 seeAlso = qFromLittleEndian<quint16>(udata + seeAlsoOffset);
1058             appendToIndex(&i, unicode, formatCode(seeAlso, 4, QString()));
1059             equivOffset += qstrlen(data + equivOffset) + 1;
1060         }
1061     }
1063     // unihan data
1064     // temporary disabled due to the huge amount of data
1065     // const quint32 unihanOffsetBegin = qFromLittleEndian<quint32>(udata+36);
1066     // const quint32 unihanOffsetEnd = dataFile.size();
1067     // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1;
1068     //
1069     // for (int pos = 0; pos <= max; pos++) {
1070     //     const quint16 unicode = qFromLittleEndian<quint16>(udata + unihanOffsetBegin + pos*30);
1071     //     for(int j = 0; j < 7; j++) {
1072     //         quint32 offset = qFromLittleEndian<quint32>(udata + unihanOffsetBegin + pos*30 + 2 + j*4);
1073     //         if(offset != 0) {
1074     //             appendToIndex(&i, unicode, QString::fromUtf8(data + offset));
1075     //         }
1076     //     }
1077     // }
1079     return i;
1080 }