File indexing completed on 2024-04-28 15:31:58

0001 /*
0002     This file is part of the KDE libraries
0003     SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 
0008 #include "kcharselectdata_p.h"
0009 
0010 #include <QCoreApplication>
0011 #include <QFile>
0012 #include <QFutureInterface>
0013 #include <QRegularExpression>
0014 #include <QRunnable>
0015 #include <QStringList>
0016 #include <QThreadPool>
0017 #include <qendian.h>
0018 
0019 #include <../test-config.h>
0020 #include <qstandardpaths.h>
0021 #include <string.h>
0022 
/* Constants for Hangul (de)composition, see UAX #15.
 *
 * These are typed constants rather than preprocessor macros so that they obey
 * normal scoping rules. They are deliberately plain `int`, i.e. the same type
 * the former literal macros had, so every expression using them keeps its
 * previous arithmetic and comparison semantics.
 */
static constexpr int SBase = 0xAC00; // first precomposed Hangul syllable
static constexpr int LBase = 0x1100; // first leading consonant (choseong) jamo
static constexpr int VBase = 0x1161; // first vowel (jungseong) jamo
static constexpr int TBase = 0x11A7; // base for trailing consonants (jongseong); TBase itself means "none"
static constexpr int LCount = 19; // number of leading consonants
static constexpr int VCount = 21; // number of vowels
static constexpr int TCount = 28; // number of trailing consonants, including "none"
static constexpr int NCount = VCount * TCount; // syllables per leading consonant
static constexpr int SCount = LCount * NCount; // total number of precomposed syllables
0033 
0034 class RunIndexCreation : public QFutureInterface<Index>, public QRunnable
0035 {
0036 public:
0037     RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile)
0038         : m_data(data)
0039         , m_dataFile(dataFile)
0040     {
0041     }
0042 
0043     QFuture<Index> start()
0044     {
0045         setRunnable(this);
0046         reportStarted();
0047         QFuture<Index> f = this->future();
0048         QThreadPool::globalInstance()->start(this);
0049         return f;
0050     }
0051 
0052     void run() override
0053     {
0054         Index index = m_data->createIndex(m_dataFile);
0055         reportResult(index);
0056         reportFinished(nullptr);
0057     }
0058 
0059 private:
0060     KCharSelectData *const m_data;
0061     const QByteArray m_dataFile;
0062 };
0063 
// Short romanised names of the Hangul jamo, used to compose syllable names.
// Each table is indexed by the jamo's position within its group; every entry is
// at most three characters plus the terminating NUL.
// clang-format off

// Leading consonants; the empty entry at index 11 contributes nothing to the name.
static const char JAMO_L_TABLE[][4] = {
    "G", "GG", "N", "D", "DD",
    "R", "M", "B", "BB", "S",
    "SS", "", "J", "JJ", "C",
    "K", "T", "P", "H"
};

// Vowels.
static const char JAMO_V_TABLE[][4] = {
    "A", "AE", "YA", "YAE", "EO", "E", "YEO",
    "YE", "O", "WA", "WAE", "OE", "YO", "U",
    "WEO", "WE", "WI", "YU", "EU", "YI", "I"
};

// Trailing consonants; index 0 is the empty string (no trailing consonant).
static const char JAMO_T_TABLE[][4] = {
    "", "G", "GG", "GS", "N", "NJ", "NH",
    "D", "L", "LG", "LM", "LB", "LS", "LT",
    "LP", "LH", "M", "B", "BS", "S", "SS",
    "NG", "J", "C", "K", "T", "P", "H"
};
// clang-format on
0082 
0083 bool KCharSelectData::openDataFile()
0084 {
0085     if (!dataFile.isEmpty()) {
0086         return true;
0087     } else {
0088         const QString kcharselectDataPath = QStringLiteral("kf" QT_STRINGIFY(QT_VERSION_MAJOR) "/kcharselect/kcharselect-data");
0089         QString fileName = QStandardPaths::locate(QStandardPaths::GenericDataLocation, kcharselectDataPath);
0090         if (fileName.isEmpty()) {
0091             fileName = QStringLiteral(TOP_SRCDIR "/src/kcharselect-data"); // for autotests before installation
0092         }
0093         QFile file(fileName);
0094         if (!file.open(QIODevice::ReadOnly)) {
0095             qWarning() << "Couldn't find " << kcharselectDataPath << " in the install prefix (under GenericDataLocation) nor in the builtin path" << TOP_SRCDIR;
0096             return false;
0097         }
0098         dataFile = file.readAll();
0099         file.close();
0100         if (dataFile.size() < 40) {
0101             dataFile.clear();
0102             return false;
0103         }
0104         const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0105         const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0106         const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0107         uint blocks = (offsetEnd - offsetBegin) / 4;
0108         if (blocks <= 167) { // maximum possible number of blocks in BMP
0109             // no remapping
0110             remapType = -1;
0111         } else if (blocks >= 174 && blocks <= 180) {
0112             // remapping introduced in 5.25
0113             remapType = 0;
0114         } else {
0115             // unknown remapping, abort
0116             dataFile.clear();
0117             return false;
0118         }
0119         futureIndex = (new RunIndexCreation(this, dataFile))->start();
0120         return true;
0121     }
0122 }
0123 
0124 // Temporary remapping code points <-> 16 bit database codes
0125 // See kcharselect-generate-datafile.py for details
0126 
0127 quint16 KCharSelectData::mapCodePointToDataBase(uint code) const
0128 {
0129     if (remapType == 0) {
0130         if (code >= 0xE000 && code <= 0xEFFF) {
0131             return 0xFFFF;
0132         }
0133         if (code >= 0xF000 && code <= 0xFFFF) {
0134             return code - 0x1000;
0135         }
0136         if (code >= 0x1F000 && code <= 0x1FFFF) {
0137             return code - 0x10000;
0138         }
0139     }
0140     if (code >= 0x10000) {
0141         return 0xFFFF;
0142     }
0143     return code;
0144 }
0145 
0146 uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const
0147 {
0148     if (remapType == 0) {
0149         if (code >= 0xE000 && code <= 0xEFFF) {
0150             return code + 0x1000;
0151         }
0152         if (code >= 0xF000) {
0153             return code + 0x10000;
0154         }
0155     }
0156     return code;
0157 }
0158 
0159 quint32 KCharSelectData::getDetailIndex(uint c) const
0160 {
0161     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0162     // Convert from little-endian, so that this code works on PPC too.
0163     // http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286
0164     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 12);
0165     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 16);
0166 
0167     int min = 0;
0168     int mid;
0169     int max = ((offsetEnd - offsetBegin) / 27) - 1;
0170 
0171     quint16 unicode = mapCodePointToDataBase(c);
0172     if (unicode == 0xFFFF) {
0173         return 0;
0174     }
0175 
0176     static quint16 most_recent_searched;
0177     static quint32 most_recent_result;
0178 
0179     if (unicode == most_recent_searched) {
0180         return most_recent_result;
0181     }
0182 
0183     most_recent_searched = unicode;
0184 
0185     while (max >= min) {
0186         mid = (min + max) / 2;
0187         const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 27);
0188         if (unicode > midUnicode) {
0189             min = mid + 1;
0190         } else if (unicode < midUnicode) {
0191             max = mid - 1;
0192         } else {
0193             most_recent_result = offsetBegin + mid * 27;
0194 
0195             return most_recent_result;
0196         }
0197     }
0198 
0199     most_recent_result = 0;
0200     return 0;
0201 }
0202 
0203 QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base)
0204 {
0205     QString s = QString::number(code, base).toUpper();
0206     while (s.size() < length) {
0207         s.prepend(QLatin1Char('0'));
0208     }
0209     s.prepend(prefix);
0210     return s;
0211 }
0212 
0213 QVector<uint> KCharSelectData::blockContents(int block)
0214 {
0215     if (!openDataFile()) {
0216         return QVector<uint>();
0217     }
0218 
0219     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0220     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0221     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0222 
0223     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0224 
0225     QVector<uint> res;
0226 
0227     if (block > max) {
0228         return res;
0229     }
0230 
0231     quint16 unicodeBegin = qFromLittleEndian<quint16>(data + offsetBegin + block * 4);
0232     quint16 unicodeEnd = qFromLittleEndian<quint16>(data + offsetBegin + block * 4 + 2);
0233 
0234     while (unicodeBegin < unicodeEnd) {
0235         res.append(mapDataBaseToCodePoint(unicodeBegin));
0236         unicodeBegin++;
0237     }
0238     res.append(mapDataBaseToCodePoint(unicodeBegin)); // Be careful when unicodeEnd==0xffff
0239 
0240     return res;
0241 }
0242 
0243 QVector<int> KCharSelectData::sectionContents(int section)
0244 {
0245     section -= 1;
0246     if (!openDataFile()) {
0247         return QVector<int>();
0248     }
0249 
0250     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0251     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28);
0252     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32);
0253 
0254     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0255 
0256     QVector<int> res;
0257 
0258     if (section > max) {
0259         return res;
0260     }
0261 
0262     for (int i = 0; i <= max; i++) {
0263         const quint16 currSection = qFromLittleEndian<quint16>(data + offsetBegin + i * 4);
0264         if (currSection == section || section < 0) {
0265             res.append(qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2));
0266         }
0267     }
0268 
0269     return res;
0270 }
0271 
0272 QStringList KCharSelectData::sectionList()
0273 {
0274     if (!openDataFile()) {
0275         return QStringList();
0276     }
0277 
0278     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0279     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24);
0280     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28);
0281 
0282     const char *data = dataFile.constData();
0283     QStringList list;
0284     quint32 i = stringBegin;
0285     list.append(QCoreApplication::translate("KCharSelectData", "All", "KCharSelect section name"));
0286     while (i < stringEnd) {
0287         list.append(QCoreApplication::translate("KCharSelectData", data + i, "KCharSelect section name"));
0288         i += qstrlen(data + i) + 1;
0289     }
0290 
0291     return list;
0292 }
0293 
0294 QString KCharSelectData::block(uint c)
0295 {
0296     return blockName(blockIndex(c));
0297 }
0298 
0299 QString KCharSelectData::section(uint c)
0300 {
0301     return sectionName(sectionIndex(blockIndex(c)));
0302 }
0303 
0304 QString KCharSelectData::name(uint c)
0305 {
0306     if (!openDataFile()) {
0307         return QString();
0308     }
0309 
0310     if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) {
0311         return QCoreApplication::translate("KCharSelectData", "<noncharacter>");
0312     } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) {
0313         return QLatin1String("CJK UNIFIED IDEOGRAPH-") + formatCode(c, 4, QString());
0314     } else if (c >= 0xAC00 && c <= 0xD7AF) {
0315         /* compute hangul syllable name as per UAX #15 */
0316         int SIndex = c - SBase;
0317         int LIndex;
0318         int VIndex;
0319         int TIndex;
0320 
0321         if (SIndex < 0 || SIndex >= SCount) {
0322             return QString();
0323         }
0324 
0325         LIndex = SIndex / NCount;
0326         VIndex = (SIndex % NCount) / TCount;
0327         TIndex = SIndex % TCount;
0328 
0329         return QLatin1String("HANGUL SYLLABLE ") + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex])
0330             + QLatin1String(JAMO_T_TABLE[TIndex]);
0331     } else if (c >= 0xD800 && c <= 0xDB7F) {
0332         return QCoreApplication::translate("KCharSelectData", "<Non Private Use High Surrogate>");
0333     } else if (c >= 0xDB80 && c <= 0xDBFF) {
0334         return QCoreApplication::translate("KCharSelectData", "<Private Use High Surrogate>");
0335     } else if (c >= 0xDC00 && c <= 0xDFFF) {
0336         return QCoreApplication::translate("KCharSelectData", "<Low Surrogate>");
0337     } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) {
0338         return QCoreApplication::translate("KCharSelectData", "<Private Use>");
0339     } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) {
0340         return QLatin1String("CJK COMPATIBILITY IDEOGRAPH-") + formatCode(c, 4, QString());
0341     }
0342     quint16 unicode = mapCodePointToDataBase(c);
0343     if (unicode == 0xFFFF) {
0344         return QLatin1String("NON-BMP-CHARACTER-") + formatCode(c, 4, QString());
0345     } else {
0346         const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0347         const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4);
0348         const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8);
0349 
0350         int min = 0;
0351         int mid;
0352         int max = ((offsetEnd - offsetBegin) / 6) - 1;
0353         QString s;
0354 
0355         while (max >= min) {
0356             mid = (min + max) / 2;
0357             const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6);
0358             if (unicode > midUnicode) {
0359                 min = mid + 1;
0360             } else if (unicode < midUnicode) {
0361                 max = mid - 1;
0362             } else {
0363                 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2);
0364                 s = QString::fromUtf8(dataFile.constData() + offset + 1);
0365                 break;
0366             }
0367         }
0368 
0369         if (s.isNull()) {
0370             return QCoreApplication::translate("KCharSelectData", "<not assigned>");
0371         } else {
0372             return s;
0373         }
0374     }
0375 }
0376 
0377 int KCharSelectData::blockIndex(uint c)
0378 {
0379     if (!openDataFile()) {
0380         return 0;
0381     }
0382 
0383     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0384     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0385     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0386     const quint16 unicode = mapCodePointToDataBase(c);
0387     if (unicode == 0xFFFF) {
0388         return 0;
0389     }
0390 
0391     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0392 
0393     int i = 0;
0394 
0395     while (unicode > qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) && i < max) {
0396         i++;
0397     }
0398 
0399     return i;
0400 }
0401 
0402 int KCharSelectData::sectionIndex(int block)
0403 {
0404     if (!openDataFile()) {
0405         return 0;
0406     }
0407 
0408     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0409     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28);
0410     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32);
0411 
0412     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0413 
0414     for (int i = 0; i <= max; i++) {
0415         if (qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) == block) {
0416             return qFromLittleEndian<quint16>(data + offsetBegin + i * 4) + 1;
0417         }
0418     }
0419 
0420     return 0;
0421 }
0422 
0423 QString KCharSelectData::blockName(int index)
0424 {
0425     if (!openDataFile()) {
0426         return QString();
0427     }
0428 
0429     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0430     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 16);
0431     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 20);
0432 
0433     quint32 i = stringBegin;
0434     int currIndex = 0;
0435 
0436     const char *data = dataFile.constData();
0437     while (i < stringEnd && currIndex < index) {
0438         i += qstrlen(data + i) + 1;
0439         currIndex++;
0440     }
0441 
0442     return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode block name");
0443 }
0444 
0445 QString KCharSelectData::sectionName(int index)
0446 {
0447     if (index == 0) {
0448         return QCoreApplication::translate("KCharSelectData", "All", "KCharselect unicode section name");
0449     }
0450     if (!openDataFile()) {
0451         return QString();
0452     }
0453 
0454     index -= 1;
0455 
0456     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0457     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24);
0458     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28);
0459 
0460     quint32 i = stringBegin;
0461     int currIndex = 0;
0462 
0463     const char *data = dataFile.constData();
0464     while (i < stringEnd && currIndex < index) {
0465         i += qstrlen(data + i) + 1;
0466         currIndex++;
0467     }
0468 
0469     return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode section name");
0470 }
0471 
0472 QStringList KCharSelectData::aliases(uint c)
0473 {
0474     if (!openDataFile()) {
0475         return QStringList();
0476     }
0477     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0478     const int detailIndex = getDetailIndex(c);
0479     if (detailIndex == 0) {
0480         return QStringList();
0481     }
0482 
0483     const quint8 count = *(quint8 *)(udata + detailIndex + 6);
0484     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 2);
0485 
0486     QStringList aliases;
0487     aliases.reserve(count);
0488 
0489     const char *data = dataFile.constData();
0490     for (int i = 0; i < count; i++) {
0491         aliases.append(QString::fromUtf8(data + offset));
0492         offset += qstrlen(data + offset) + 1;
0493     }
0494     return aliases;
0495 }
0496 
0497 QStringList KCharSelectData::notes(uint c)
0498 {
0499     if (!openDataFile()) {
0500         return QStringList();
0501     }
0502     const int detailIndex = getDetailIndex(c);
0503     if (detailIndex == 0) {
0504         return QStringList();
0505     }
0506 
0507     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0508     const quint8 count = *(quint8 *)(udata + detailIndex + 11);
0509     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 7);
0510 
0511     QStringList notes;
0512     notes.reserve(count);
0513 
0514     const char *data = dataFile.constData();
0515     for (int i = 0; i < count; i++) {
0516         notes.append(QString::fromUtf8(data + offset));
0517         offset += qstrlen(data + offset) + 1;
0518     }
0519 
0520     return notes;
0521 }
0522 
0523 QVector<uint> KCharSelectData::seeAlso(uint c)
0524 {
0525     if (!openDataFile()) {
0526         return QVector<uint>();
0527     }
0528     const int detailIndex = getDetailIndex(c);
0529     if (detailIndex == 0) {
0530         return QVector<uint>();
0531     }
0532 
0533     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0534     const quint8 count = *(quint8 *)(udata + detailIndex + 26);
0535     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 22);
0536 
0537     QVector<uint> seeAlso;
0538     seeAlso.reserve(count);
0539 
0540     for (int i = 0; i < count; i++) {
0541         seeAlso.append(mapDataBaseToCodePoint(qFromLittleEndian<quint16>(udata + offset)));
0542         offset += 2;
0543     }
0544 
0545     return seeAlso;
0546 }
0547 
0548 QStringList KCharSelectData::equivalents(uint c)
0549 {
0550     if (!openDataFile()) {
0551         return QStringList();
0552     }
0553     const int detailIndex = getDetailIndex(c);
0554     if (detailIndex == 0) {
0555         return QStringList();
0556     }
0557 
0558     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0559     const quint8 count = *(quint8 *)(udata + detailIndex + 21);
0560     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 17);
0561 
0562     QStringList equivalents;
0563     equivalents.reserve(count);
0564 
0565     const char *data = dataFile.constData();
0566     for (int i = 0; i < count; i++) {
0567         equivalents.append(QString::fromUtf8(data + offset));
0568         offset += qstrlen(data + offset) + 1;
0569     }
0570 
0571     return equivalents;
0572 }
0573 
0574 QStringList KCharSelectData::approximateEquivalents(uint c)
0575 {
0576     if (!openDataFile()) {
0577         return QStringList();
0578     }
0579     const int detailIndex = getDetailIndex(c);
0580     if (detailIndex == 0) {
0581         return QStringList();
0582     }
0583 
0584     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0585     const quint8 count = *(quint8 *)(udata + detailIndex + 16);
0586     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 12);
0587 
0588     QStringList approxEquivalents;
0589     approxEquivalents.reserve(count);
0590 
0591     const char *data = dataFile.constData();
0592     for (int i = 0; i < count; i++) {
0593         approxEquivalents.append(QString::fromUtf8(data + offset));
0594         offset += qstrlen(data + offset) + 1;
0595     }
0596 
0597     return approxEquivalents;
0598 }
0599 
0600 QVector<uint> KCharSelectData::decomposition(uint c)
0601 {
0602     // for now, only decompose Hangul Syllable into Hangul Jamo
0603     uint SIndex = c - SBase;
0604     if (SIndex >= SCount) {
0605         return QVector<uint>();
0606     }
0607 
0608     uint L = LBase + SIndex / NCount; // Choseong
0609     uint V = VBase + (SIndex % NCount) / TCount; // Jungseong
0610     uint T = TBase + SIndex % TCount; // Jongsung
0611     QVector<uint> jamoList;
0612     jamoList.append(L);
0613     jamoList.append(V);
0614     if (T != TBase) {
0615         jamoList.append(T);
0616     }
0617     return jamoList;
0618 }
0619 
0620 QStringList KCharSelectData::unihanInfo(uint c)
0621 {
0622     if (!openDataFile()) {
0623         return QStringList();
0624     }
0625 
0626     quint16 unicode = mapCodePointToDataBase(c);
0627     if (unicode == 0xFFFF) {
0628         return QStringList();
0629     }
0630 
0631     const char *data = dataFile.constData();
0632     const uchar *udata = reinterpret_cast<const uchar *>(data);
0633     const quint32 offsetBegin = qFromLittleEndian<quint32>(udata + 36);
0634     const quint32 offsetEnd = dataFile.size();
0635 
0636     int min = 0;
0637     int mid;
0638     int max = ((offsetEnd - offsetBegin) / 30) - 1;
0639 
0640     while (max >= min) {
0641         mid = (min + max) / 2;
0642         const quint16 midUnicode = qFromLittleEndian<quint16>(udata + offsetBegin + mid * 30);
0643         if (unicode > midUnicode) {
0644             min = mid + 1;
0645         } else if (unicode < midUnicode) {
0646             max = mid - 1;
0647         } else {
0648             QStringList res;
0649             res.reserve(7);
0650             for (int i = 0; i < 7; i++) {
0651                 quint32 offset = qFromLittleEndian<quint32>(udata + offsetBegin + mid * 30 + 2 + i * 4);
0652                 if (offset != 0) {
0653                     res.append(QString::fromUtf8(data + offset));
0654                 } else {
0655                     res.append(QString());
0656                 }
0657             }
0658             return res;
0659         }
0660     }
0661 
0662     return QStringList();
0663 }
0664 
0665 QChar::Category KCharSelectData::category(uint c)
0666 {
0667     if (!openDataFile()) {
0668         return QChar::category(c);
0669     }
0670 
0671     ushort unicode = mapCodePointToDataBase(c);
0672     if (unicode == 0xFFFF) {
0673         return QChar::category(c);
0674     }
0675 
0676     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0677     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4);
0678     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8);
0679 
0680     int min = 0;
0681     int mid;
0682     int max = ((offsetEnd - offsetBegin) / 6) - 1;
0683 
0684     while (max >= min) {
0685         mid = (min + max) / 2;
0686         const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6);
0687         if (unicode > midUnicode) {
0688             min = mid + 1;
0689         } else if (unicode < midUnicode) {
0690             max = mid - 1;
0691         } else {
0692             quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2);
0693             uchar categoryCode = *(data + offset);
0694             Q_ASSERT(categoryCode > 0);
0695             categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1
0696                                See QtBase commit d17c76feee9eece4 */
0697             return QChar::Category(categoryCode);
0698         }
0699     }
0700 
0701     return QChar::category(c);
0702 }
0703 
0704 bool KCharSelectData::isPrint(uint c)
0705 {
0706     QChar::Category cat = category(c);
0707     return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned);
0708 }
0709 
0710 bool KCharSelectData::isDisplayable(uint c)
0711 {
0712     // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames.
0713     // They should be seen as non-printable characters, as trying to display them leads
0714     //  to a crash caused by a Qt "noBlockInString" assertion.
0715     if (c == 0xFDD0 || c == 0xFDD1) {
0716         return false;
0717     }
0718 
0719     return !isIgnorable(c) && isPrint(c);
0720 }
0721 
0722 bool KCharSelectData::isIgnorable(uint c)
0723 {
0724     /*
0725      * According to the Unicode standard, Default Ignorable Code Points
0726      * should be ignored unless explicitly supported. For example, U+202E
0727      * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying
0728      * it gives the undesired effect of all text being turned RTL. We do not
0729      * have a way to "explicitly" support it, so we will treat it as
0730      * non-printable.
0731      *
0732      * There is a list of these on
0733      * http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt under the
0734      * property Default_Ignorable_Code_Point.
0735      */
0736 
0737     // NOTE: not very nice to hardcode these here; is it worth it to modify
0738     //      the binary data file to hold them?
0739     // clang-format off
0740     return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 ||
0741            c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) ||
0742            (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) ||
0743            (c >= 0x2060 && c <= 0x206F) || c == 0x3164 ||
0744            (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 ||
0745            (c >= 0xFFF0 && c <= 0xFFF8);
0746     // clang-format on
0747 }
0748 
0749 bool KCharSelectData::isCombining(uint c)
0750 {
0751     return section(c) == QCoreApplication::translate("KCharSelectData", "Combining Diacritics", "KCharSelect section name");
0752     // FIXME: this is an imperfect test. There are many combining characters
0753     //       that are outside of this section. See Grapheme_Extend in
0754     //       http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
0755 }
0756 
0757 QString KCharSelectData::display(uint c, const QFont &font)
0758 {
0759     if (!isDisplayable(c)) {
0760         return QLatin1String("<b>") + QCoreApplication::translate("KCharSelectData", "Non-printable") + QLatin1String("</b>");
0761     } else {
0762         QString s = QLatin1String("<font size=\"+4\" face=\"") + font.family() + QLatin1String("\">");
0763         if (isCombining(c)) {
0764             s += displayCombining(c);
0765         } else {
0766             s += QLatin1String("&#") + QString::number(c) + QLatin1Char(';');
0767         }
0768         s += QLatin1String("</font>");
0769         return s;
0770     }
0771 }
0772 
0773 QString KCharSelectData::displayCombining(uint c)
0774 {
0775     /*
0776      * The purpose of this is to make it easier to see how a combining
0777      * character affects the text around it.
0778      * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose,
0779      * as seen in pdfs from Unicode, but there seem to be a lot of alignment
0780      * problems with that.
0781      *
0782      * Eventually, it would be nice to determine whether the character
0783      * combines to the left or to the right, etc.
0784      */
0785     QString s = QLatin1String("&nbsp;&#") + QString::number(c) + QLatin1String(";&nbsp;") + QLatin1String(" (ab&#") + QString::number(c) + QLatin1String(";c)");
0786     return s;
0787 }
0788 
0789 QString KCharSelectData::categoryText(QChar::Category category)
0790 {
0791     switch (category) {
0792     case QChar::Other_Control:
0793         return QCoreApplication::translate("KCharSelectData", "Other, Control");
0794     case QChar::Other_Format:
0795         return QCoreApplication::translate("KCharSelectData", "Other, Format");
0796     case QChar::Other_NotAssigned:
0797         return QCoreApplication::translate("KCharSelectData", "Other, Not Assigned");
0798     case QChar::Other_PrivateUse:
0799         return QCoreApplication::translate("KCharSelectData", "Other, Private Use");
0800     case QChar::Other_Surrogate:
0801         return QCoreApplication::translate("KCharSelectData", "Other, Surrogate");
0802     case QChar::Letter_Lowercase:
0803         return QCoreApplication::translate("KCharSelectData", "Letter, Lowercase");
0804     case QChar::Letter_Modifier:
0805         return QCoreApplication::translate("KCharSelectData", "Letter, Modifier");
0806     case QChar::Letter_Other:
0807         return QCoreApplication::translate("KCharSelectData", "Letter, Other");
0808     case QChar::Letter_Titlecase:
0809         return QCoreApplication::translate("KCharSelectData", "Letter, Titlecase");
0810     case QChar::Letter_Uppercase:
0811         return QCoreApplication::translate("KCharSelectData", "Letter, Uppercase");
0812     case QChar::Mark_SpacingCombining:
0813         return QCoreApplication::translate("KCharSelectData", "Mark, Spacing Combining");
0814     case QChar::Mark_Enclosing:
0815         return QCoreApplication::translate("KCharSelectData", "Mark, Enclosing");
0816     case QChar::Mark_NonSpacing:
0817         return QCoreApplication::translate("KCharSelectData", "Mark, Non-Spacing");
0818     case QChar::Number_DecimalDigit:
0819         return QCoreApplication::translate("KCharSelectData", "Number, Decimal Digit");
0820     case QChar::Number_Letter:
0821         return QCoreApplication::translate("KCharSelectData", "Number, Letter");
0822     case QChar::Number_Other:
0823         return QCoreApplication::translate("KCharSelectData", "Number, Other");
0824     case QChar::Punctuation_Connector:
0825         return QCoreApplication::translate("KCharSelectData", "Punctuation, Connector");
0826     case QChar::Punctuation_Dash:
0827         return QCoreApplication::translate("KCharSelectData", "Punctuation, Dash");
0828     case QChar::Punctuation_Close:
0829         return QCoreApplication::translate("KCharSelectData", "Punctuation, Close");
0830     case QChar::Punctuation_FinalQuote:
0831         return QCoreApplication::translate("KCharSelectData", "Punctuation, Final Quote");
0832     case QChar::Punctuation_InitialQuote:
0833         return QCoreApplication::translate("KCharSelectData", "Punctuation, Initial Quote");
0834     case QChar::Punctuation_Other:
0835         return QCoreApplication::translate("KCharSelectData", "Punctuation, Other");
0836     case QChar::Punctuation_Open:
0837         return QCoreApplication::translate("KCharSelectData", "Punctuation, Open");
0838     case QChar::Symbol_Currency:
0839         return QCoreApplication::translate("KCharSelectData", "Symbol, Currency");
0840     case QChar::Symbol_Modifier:
0841         return QCoreApplication::translate("KCharSelectData", "Symbol, Modifier");
0842     case QChar::Symbol_Math:
0843         return QCoreApplication::translate("KCharSelectData", "Symbol, Math");
0844     case QChar::Symbol_Other:
0845         return QCoreApplication::translate("KCharSelectData", "Symbol, Other");
0846     case QChar::Separator_Line:
0847         return QCoreApplication::translate("KCharSelectData", "Separator, Line");
0848     case QChar::Separator_Paragraph:
0849         return QCoreApplication::translate("KCharSelectData", "Separator, Paragraph");
0850     case QChar::Separator_Space:
0851         return QCoreApplication::translate("KCharSelectData", "Separator, Space");
0852     default:
0853         return QCoreApplication::translate("KCharSelectData", "Unknown");
0854     }
0855 }
0856 
0857 QVector<uint> KCharSelectData::find(const QString &needle)
0858 {
0859     QSet<uint> result;
0860 
0861     QVector<uint> returnRes;
0862     QString simplified = needle.length() > 1 ? needle.simplified() : needle;
0863     QStringList searchStrings;
0864 
0865     static const QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$"));
0866     if (octalExp.match(simplified).hasMatch()) {
0867         // search for C octal escaped UTF-8
0868         QByteArray utf8;
0869         int byte = -1;
0870         for (int i = 0; i <= simplified.length(); ++i) {
0871             int c = simplified.at(i).unicode();
0872             if (c >= '0' && c <= '7') {
0873                 byte = 8 * byte + c - '0';
0874             } else if (byte == -1) {
0875                 byte = 0;
0876             } else if (byte >= 0x00 && byte <= 0xFF) {
0877                 utf8.append((char)byte);
0878                 byte = 0;
0879             }
0880         }
0881         simplified = QString::fromUtf8(utf8);
0882     }
0883 
0884     if (simplified.length() <= 2) {
0885         QVector<uint> ucs4 = simplified.toUcs4();
0886         if (ucs4.size() == 1) {
0887             // search for hex representation of the character
0888             searchStrings = QStringList(formatCode(ucs4.at(0)));
0889         } else {
0890             searchStrings = splitString(simplified);
0891         }
0892     } else {
0893         searchStrings = splitString(simplified);
0894     }
0895 
0896     if (searchStrings.isEmpty()) {
0897         return returnRes;
0898     }
0899 
0900     static const QRegularExpression hexExp(QStringLiteral("^(?:|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$"));
0901     for (const QString &s : std::as_const(searchStrings)) {
0902         const QRegularExpressionMatch match = hexExp.match(s);
0903         if (match.hasMatch()) {
0904             const QString cap = match.captured(1);
0905             returnRes.append(cap.toInt(nullptr, 16));
0906             // search for "1234" instead of "0x1234"
0907             if (s.length() == 6 || s.length() == 7) {
0908                 searchStrings[searchStrings.indexOf(s)] = cap;
0909             }
0910         }
0911         // try to parse string as decimal number
0912         bool ok;
0913         int unicode = s.toInt(&ok);
0914         if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) {
0915             returnRes.append(unicode);
0916         }
0917     }
0918 
0919     bool firstSubString = true;
0920     for (const QString &s : std::as_const(searchStrings)) {
0921         QSet<uint> partResult = getMatchingChars(s.toLower());
0922         if (firstSubString) {
0923             result = partResult;
0924             firstSubString = false;
0925         } else {
0926             result = result.intersect(partResult);
0927         }
0928     }
0929 
0930     // remove results found by matching the code point to prevent duplicate results
0931     // while letting these characters stay at the beginning
0932     for (uint c : std::as_const(returnRes)) {
0933         result.remove(c);
0934     }
0935 
0936     QVector<uint> sortedResult;
0937     sortedResult.reserve(result.count());
0938     for (auto c : std::as_const(result)) {
0939         sortedResult.append(c);
0940     }
0941     std::sort(sortedResult.begin(), sortedResult.end());
0942 
0943     returnRes += sortedResult;
0944     return returnRes;
0945 }
0946 
0947 QSet<uint> KCharSelectData::getMatchingChars(const QString &s)
0948 {
0949     if (dataFile.isEmpty()) {
0950         return QSet<uint>();
0951     }
0952     futureIndex.waitForFinished();
0953     const Index index = futureIndex.result();
0954     Index::const_iterator pos = index.lowerBound(s);
0955     QSet<uint> result;
0956 
0957     while (pos != index.constEnd() && pos.key().startsWith(s)) {
0958         for (quint16 c : pos.value()) {
0959             result.insert(mapDataBaseToCodePoint(c));
0960         }
0961         ++pos;
0962     }
0963 
0964     return result;
0965 }
0966 
0967 QStringList KCharSelectData::splitString(const QString &s)
0968 {
0969     QStringList result;
0970     int start = 0;
0971     int end = 0;
0972     int length = s.length();
0973     while (end < length) {
0974         while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
0975             end++;
0976         }
0977         if (start != end) {
0978             result.append(s.mid(start, end - start));
0979         }
0980         start = end;
0981         while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
0982             end++;
0983             start++;
0984         }
0985     }
0986     return result;
0987 }
0988 
0989 void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s)
0990 {
0991     const QStringList strings = splitString(s);
0992     for (const QString &s : strings) {
0993         (*index)[s.toLower()].append(unicode);
0994     }
0995 }
0996 
0997 Index KCharSelectData::createIndex(const QByteArray &dataFile)
0998 {
0999     Index i;
1000 
1001     // character names
1002     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
1003     const char *data = dataFile.constData();
1004     const quint32 nameOffsetBegin = qFromLittleEndian<quint32>(udata + 4);
1005     const quint32 nameOffsetEnd = qFromLittleEndian<quint32>(udata + 8);
1006 
1007     int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1;
1008 
1009     for (int pos = 0; pos <= max; pos++) {
1010         const quint16 unicode = qFromLittleEndian<quint16>(udata + nameOffsetBegin + pos * 6);
1011         quint32 offset = qFromLittleEndian<quint32>(udata + nameOffsetBegin + pos * 6 + 2);
1012         appendToIndex(&i, unicode, QString::fromUtf8(data + offset + 1));
1013     }
1014 
1015     // details
1016     const quint32 detailsOffsetBegin = qFromLittleEndian<quint32>(udata + 12);
1017     const quint32 detailsOffsetEnd = qFromLittleEndian<quint32>(udata + 16);
1018 
1019     max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1;
1020 
1021     for (int pos = 0; pos <= max; pos++) {
1022         const quint16 unicode = qFromLittleEndian<quint16>(udata + detailsOffsetBegin + pos * 27);
1023 
1024         // aliases
1025         const quint8 aliasCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6);
1026         quint32 aliasOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 2);
1027 
1028         for (int j = 0; j < aliasCount; j++) {
1029             appendToIndex(&i, unicode, QString::fromUtf8(data + aliasOffset));
1030             aliasOffset += qstrlen(data + aliasOffset) + 1;
1031         }
1032 
1033         // notes
1034         const quint8 notesCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11);
1035         quint32 notesOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 7);
1036 
1037         for (int j = 0; j < notesCount; j++) {
1038             appendToIndex(&i, unicode, QString::fromUtf8(data + notesOffset));
1039             notesOffset += qstrlen(data + notesOffset) + 1;
1040         }
1041 
1042         // approximate equivalents
1043         const quint8 apprCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16);
1044         quint32 apprOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 12);
1045 
1046         for (int j = 0; j < apprCount; j++) {
1047             appendToIndex(&i, unicode, QString::fromUtf8(data + apprOffset));
1048             apprOffset += qstrlen(data + apprOffset) + 1;
1049         }
1050 
1051         // equivalents
1052         const quint8 equivCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21);
1053         quint32 equivOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 17);
1054 
1055         for (int j = 0; j < equivCount; j++) {
1056             appendToIndex(&i, unicode, QString::fromUtf8(data + equivOffset));
1057             equivOffset += qstrlen(data + equivOffset) + 1;
1058         }
1059 
1060         // see also - convert to string (hex)
1061         const quint8 seeAlsoCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26);
1062         quint32 seeAlsoOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 22);
1063 
1064         for (int j = 0; j < seeAlsoCount; j++) {
1065             quint16 seeAlso = qFromLittleEndian<quint16>(udata + seeAlsoOffset);
1066             appendToIndex(&i, unicode, formatCode(seeAlso, 4, QString()));
1067             equivOffset += qstrlen(data + equivOffset) + 1;
1068         }
1069     }
1070 
1071     // unihan data
1072     // temporary disabled due to the huge amount of data
1073     // const quint32 unihanOffsetBegin = qFromLittleEndian<quint32>(udata+36);
1074     // const quint32 unihanOffsetEnd = dataFile.size();
1075     // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1;
1076     //
1077     // for (int pos = 0; pos <= max; pos++) {
1078     //     const quint16 unicode = qFromLittleEndian<quint16>(udata + unihanOffsetBegin + pos*30);
1079     //     for(int j = 0; j < 7; j++) {
1080     //         quint32 offset = qFromLittleEndian<quint32>(udata + unihanOffsetBegin + pos*30 + 2 + j*4);
1081     //         if(offset != 0) {
1082     //             appendToIndex(&i, unicode, QString::fromUtf8(data + offset));
1083     //         }
1084     //     }
1085     // }
1086 
1087     return i;
1088 }