File indexing completed on 2024-04-28 03:59:01

0001 /*
0002     This file is part of the KDE libraries
0003     SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
0004 
0005     SPDX-License-Identifier: LGPL-2.0-or-later
0006 */
0007 
0008 #include "kcharselectdata_p.h"
0009 
0010 #include <QCoreApplication>
0011 #include <QFile>
0012 #include <QFutureInterface>
0013 #include <QRegularExpression>
0014 #include <QRunnable>
0015 #include <QStringList>
0016 #include <QThreadPool>
0017 #include <qendian.h>
0018 
0019 #include <../test-config.h>
0020 #include <qstandardpaths.h>
0021 #include <string.h>
0022 
/*
 * Constants for Hangul (de)composition, see UAX #15.
 *
 * Typed constants are used instead of macros so that these very short names
 * obey normal C++ scoping rules. All of them are plain ints, exactly as the
 * integer literals they replace.
 */
static constexpr int SBase = 0xAC00; // first precomposed Hangul syllable
static constexpr int LBase = 0x1100; // first leading consonant (choseong)
static constexpr int VBase = 0x1161; // first vowel (jungseong)
static constexpr int TBase = 0x11A7; // one below the first trailing consonant; stands for "no trailing consonant"
static constexpr int LCount = 19; // number of leading consonants
static constexpr int VCount = 21; // number of vowels
static constexpr int TCount = 28; // number of trailing consonants, including "none"
static constexpr int NCount = VCount * TCount; // syllables per leading consonant
static constexpr int SCount = LCount * NCount; // total number of precomposed syllables
0033 
0034 class RunIndexCreation : public QFutureInterface<Index>, public QRunnable
0035 {
0036 public:
0037     RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile)
0038         : m_data(data)
0039         , m_dataFile(dataFile)
0040     {
0041     }
0042 
0043     QFuture<Index> start()
0044     {
0045         setRunnable(this);
0046         reportStarted();
0047         QFuture<Index> f = this->future();
0048         QThreadPool::globalInstance()->start(this);
0049         return f;
0050     }
0051 
0052     void run() override
0053     {
0054         Index index = m_data->createIndex(m_dataFile);
0055         reportResult(index);
0056         reportFinished(nullptr);
0057     }
0058 
0059 private:
0060     KCharSelectData *const m_data;
0061     const QByteArray m_dataFile;
0062 };
0063 
// clang-format off
/* Name fragments of the leading consonants, indexed by the L index of a syllable */
static const char JAMO_L_TABLE[][4] = {
    "G",  "GG", "N",  "D",  "DD",
    "R",  "M",  "B",  "BB", "S",
    "SS", "",   "J",  "JJ", "C",
    "K",  "T",  "P",  "H"
};

/* Name fragments of the vowels, indexed by the V index of a syllable */
static const char JAMO_V_TABLE[][4] = {
    "A",   "AE", "YA",  "YAE", "EO",
    "E",   "YEO", "YE", "O",   "WA",
    "WAE", "OE", "YO",  "U",   "WEO",
    "WE",  "WI", "YU",  "EU",  "YI",
    "I"
};

/* Name fragments of the trailing consonants, indexed by the T index; entry 0 means "none" */
static const char JAMO_T_TABLE[][4] = {
    "",   "G",  "GG", "GS", "N",
    "NJ", "NH", "D",  "L",  "LG",
    "LM", "LB", "LS", "LT", "LP",
    "LH", "M",  "B",  "BS", "S",
    "SS", "NG", "J",  "C",  "K",
    "T",  "P",  "H"
};
// clang-format on
0082 
0083 bool KCharSelectData::openDataFile()
0084 {
0085     if (!dataFile.isEmpty()) {
0086         return true;
0087     } else {
0088         QFile file(QStringLiteral(":/kf6/kcharselect/kcharselect-data"));
0089         file.open(QIODevice::ReadOnly);
0090         dataFile = file.readAll();
0091         file.close();
0092         if (dataFile.size() < 40) {
0093             dataFile.clear();
0094             return false;
0095         }
0096         const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0097         const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0098         const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0099         uint blocks = (offsetEnd - offsetBegin) / 4;
0100         if (blocks <= 167) { // maximum possible number of blocks in BMP
0101             // no remapping
0102             remapType = -1;
0103         } else if (blocks >= 174 && blocks <= 180) {
0104             // remapping introduced in 5.25
0105             remapType = 0;
0106         } else {
0107             // unknown remapping, abort
0108             dataFile.clear();
0109             return false;
0110         }
0111         futureIndex = (new RunIndexCreation(this, dataFile))->start();
0112         return true;
0113     }
0114 }
0115 
0116 // Temporary remapping code points <-> 16 bit database codes
0117 // See kcharselect-generate-datafile.py for details
0118 
0119 quint16 KCharSelectData::mapCodePointToDataBase(uint code) const
0120 {
0121     if (remapType == 0) {
0122         if (code >= 0xE000 && code <= 0xEFFF) {
0123             return 0xFFFF;
0124         }
0125         if (code >= 0xF000 && code <= 0xFFFF) {
0126             return code - 0x1000;
0127         }
0128         if (code >= 0x1F000 && code <= 0x1FFFF) {
0129             return code - 0x10000;
0130         }
0131     }
0132     if (code >= 0x10000) {
0133         return 0xFFFF;
0134     }
0135     return code;
0136 }
0137 
0138 uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const
0139 {
0140     if (remapType == 0) {
0141         if (code >= 0xE000 && code <= 0xEFFF) {
0142             return code + 0x1000;
0143         }
0144         if (code >= 0xF000) {
0145             return code + 0x10000;
0146         }
0147     }
0148     return code;
0149 }
0150 
0151 quint32 KCharSelectData::getDetailIndex(uint c) const
0152 {
0153     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0154     // Convert from little-endian, so that this code works on PPC too.
0155     // http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286
0156     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 12);
0157     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 16);
0158 
0159     int min = 0;
0160     int mid;
0161     int max = ((offsetEnd - offsetBegin) / 27) - 1;
0162 
0163     quint16 unicode = mapCodePointToDataBase(c);
0164     if (unicode == 0xFFFF) {
0165         return 0;
0166     }
0167 
0168     static quint16 most_recent_searched;
0169     static quint32 most_recent_result;
0170 
0171     if (unicode == most_recent_searched) {
0172         return most_recent_result;
0173     }
0174 
0175     most_recent_searched = unicode;
0176 
0177     while (max >= min) {
0178         mid = (min + max) / 2;
0179         const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 27);
0180         if (unicode > midUnicode) {
0181             min = mid + 1;
0182         } else if (unicode < midUnicode) {
0183             max = mid - 1;
0184         } else {
0185             most_recent_result = offsetBegin + mid * 27;
0186 
0187             return most_recent_result;
0188         }
0189     }
0190 
0191     most_recent_result = 0;
0192     return 0;
0193 }
0194 
0195 QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base)
0196 {
0197     QString s = QString::number(code, base).toUpper();
0198     while (s.size() < length) {
0199         s.prepend(QLatin1Char('0'));
0200     }
0201     s.prepend(prefix);
0202     return s;
0203 }
0204 
0205 QList<uint> KCharSelectData::blockContents(int block)
0206 {
0207     if (!openDataFile()) {
0208         return QList<uint>();
0209     }
0210 
0211     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0212     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0213     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0214 
0215     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0216 
0217     QList<uint> res;
0218 
0219     if (block > max) {
0220         return res;
0221     }
0222 
0223     quint16 unicodeBegin = qFromLittleEndian<quint16>(data + offsetBegin + block * 4);
0224     quint16 unicodeEnd = qFromLittleEndian<quint16>(data + offsetBegin + block * 4 + 2);
0225 
0226     while (unicodeBegin < unicodeEnd) {
0227         res.append(mapDataBaseToCodePoint(unicodeBegin));
0228         unicodeBegin++;
0229     }
0230     res.append(mapDataBaseToCodePoint(unicodeBegin)); // Be careful when unicodeEnd==0xffff
0231 
0232     return res;
0233 }
0234 
0235 QList<int> KCharSelectData::sectionContents(int section)
0236 {
0237     section -= 1;
0238     if (!openDataFile()) {
0239         return QList<int>();
0240     }
0241 
0242     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0243     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28);
0244     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32);
0245 
0246     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0247 
0248     QList<int> res;
0249 
0250     if (section > max) {
0251         return res;
0252     }
0253 
0254     for (int i = 0; i <= max; i++) {
0255         const quint16 currSection = qFromLittleEndian<quint16>(data + offsetBegin + i * 4);
0256         if (currSection == section || section < 0) {
0257             res.append(qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2));
0258         }
0259     }
0260 
0261     return res;
0262 }
0263 
0264 QStringList KCharSelectData::sectionList()
0265 {
0266     if (!openDataFile()) {
0267         return QStringList();
0268     }
0269 
0270     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0271     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24);
0272     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28);
0273 
0274     const char *data = dataFile.constData();
0275     QStringList list;
0276     quint32 i = stringBegin;
0277     list.append(QCoreApplication::translate("KCharSelectData", "All", "KCharSelect section name"));
0278     while (i < stringEnd) {
0279         list.append(QCoreApplication::translate("KCharSelectData", data + i, "KCharSelect section name"));
0280         i += qstrlen(data + i) + 1;
0281     }
0282 
0283     return list;
0284 }
0285 
0286 QString KCharSelectData::block(uint c)
0287 {
0288     return blockName(blockIndex(c));
0289 }
0290 
0291 QString KCharSelectData::section(uint c)
0292 {
0293     return sectionName(sectionIndex(blockIndex(c)));
0294 }
0295 
0296 QString KCharSelectData::name(uint c)
0297 {
0298     if (!openDataFile()) {
0299         return QString();
0300     }
0301 
0302     if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) {
0303         return QCoreApplication::translate("KCharSelectData", "<noncharacter>");
0304     } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) {
0305         return QLatin1String("CJK UNIFIED IDEOGRAPH-") + formatCode(c, 4, QString());
0306     } else if (c >= 0xAC00 && c <= 0xD7AF) {
0307         /* compute hangul syllable name as per UAX #15 */
0308         int SIndex = c - SBase;
0309         int LIndex;
0310         int VIndex;
0311         int TIndex;
0312 
0313         if (SIndex < 0 || SIndex >= SCount) {
0314             return QString();
0315         }
0316 
0317         LIndex = SIndex / NCount;
0318         VIndex = (SIndex % NCount) / TCount;
0319         TIndex = SIndex % TCount;
0320 
0321         return QLatin1String("HANGUL SYLLABLE ") + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex])
0322             + QLatin1String(JAMO_T_TABLE[TIndex]);
0323     } else if (c >= 0xD800 && c <= 0xDB7F) {
0324         return QCoreApplication::translate("KCharSelectData", "<Non Private Use High Surrogate>");
0325     } else if (c >= 0xDB80 && c <= 0xDBFF) {
0326         return QCoreApplication::translate("KCharSelectData", "<Private Use High Surrogate>");
0327     } else if (c >= 0xDC00 && c <= 0xDFFF) {
0328         return QCoreApplication::translate("KCharSelectData", "<Low Surrogate>");
0329     } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) {
0330         return QCoreApplication::translate("KCharSelectData", "<Private Use>");
0331     } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) {
0332         return QLatin1String("CJK COMPATIBILITY IDEOGRAPH-") + formatCode(c, 4, QString());
0333     }
0334     quint16 unicode = mapCodePointToDataBase(c);
0335     if (unicode == 0xFFFF) {
0336         return QLatin1String("NON-BMP-CHARACTER-") + formatCode(c, 4, QString());
0337     } else {
0338         const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0339         const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4);
0340         const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8);
0341 
0342         int min = 0;
0343         int mid;
0344         int max = ((offsetEnd - offsetBegin) / 6) - 1;
0345         QString s;
0346 
0347         while (max >= min) {
0348             mid = (min + max) / 2;
0349             const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6);
0350             if (unicode > midUnicode) {
0351                 min = mid + 1;
0352             } else if (unicode < midUnicode) {
0353                 max = mid - 1;
0354             } else {
0355                 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2);
0356                 s = QString::fromUtf8(dataFile.constData() + offset + 1);
0357                 break;
0358             }
0359         }
0360 
0361         if (s.isNull()) {
0362             return QCoreApplication::translate("KCharSelectData", "<not assigned>");
0363         } else {
0364             return s;
0365         }
0366     }
0367 }
0368 
0369 int KCharSelectData::blockIndex(uint c)
0370 {
0371     if (!openDataFile()) {
0372         return 0;
0373     }
0374 
0375     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0376     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20);
0377     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24);
0378     const quint16 unicode = mapCodePointToDataBase(c);
0379     if (unicode == 0xFFFF) {
0380         return 0;
0381     }
0382 
0383     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0384 
0385     int i = 0;
0386 
0387     while (unicode > qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) && i < max) {
0388         i++;
0389     }
0390 
0391     return i;
0392 }
0393 
0394 int KCharSelectData::sectionIndex(int block)
0395 {
0396     if (!openDataFile()) {
0397         return 0;
0398     }
0399 
0400     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0401     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28);
0402     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32);
0403 
0404     int max = ((offsetEnd - offsetBegin) / 4) - 1;
0405 
0406     for (int i = 0; i <= max; i++) {
0407         if (qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) == block) {
0408             return qFromLittleEndian<quint16>(data + offsetBegin + i * 4) + 1;
0409         }
0410     }
0411 
0412     return 0;
0413 }
0414 
0415 QString KCharSelectData::blockName(int index)
0416 {
0417     if (!openDataFile()) {
0418         return QString();
0419     }
0420 
0421     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0422     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 16);
0423     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 20);
0424 
0425     quint32 i = stringBegin;
0426     int currIndex = 0;
0427 
0428     const char *data = dataFile.constData();
0429     while (i < stringEnd && currIndex < index) {
0430         i += qstrlen(data + i) + 1;
0431         currIndex++;
0432     }
0433 
0434     return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode block name");
0435 }
0436 
0437 QString KCharSelectData::sectionName(int index)
0438 {
0439     if (index == 0) {
0440         return QCoreApplication::translate("KCharSelectData", "All", "KCharselect unicode section name");
0441     }
0442     if (!openDataFile()) {
0443         return QString();
0444     }
0445 
0446     index -= 1;
0447 
0448     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0449     const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24);
0450     const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28);
0451 
0452     quint32 i = stringBegin;
0453     int currIndex = 0;
0454 
0455     const char *data = dataFile.constData();
0456     while (i < stringEnd && currIndex < index) {
0457         i += qstrlen(data + i) + 1;
0458         currIndex++;
0459     }
0460 
0461     return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode section name");
0462 }
0463 
0464 QStringList KCharSelectData::aliases(uint c)
0465 {
0466     if (!openDataFile()) {
0467         return QStringList();
0468     }
0469     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0470     const int detailIndex = getDetailIndex(c);
0471     if (detailIndex == 0) {
0472         return QStringList();
0473     }
0474 
0475     const quint8 count = *(quint8 *)(udata + detailIndex + 6);
0476     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 2);
0477 
0478     QStringList aliases;
0479     aliases.reserve(count);
0480 
0481     const char *data = dataFile.constData();
0482     for (int i = 0; i < count; i++) {
0483         aliases.append(QString::fromUtf8(data + offset));
0484         offset += qstrlen(data + offset) + 1;
0485     }
0486     return aliases;
0487 }
0488 
0489 QStringList KCharSelectData::notes(uint c)
0490 {
0491     if (!openDataFile()) {
0492         return QStringList();
0493     }
0494     const int detailIndex = getDetailIndex(c);
0495     if (detailIndex == 0) {
0496         return QStringList();
0497     }
0498 
0499     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0500     const quint8 count = *(quint8 *)(udata + detailIndex + 11);
0501     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 7);
0502 
0503     QStringList notes;
0504     notes.reserve(count);
0505 
0506     const char *data = dataFile.constData();
0507     for (int i = 0; i < count; i++) {
0508         notes.append(QString::fromUtf8(data + offset));
0509         offset += qstrlen(data + offset) + 1;
0510     }
0511 
0512     return notes;
0513 }
0514 
0515 QList<uint> KCharSelectData::seeAlso(uint c)
0516 {
0517     if (!openDataFile()) {
0518         return QList<uint>();
0519     }
0520     const int detailIndex = getDetailIndex(c);
0521     if (detailIndex == 0) {
0522         return QList<uint>();
0523     }
0524 
0525     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0526     const quint8 count = *(quint8 *)(udata + detailIndex + 26);
0527     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 22);
0528 
0529     QList<uint> seeAlso;
0530     seeAlso.reserve(count);
0531 
0532     for (int i = 0; i < count; i++) {
0533         seeAlso.append(mapDataBaseToCodePoint(qFromLittleEndian<quint16>(udata + offset)));
0534         offset += 2;
0535     }
0536 
0537     return seeAlso;
0538 }
0539 
0540 QStringList KCharSelectData::equivalents(uint c)
0541 {
0542     if (!openDataFile()) {
0543         return QStringList();
0544     }
0545     const int detailIndex = getDetailIndex(c);
0546     if (detailIndex == 0) {
0547         return QStringList();
0548     }
0549 
0550     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0551     const quint8 count = *(quint8 *)(udata + detailIndex + 21);
0552     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 17);
0553 
0554     QStringList equivalents;
0555     equivalents.reserve(count);
0556 
0557     const char *data = dataFile.constData();
0558     for (int i = 0; i < count; i++) {
0559         equivalents.append(QString::fromUtf8(data + offset));
0560         offset += qstrlen(data + offset) + 1;
0561     }
0562 
0563     return equivalents;
0564 }
0565 
0566 QStringList KCharSelectData::approximateEquivalents(uint c)
0567 {
0568     if (!openDataFile()) {
0569         return QStringList();
0570     }
0571     const int detailIndex = getDetailIndex(c);
0572     if (detailIndex == 0) {
0573         return QStringList();
0574     }
0575 
0576     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0577     const quint8 count = *(quint8 *)(udata + detailIndex + 16);
0578     quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 12);
0579 
0580     QStringList approxEquivalents;
0581     approxEquivalents.reserve(count);
0582 
0583     const char *data = dataFile.constData();
0584     for (int i = 0; i < count; i++) {
0585         approxEquivalents.append(QString::fromUtf8(data + offset));
0586         offset += qstrlen(data + offset) + 1;
0587     }
0588 
0589     return approxEquivalents;
0590 }
0591 
0592 QList<uint> KCharSelectData::decomposition(uint c)
0593 {
0594     // for now, only decompose Hangul Syllable into Hangul Jamo
0595     uint SIndex = c - SBase;
0596     if (SIndex >= SCount) {
0597         return QList<uint>();
0598     }
0599 
0600     uint L = LBase + SIndex / NCount; // Choseong
0601     uint V = VBase + (SIndex % NCount) / TCount; // Jungseong
0602     uint T = TBase + SIndex % TCount; // Jongsung
0603     QList<uint> jamoList;
0604     jamoList.append(L);
0605     jamoList.append(V);
0606     if (T != TBase) {
0607         jamoList.append(T);
0608     }
0609     return jamoList;
0610 }
0611 
0612 QStringList KCharSelectData::unihanInfo(uint c)
0613 {
0614     if (!openDataFile()) {
0615         return QStringList();
0616     }
0617 
0618     quint16 unicode = mapCodePointToDataBase(c);
0619     if (unicode == 0xFFFF) {
0620         return QStringList();
0621     }
0622 
0623     const char *data = dataFile.constData();
0624     const uchar *udata = reinterpret_cast<const uchar *>(data);
0625     const quint32 offsetBegin = qFromLittleEndian<quint32>(udata + 36);
0626     const quint32 offsetEnd = dataFile.size();
0627 
0628     int min = 0;
0629     int mid;
0630     int max = ((offsetEnd - offsetBegin) / 30) - 1;
0631 
0632     while (max >= min) {
0633         mid = (min + max) / 2;
0634         const quint16 midUnicode = qFromLittleEndian<quint16>(udata + offsetBegin + mid * 30);
0635         if (unicode > midUnicode) {
0636             min = mid + 1;
0637         } else if (unicode < midUnicode) {
0638             max = mid - 1;
0639         } else {
0640             QStringList res;
0641             res.reserve(7);
0642             for (int i = 0; i < 7; i++) {
0643                 quint32 offset = qFromLittleEndian<quint32>(udata + offsetBegin + mid * 30 + 2 + i * 4);
0644                 if (offset != 0) {
0645                     res.append(QString::fromUtf8(data + offset));
0646                 } else {
0647                     res.append(QString());
0648                 }
0649             }
0650             return res;
0651         }
0652     }
0653 
0654     return QStringList();
0655 }
0656 
0657 QChar::Category KCharSelectData::category(uint c)
0658 {
0659     if (!openDataFile()) {
0660         return QChar::category(c);
0661     }
0662 
0663     ushort unicode = mapCodePointToDataBase(c);
0664     if (unicode == 0xFFFF) {
0665         return QChar::category(c);
0666     }
0667 
0668     const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData());
0669     const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4);
0670     const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8);
0671 
0672     int min = 0;
0673     int mid;
0674     int max = ((offsetEnd - offsetBegin) / 6) - 1;
0675 
0676     while (max >= min) {
0677         mid = (min + max) / 2;
0678         const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6);
0679         if (unicode > midUnicode) {
0680             min = mid + 1;
0681         } else if (unicode < midUnicode) {
0682             max = mid - 1;
0683         } else {
0684             quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2);
0685             uchar categoryCode = *(data + offset);
0686             Q_ASSERT(categoryCode > 0);
0687             categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1
0688                                See QtBase commit d17c76feee9eece4 */
0689             return QChar::Category(categoryCode);
0690         }
0691     }
0692 
0693     return QChar::category(c);
0694 }
0695 
0696 bool KCharSelectData::isPrint(uint c)
0697 {
0698     QChar::Category cat = category(c);
0699     return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned);
0700 }
0701 
0702 bool KCharSelectData::isDisplayable(uint c)
0703 {
0704     // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames.
0705     // They should be seen as non-printable characters, as trying to display them leads
0706     //  to a crash caused by a Qt "noBlockInString" assertion.
0707     if (c == 0xFDD0 || c == 0xFDD1) {
0708         return false;
0709     }
0710 
0711     return !isIgnorable(c) && isPrint(c);
0712 }
0713 
0714 bool KCharSelectData::isIgnorable(uint c)
0715 {
0716     /*
0717      * According to the Unicode standard, Default Ignorable Code Points
0718      * should be ignored unless explicitly supported. For example, U+202E
0719      * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying
0720      * it gives the undesired effect of all text being turned RTL. We do not
0721      * have a way to "explicitly" support it, so we will treat it as
0722      * non-printable.
0723      *
0724      * There is a list of these on
0725      * http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt under the
0726      * property Default_Ignorable_Code_Point.
0727      */
0728 
0729     // NOTE: not very nice to hardcode these here; is it worth it to modify
0730     //      the binary data file to hold them?
0731     // clang-format off
0732     return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 ||
0733            c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) ||
0734            (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) ||
0735            (c >= 0x2060 && c <= 0x206F) || c == 0x3164 ||
0736            (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 ||
0737            (c >= 0xFFF0 && c <= 0xFFF8);
0738     // clang-format on
0739 }
0740 
0741 bool KCharSelectData::isCombining(uint c)
0742 {
0743     return section(c) == QCoreApplication::translate("KCharSelectData", "Combining Diacritics", "KCharSelect section name");
0744     // FIXME: this is an imperfect test. There are many combining characters
0745     //       that are outside of this section. See Grapheme_Extend in
0746     //       http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
0747 }
0748 
0749 QString KCharSelectData::display(uint c, const QFont &font)
0750 {
0751     if (!isDisplayable(c)) {
0752         return QLatin1String("<b>") + QCoreApplication::translate("KCharSelectData", "Non-printable") + QLatin1String("</b>");
0753     } else {
0754         QString s = QLatin1String("<font size=\"+4\" face=\"") + font.family() + QLatin1String("\">");
0755         if (isCombining(c)) {
0756             s += displayCombining(c);
0757         } else {
0758             s += QLatin1String("&#") + QString::number(c) + QLatin1Char(';');
0759         }
0760         s += QLatin1String("</font>");
0761         return s;
0762     }
0763 }
0764 
0765 QString KCharSelectData::displayCombining(uint c)
0766 {
0767     /*
0768      * The purpose of this is to make it easier to see how a combining
0769      * character affects the text around it.
0770      * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose,
0771      * as seen in pdfs from Unicode, but there seem to be a lot of alignment
0772      * problems with that.
0773      *
0774      * Eventually, it would be nice to determine whether the character
0775      * combines to the left or to the right, etc.
0776      */
0777     QString s = QLatin1String("&nbsp;&#") + QString::number(c) + QLatin1String(";&nbsp;") + QLatin1String(" (ab&#") + QString::number(c) + QLatin1String(";c)");
0778     return s;
0779 }
0780 
0781 QString KCharSelectData::categoryText(QChar::Category category)
0782 {
0783     switch (category) {
0784     case QChar::Other_Control:
0785         return QCoreApplication::translate("KCharSelectData", "Other, Control");
0786     case QChar::Other_Format:
0787         return QCoreApplication::translate("KCharSelectData", "Other, Format");
0788     case QChar::Other_NotAssigned:
0789         return QCoreApplication::translate("KCharSelectData", "Other, Not Assigned");
0790     case QChar::Other_PrivateUse:
0791         return QCoreApplication::translate("KCharSelectData", "Other, Private Use");
0792     case QChar::Other_Surrogate:
0793         return QCoreApplication::translate("KCharSelectData", "Other, Surrogate");
0794     case QChar::Letter_Lowercase:
0795         return QCoreApplication::translate("KCharSelectData", "Letter, Lowercase");
0796     case QChar::Letter_Modifier:
0797         return QCoreApplication::translate("KCharSelectData", "Letter, Modifier");
0798     case QChar::Letter_Other:
0799         return QCoreApplication::translate("KCharSelectData", "Letter, Other");
0800     case QChar::Letter_Titlecase:
0801         return QCoreApplication::translate("KCharSelectData", "Letter, Titlecase");
0802     case QChar::Letter_Uppercase:
0803         return QCoreApplication::translate("KCharSelectData", "Letter, Uppercase");
0804     case QChar::Mark_SpacingCombining:
0805         return QCoreApplication::translate("KCharSelectData", "Mark, Spacing Combining");
0806     case QChar::Mark_Enclosing:
0807         return QCoreApplication::translate("KCharSelectData", "Mark, Enclosing");
0808     case QChar::Mark_NonSpacing:
0809         return QCoreApplication::translate("KCharSelectData", "Mark, Non-Spacing");
0810     case QChar::Number_DecimalDigit:
0811         return QCoreApplication::translate("KCharSelectData", "Number, Decimal Digit");
0812     case QChar::Number_Letter:
0813         return QCoreApplication::translate("KCharSelectData", "Number, Letter");
0814     case QChar::Number_Other:
0815         return QCoreApplication::translate("KCharSelectData", "Number, Other");
0816     case QChar::Punctuation_Connector:
0817         return QCoreApplication::translate("KCharSelectData", "Punctuation, Connector");
0818     case QChar::Punctuation_Dash:
0819         return QCoreApplication::translate("KCharSelectData", "Punctuation, Dash");
0820     case QChar::Punctuation_Close:
0821         return QCoreApplication::translate("KCharSelectData", "Punctuation, Close");
0822     case QChar::Punctuation_FinalQuote:
0823         return QCoreApplication::translate("KCharSelectData", "Punctuation, Final Quote");
0824     case QChar::Punctuation_InitialQuote:
0825         return QCoreApplication::translate("KCharSelectData", "Punctuation, Initial Quote");
0826     case QChar::Punctuation_Other:
0827         return QCoreApplication::translate("KCharSelectData", "Punctuation, Other");
0828     case QChar::Punctuation_Open:
0829         return QCoreApplication::translate("KCharSelectData", "Punctuation, Open");
0830     case QChar::Symbol_Currency:
0831         return QCoreApplication::translate("KCharSelectData", "Symbol, Currency");
0832     case QChar::Symbol_Modifier:
0833         return QCoreApplication::translate("KCharSelectData", "Symbol, Modifier");
0834     case QChar::Symbol_Math:
0835         return QCoreApplication::translate("KCharSelectData", "Symbol, Math");
0836     case QChar::Symbol_Other:
0837         return QCoreApplication::translate("KCharSelectData", "Symbol, Other");
0838     case QChar::Separator_Line:
0839         return QCoreApplication::translate("KCharSelectData", "Separator, Line");
0840     case QChar::Separator_Paragraph:
0841         return QCoreApplication::translate("KCharSelectData", "Separator, Paragraph");
0842     case QChar::Separator_Space:
0843         return QCoreApplication::translate("KCharSelectData", "Separator, Space");
0844     default:
0845         return QCoreApplication::translate("KCharSelectData", "Unknown");
0846     }
0847 }
0848 
0849 QList<uint> KCharSelectData::find(const QString &needle)
0850 {
0851     QSet<uint> result;
0852 
0853     QList<uint> returnRes;
0854     QString simplified = needle.length() > 1 ? needle.simplified() : needle;
0855     QStringList searchStrings;
0856 
0857     static const QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$"));
0858     if (octalExp.match(simplified).hasMatch()) {
0859         // search for C octal escaped UTF-8
0860         QByteArray utf8;
0861         int byte = -1;
0862         for (int i = 0; i <= simplified.length(); ++i) {
0863             int c = simplified.at(i).unicode();
0864             if (c >= '0' && c <= '7') {
0865                 byte = 8 * byte + c - '0';
0866             } else if (byte == -1) {
0867                 byte = 0;
0868             } else if (byte >= 0x00 && byte <= 0xFF) {
0869                 utf8.append((char)byte);
0870                 byte = 0;
0871             }
0872         }
0873         simplified = QString::fromUtf8(utf8);
0874     }
0875 
0876     if (simplified.length() <= 2) {
0877         QList<uint> ucs4 = simplified.toUcs4();
0878         if (ucs4.size() == 1) {
0879             // search for hex representation of the character
0880             searchStrings = QStringList(formatCode(ucs4.at(0)));
0881         } else {
0882             searchStrings = splitString(simplified);
0883         }
0884     } else {
0885         searchStrings = splitString(simplified);
0886     }
0887 
0888     if (searchStrings.isEmpty()) {
0889         return returnRes;
0890     }
0891 
0892     static const QRegularExpression hexExp(QStringLiteral("^(?:|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$"));
0893     for (const QString &s : std::as_const(searchStrings)) {
0894         const QRegularExpressionMatch match = hexExp.match(s);
0895         if (match.hasMatch()) {
0896             const QString cap = match.captured(1);
0897             returnRes.append(cap.toInt(nullptr, 16));
0898             // search for "1234" instead of "0x1234"
0899             if (s.length() == 6 || s.length() == 7) {
0900                 searchStrings[searchStrings.indexOf(s)] = cap;
0901             }
0902         }
0903         // try to parse string as decimal number
0904         bool ok;
0905         int unicode = s.toInt(&ok);
0906         if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) {
0907             returnRes.append(unicode);
0908         }
0909     }
0910 
0911     bool firstSubString = true;
0912     for (const QString &s : std::as_const(searchStrings)) {
0913         QSet<uint> partResult = getMatchingChars(s.toLower());
0914         if (firstSubString) {
0915             result = partResult;
0916             firstSubString = false;
0917         } else {
0918             result = result.intersect(partResult);
0919         }
0920     }
0921 
0922     // remove results found by matching the code point to prevent duplicate results
0923     // while letting these characters stay at the beginning
0924     for (uint c : std::as_const(returnRes)) {
0925         result.remove(c);
0926     }
0927 
0928     QList<uint> sortedResult;
0929     sortedResult.reserve(result.count());
0930     for (auto c : std::as_const(result)) {
0931         sortedResult.append(c);
0932     }
0933     std::sort(sortedResult.begin(), sortedResult.end());
0934 
0935     returnRes += sortedResult;
0936     return returnRes;
0937 }
0938 
0939 QSet<uint> KCharSelectData::getMatchingChars(const QString &s)
0940 {
0941     if (dataFile.isEmpty()) {
0942         return QSet<uint>();
0943     }
0944     futureIndex.waitForFinished();
0945     const Index index = futureIndex.result();
0946     Index::const_iterator pos = index.lowerBound(s);
0947     QSet<uint> result;
0948 
0949     while (pos != index.constEnd() && pos.key().startsWith(s)) {
0950         for (quint16 c : pos.value()) {
0951             result.insert(mapDataBaseToCodePoint(c));
0952         }
0953         ++pos;
0954     }
0955 
0956     return result;
0957 }
0958 
0959 QStringList KCharSelectData::splitString(const QString &s)
0960 {
0961     QStringList result;
0962     int start = 0;
0963     int end = 0;
0964     int length = s.length();
0965     while (end < length) {
0966         while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
0967             end++;
0968         }
0969         if (start != end) {
0970             result.append(s.mid(start, end - start));
0971         }
0972         start = end;
0973         while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) {
0974             end++;
0975             start++;
0976         }
0977     }
0978     return result;
0979 }
0980 
0981 void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s)
0982 {
0983     const QStringList strings = splitString(s);
0984     for (const QString &s : strings) {
0985         (*index)[s.toLower()].append(unicode);
0986     }
0987 }
0988 
0989 Index KCharSelectData::createIndex(const QByteArray &dataFile)
0990 {
0991     Index i;
0992 
0993     // character names
0994     const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData());
0995     const char *data = dataFile.constData();
0996     const quint32 nameOffsetBegin = qFromLittleEndian<quint32>(udata + 4);
0997     const quint32 nameOffsetEnd = qFromLittleEndian<quint32>(udata + 8);
0998 
0999     int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1;
1000 
1001     for (int pos = 0; pos <= max; pos++) {
1002         const quint16 unicode = qFromLittleEndian<quint16>(udata + nameOffsetBegin + pos * 6);
1003         quint32 offset = qFromLittleEndian<quint32>(udata + nameOffsetBegin + pos * 6 + 2);
1004         appendToIndex(&i, unicode, QString::fromUtf8(data + offset + 1));
1005     }
1006 
1007     // details
1008     const quint32 detailsOffsetBegin = qFromLittleEndian<quint32>(udata + 12);
1009     const quint32 detailsOffsetEnd = qFromLittleEndian<quint32>(udata + 16);
1010 
1011     max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1;
1012 
1013     for (int pos = 0; pos <= max; pos++) {
1014         const quint16 unicode = qFromLittleEndian<quint16>(udata + detailsOffsetBegin + pos * 27);
1015 
1016         // aliases
1017         const quint8 aliasCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6);
1018         quint32 aliasOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 2);
1019 
1020         for (int j = 0; j < aliasCount; j++) {
1021             appendToIndex(&i, unicode, QString::fromUtf8(data + aliasOffset));
1022             aliasOffset += qstrlen(data + aliasOffset) + 1;
1023         }
1024 
1025         // notes
1026         const quint8 notesCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11);
1027         quint32 notesOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 7);
1028 
1029         for (int j = 0; j < notesCount; j++) {
1030             appendToIndex(&i, unicode, QString::fromUtf8(data + notesOffset));
1031             notesOffset += qstrlen(data + notesOffset) + 1;
1032         }
1033 
1034         // approximate equivalents
1035         const quint8 apprCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16);
1036         quint32 apprOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 12);
1037 
1038         for (int j = 0; j < apprCount; j++) {
1039             appendToIndex(&i, unicode, QString::fromUtf8(data + apprOffset));
1040             apprOffset += qstrlen(data + apprOffset) + 1;
1041         }
1042 
1043         // equivalents
1044         const quint8 equivCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21);
1045         quint32 equivOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 17);
1046 
1047         for (int j = 0; j < equivCount; j++) {
1048             appendToIndex(&i, unicode, QString::fromUtf8(data + equivOffset));
1049             equivOffset += qstrlen(data + equivOffset) + 1;
1050         }
1051 
1052         // see also - convert to string (hex)
1053         const quint8 seeAlsoCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26);
1054         quint32 seeAlsoOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 22);
1055 
1056         for (int j = 0; j < seeAlsoCount; j++) {
1057             quint16 seeAlso = qFromLittleEndian<quint16>(udata + seeAlsoOffset);
1058             appendToIndex(&i, unicode, formatCode(seeAlso, 4, QString()));
1059             equivOffset += qstrlen(data + equivOffset) + 1;
1060         }
1061     }
1062 
1063     // unihan data
1064     // temporary disabled due to the huge amount of data
1065     // const quint32 unihanOffsetBegin = qFromLittleEndian<quint32>(udata+36);
1066     // const quint32 unihanOffsetEnd = dataFile.size();
1067     // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1;
1068     //
1069     // for (int pos = 0; pos <= max; pos++) {
1070     //     const quint16 unicode = qFromLittleEndian<quint16>(udata + unihanOffsetBegin + pos*30);
1071     //     for(int j = 0; j < 7; j++) {
1072     //         quint32 offset = qFromLittleEndian<quint32>(udata + unihanOffsetBegin + pos*30 + 2 + j*4);
1073     //         if(offset != 0) {
1074     //             appendToIndex(&i, unicode, QString::fromUtf8(data + offset));
1075     //         }
1076     //     }
1077     // }
1078 
1079     return i;
1080 }