Warning, file /frameworks/kwidgetsaddons/src/kcharselectdata.cpp was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 /* 0002 This file is part of the KDE libraries 0003 SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #include "kcharselectdata_p.h" 0009 0010 #include <QCoreApplication> 0011 #include <QFile> 0012 #include <QFutureInterface> 0013 #include <QRegularExpression> 0014 #include <QRunnable> 0015 #include <QStringList> 0016 #include <QThreadPool> 0017 #include <qendian.h> 0018 0019 #include <../test-config.h> 0020 #include <qstandardpaths.h> 0021 #include <string.h> 0022 0023 /* constants for hangul (de)composition, see UAX #15 */ 0024 #define SBase 0xAC00 0025 #define LBase 0x1100 0026 #define VBase 0x1161 0027 #define TBase 0x11A7 0028 #define LCount 19 0029 #define VCount 21 0030 #define TCount 28 0031 #define NCount (VCount * TCount) 0032 #define SCount (LCount * NCount) 0033 0034 class RunIndexCreation : public QFutureInterface<Index>, public QRunnable 0035 { 0036 public: 0037 RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile) 0038 : m_data(data) 0039 , m_dataFile(dataFile) 0040 { 0041 } 0042 0043 QFuture<Index> start() 0044 { 0045 setRunnable(this); 0046 reportStarted(); 0047 QFuture<Index> f = this->future(); 0048 QThreadPool::globalInstance()->start(this); 0049 return f; 0050 } 0051 0052 void run() override 0053 { 0054 Index index = m_data->createIndex(m_dataFile); 0055 reportResult(index); 0056 reportFinished(nullptr); 0057 } 0058 0059 private: 0060 KCharSelectData *const m_data; 0061 const QByteArray m_dataFile; 0062 }; 0063 0064 // clang-format off 0065 static const char JAMO_L_TABLE[][4] = { 0066 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", 0067 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H" 0068 }; 0069 0070 static const char JAMO_V_TABLE[][4] = { 0071 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", 0072 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", 0073 "YU", "EU", "YI", "I" 0074 }; 0075 0076 static const char JAMO_T_TABLE[][4] = { 0077 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", 0078 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", 0079 "S", "SS", "NG", "J", "C", "K", "T", "P", "H" 0080 }; 0081 // clang-format on 0082 0083 bool KCharSelectData::openDataFile() 0084 { 0085 if (!dataFile.isEmpty()) { 0086 return true; 0087 } else { 0088 const QString kcharselectDataPath = QStringLiteral("kf" QT_STRINGIFY(QT_VERSION_MAJOR) "/kcharselect/kcharselect-data"); 0089 QString fileName = QStandardPaths::locate(QStandardPaths::GenericDataLocation, kcharselectDataPath); 0090 if (fileName.isEmpty()) { 0091 fileName = QStringLiteral(TOP_SRCDIR "/src/kcharselect-data"); // for autotests before installation 0092 } 0093 QFile file(fileName); 0094 if (!file.open(QIODevice::ReadOnly)) { 0095 qWarning() << "Couldn't find " << kcharselectDataPath << " in the install prefix (under GenericDataLocation) nor in the builtin path" << TOP_SRCDIR; 0096 return false; 0097 } 0098 dataFile = file.readAll(); 0099 file.close(); 0100 if (dataFile.size() < 40) { 0101 dataFile.clear(); 0102 return false; 0103 } 0104 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0105 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20); 0106 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24); 0107 uint blocks = (offsetEnd - offsetBegin) / 4; 0108 if (blocks <= 167) { // maximum possible number of blocks in BMP 0109 // no remapping 0110 remapType = -1; 0111 } else if (blocks >= 174 && blocks <= 180) { 0112 // remapping introduced in 5.25 0113 remapType = 0; 0114 } else { 0115 // unknown remapping, abort 0116 dataFile.clear(); 0117 return false; 0118 } 0119 futureIndex = (new RunIndexCreation(this, dataFile))->start(); 0120 return true; 0121 } 0122 } 0123 0124 // Temporary remapping code points <-> 16 bit database codes 0125 // See kcharselect-generate-datafile.py for details 0126 0127 quint16 KCharSelectData::mapCodePointToDataBase(uint code) const 0128 { 0129 if (remapType == 0) { 0130 if (code >= 0xE000 && code <= 0xEFFF) { 0131 return 0xFFFF; 0132 } 0133 if (code >= 0xF000 && code <= 0xFFFF) { 0134 return code - 0x1000; 0135 } 0136 if (code >= 0x1F000 && code <= 0x1FFFF) { 0137 return code - 0x10000; 0138 } 0139 } 0140 if (code >= 0x10000) { 0141 return 0xFFFF; 0142 } 0143 return code; 0144 } 0145 0146 uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const 0147 { 0148 if (remapType == 0) { 0149 if (code >= 0xE000 && code <= 0xEFFF) { 0150 return code + 0x1000; 0151 } 0152 if (code >= 0xF000) { 0153 return code + 0x10000; 0154 } 0155 } 0156 return code; 0157 } 0158 0159 quint32 KCharSelectData::getDetailIndex(uint c) const 0160 { 0161 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0162 // Convert from little-endian, so that this code works on PPC too. 0163 // http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286 0164 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 12); 0165 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 16); 0166 0167 int min = 0; 0168 int mid; 0169 int max = ((offsetEnd - offsetBegin) / 27) - 1; 0170 0171 quint16 unicode = mapCodePointToDataBase(c); 0172 if (unicode == 0xFFFF) { 0173 return 0; 0174 } 0175 0176 static quint16 most_recent_searched; 0177 static quint32 most_recent_result; 0178 0179 if (unicode == most_recent_searched) { 0180 return most_recent_result; 0181 } 0182 0183 most_recent_searched = unicode; 0184 0185 while (max >= min) { 0186 mid = (min + max) / 2; 0187 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 27); 0188 if (unicode > midUnicode) { 0189 min = mid + 1; 0190 } else if (unicode < midUnicode) { 0191 max = mid - 1; 0192 } else { 0193 most_recent_result = offsetBegin + mid * 27; 0194 0195 return most_recent_result; 0196 } 0197 } 0198 0199 most_recent_result = 0; 0200 return 0; 0201 } 0202 0203 QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base) 0204 { 0205 QString s = QString::number(code, base).toUpper(); 0206 while (s.size() < length) { 0207 s.prepend(QLatin1Char('0')); 0208 } 0209 s.prepend(prefix); 0210 return s; 0211 } 0212 0213 QVector<uint> KCharSelectData::blockContents(int block) 0214 { 0215 if (!openDataFile()) { 0216 return QVector<uint>(); 0217 } 0218 0219 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0220 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20); 0221 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24); 0222 0223 int max = ((offsetEnd - offsetBegin) / 4) - 1; 0224 0225 QVector<uint> res; 0226 0227 if (block > max) { 0228 return res; 0229 } 0230 0231 quint16 unicodeBegin = qFromLittleEndian<quint16>(data + offsetBegin + block * 4); 0232 quint16 unicodeEnd = qFromLittleEndian<quint16>(data + offsetBegin + block * 4 + 2); 0233 0234 while (unicodeBegin < unicodeEnd) { 0235 res.append(mapDataBaseToCodePoint(unicodeBegin)); 0236 unicodeBegin++; 0237 } 0238 res.append(mapDataBaseToCodePoint(unicodeBegin)); // Be careful when unicodeEnd==0xffff 0239 0240 return res; 0241 } 0242 0243 QVector<int> KCharSelectData::sectionContents(int section) 0244 { 0245 section -= 1; 0246 if (!openDataFile()) { 0247 return QVector<int>(); 0248 } 0249 0250 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0251 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28); 0252 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32); 0253 0254 int max = ((offsetEnd - offsetBegin) / 4) - 1; 0255 0256 QVector<int> res; 0257 0258 if (section > max) { 0259 return res; 0260 } 0261 0262 for (int i = 0; i <= max; i++) { 0263 const quint16 currSection = qFromLittleEndian<quint16>(data + offsetBegin + i * 4); 0264 if (currSection == section || section < 0) { 0265 res.append(qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2)); 0266 } 0267 } 0268 0269 return res; 0270 } 0271 0272 QStringList KCharSelectData::sectionList() 0273 { 0274 if (!openDataFile()) { 0275 return QStringList(); 0276 } 0277 0278 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0279 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24); 0280 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28); 0281 0282 const char *data = dataFile.constData(); 0283 QStringList list; 0284 quint32 i = stringBegin; 0285 list.append(QCoreApplication::translate("KCharSelectData", "All", "KCharSelect section name")); 0286 while (i < stringEnd) { 0287 list.append(QCoreApplication::translate("KCharSelectData", data + i, "KCharSelect section name")); 0288 i += qstrlen(data + i) + 1; 0289 } 0290 0291 return list; 0292 } 0293 0294 QString KCharSelectData::block(uint c) 0295 { 0296 return blockName(blockIndex(c)); 0297 } 0298 0299 QString KCharSelectData::section(uint c) 0300 { 0301 return sectionName(sectionIndex(blockIndex(c))); 0302 } 0303 0304 QString KCharSelectData::name(uint c) 0305 { 0306 if (!openDataFile()) { 0307 return QString(); 0308 } 0309 0310 if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) { 0311 return QCoreApplication::translate("KCharSelectData", "<noncharacter>"); 0312 } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) { 0313 return QLatin1String("CJK UNIFIED IDEOGRAPH-") + formatCode(c, 4, QString()); 0314 } else if (c >= 0xAC00 && c <= 0xD7AF) { 0315 /* compute hangul syllable name as per UAX #15 */ 0316 int SIndex = c - SBase; 0317 int LIndex; 0318 int VIndex; 0319 int TIndex; 0320 0321 if (SIndex < 0 || SIndex >= SCount) { 0322 return QString(); 0323 } 0324 0325 LIndex = SIndex / NCount; 0326 VIndex = (SIndex % NCount) / TCount; 0327 TIndex = SIndex % TCount; 0328 0329 return QLatin1String("HANGUL SYLLABLE ") + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex]) 0330 + QLatin1String(JAMO_T_TABLE[TIndex]); 0331 } else if (c >= 0xD800 && c <= 0xDB7F) { 0332 return QCoreApplication::translate("KCharSelectData", "<Non Private Use High Surrogate>"); 0333 } else if (c >= 0xDB80 && c <= 0xDBFF) { 0334 return QCoreApplication::translate("KCharSelectData", "<Private Use High Surrogate>"); 0335 } else if (c >= 0xDC00 && c <= 0xDFFF) { 0336 return QCoreApplication::translate("KCharSelectData", "<Low Surrogate>"); 0337 } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) { 0338 return QCoreApplication::translate("KCharSelectData", "<Private Use>"); 0339 } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) { 0340 return QLatin1String("CJK COMPATIBILITY IDEOGRAPH-") + formatCode(c, 4, QString()); 0341 } 0342 quint16 unicode = mapCodePointToDataBase(c); 0343 if (unicode == 0xFFFF) { 0344 return QLatin1String("NON-BMP-CHARACTER-") + formatCode(c, 4, QString()); 0345 } else { 0346 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0347 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4); 0348 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8); 0349 0350 int min = 0; 0351 int mid; 0352 int max = ((offsetEnd - offsetBegin) / 6) - 1; 0353 QString s; 0354 0355 while (max >= min) { 0356 mid = (min + max) / 2; 0357 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6); 0358 if (unicode > midUnicode) { 0359 min = mid + 1; 0360 } else if (unicode < midUnicode) { 0361 max = mid - 1; 0362 } else { 0363 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2); 0364 s = QString::fromUtf8(dataFile.constData() + offset + 1); 0365 break; 0366 } 0367 } 0368 0369 if (s.isNull()) { 0370 return QCoreApplication::translate("KCharSelectData", "<not assigned>"); 0371 } else { 0372 return s; 0373 } 0374 } 0375 } 0376 0377 int KCharSelectData::blockIndex(uint c) 0378 { 0379 if (!openDataFile()) { 0380 return 0; 0381 } 0382 0383 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0384 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20); 0385 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24); 0386 const quint16 unicode = mapCodePointToDataBase(c); 0387 if (unicode == 0xFFFF) { 0388 return 0; 0389 } 0390 0391 int max = ((offsetEnd - offsetBegin) / 4) - 1; 0392 0393 int i = 0; 0394 0395 while (unicode > qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) && i < max) { 0396 i++; 0397 } 0398 0399 return i; 0400 } 0401 0402 int KCharSelectData::sectionIndex(int block) 0403 { 0404 if (!openDataFile()) { 0405 return 0; 0406 } 0407 0408 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0409 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28); 0410 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32); 0411 0412 int max = ((offsetEnd - offsetBegin) / 4) - 1; 0413 0414 for (int i = 0; i <= max; i++) { 0415 if (qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) == block) { 0416 return qFromLittleEndian<quint16>(data + offsetBegin + i * 4) + 1; 0417 } 0418 } 0419 0420 return 0; 0421 } 0422 0423 QString KCharSelectData::blockName(int index) 0424 { 0425 if (!openDataFile()) { 0426 return QString(); 0427 } 0428 0429 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0430 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 16); 0431 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 20); 0432 0433 quint32 i = stringBegin; 0434 int currIndex = 0; 0435 0436 const char *data = dataFile.constData(); 0437 while (i < stringEnd && currIndex < index) { 0438 i += qstrlen(data + i) + 1; 0439 currIndex++; 0440 } 0441 0442 return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode block name"); 0443 } 0444 0445 QString KCharSelectData::sectionName(int index) 0446 { 0447 if (index == 0) { 0448 return QCoreApplication::translate("KCharSelectData", "All", "KCharselect unicode section name"); 0449 } 0450 if (!openDataFile()) { 0451 return QString(); 0452 } 0453 0454 index -= 1; 0455 0456 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0457 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24); 0458 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28); 0459 0460 quint32 i = stringBegin; 0461 int currIndex = 0; 0462 0463 const char *data = dataFile.constData(); 0464 while (i < stringEnd && currIndex < index) { 0465 i += qstrlen(data + i) + 1; 0466 currIndex++; 0467 } 0468 0469 return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode section name"); 0470 } 0471 0472 QStringList KCharSelectData::aliases(uint c) 0473 { 0474 if (!openDataFile()) { 0475 return QStringList(); 0476 } 0477 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0478 const int detailIndex = getDetailIndex(c); 0479 if (detailIndex == 0) { 0480 return QStringList(); 0481 } 0482 0483 const quint8 count = *(quint8 *)(udata + detailIndex + 6); 0484 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 2); 0485 0486 QStringList aliases; 0487 aliases.reserve(count); 0488 0489 const char *data = dataFile.constData(); 0490 for (int i = 0; i < count; i++) { 0491 aliases.append(QString::fromUtf8(data + offset)); 0492 offset += qstrlen(data + offset) + 1; 0493 } 0494 return aliases; 0495 } 0496 0497 QStringList KCharSelectData::notes(uint c) 0498 { 0499 if (!openDataFile()) { 0500 return QStringList(); 0501 } 0502 const int detailIndex = getDetailIndex(c); 0503 if (detailIndex == 0) { 0504 return QStringList(); 0505 } 0506 0507 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0508 const quint8 count = *(quint8 *)(udata + detailIndex + 11); 0509 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 7); 0510 0511 QStringList notes; 0512 notes.reserve(count); 0513 0514 const char *data = dataFile.constData(); 0515 for (int i = 0; i < count; i++) { 0516 notes.append(QString::fromUtf8(data + offset)); 0517 offset += qstrlen(data + offset) + 1; 0518 } 0519 0520 return notes; 0521 } 0522 0523 QVector<uint> KCharSelectData::seeAlso(uint c) 0524 { 0525 if (!openDataFile()) { 0526 return QVector<uint>(); 0527 } 0528 const int detailIndex = getDetailIndex(c); 0529 if (detailIndex == 0) { 0530 return QVector<uint>(); 0531 } 0532 0533 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0534 const quint8 count = *(quint8 *)(udata + detailIndex + 26); 0535 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 22); 0536 0537 QVector<uint> seeAlso; 0538 seeAlso.reserve(count); 0539 0540 for (int i = 0; i < count; i++) { 0541 seeAlso.append(mapDataBaseToCodePoint(qFromLittleEndian<quint16>(udata + offset))); 0542 offset += 2; 0543 } 0544 0545 return seeAlso; 0546 } 0547 0548 QStringList KCharSelectData::equivalents(uint c) 0549 { 0550 if (!openDataFile()) { 0551 return QStringList(); 0552 } 0553 const int detailIndex = getDetailIndex(c); 0554 if (detailIndex == 0) { 0555 return QStringList(); 0556 } 0557 0558 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0559 const quint8 count = *(quint8 *)(udata + detailIndex + 21); 0560 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 17); 0561 0562 QStringList equivalents; 0563 equivalents.reserve(count); 0564 0565 const char *data = dataFile.constData(); 0566 for (int i = 0; i < count; i++) { 0567 equivalents.append(QString::fromUtf8(data + offset)); 0568 offset += qstrlen(data + offset) + 1; 0569 } 0570 0571 return equivalents; 0572 } 0573 0574 QStringList KCharSelectData::approximateEquivalents(uint c) 0575 { 0576 if (!openDataFile()) { 0577 return QStringList(); 0578 } 0579 const int detailIndex = getDetailIndex(c); 0580 if (detailIndex == 0) { 0581 return QStringList(); 0582 } 0583 0584 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0585 const quint8 count = *(quint8 *)(udata + detailIndex + 16); 0586 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 12); 0587 0588 QStringList approxEquivalents; 0589 approxEquivalents.reserve(count); 0590 0591 const char *data = dataFile.constData(); 0592 for (int i = 0; i < count; i++) { 0593 approxEquivalents.append(QString::fromUtf8(data + offset)); 0594 offset += qstrlen(data + offset) + 1; 0595 } 0596 0597 return approxEquivalents; 0598 } 0599 0600 QVector<uint> KCharSelectData::decomposition(uint c) 0601 { 0602 // for now, only decompose Hangul Syllable into Hangul Jamo 0603 uint SIndex = c - SBase; 0604 if (SIndex >= SCount) { 0605 return QVector<uint>(); 0606 } 0607 0608 uint L = LBase + SIndex / NCount; // Choseong 0609 uint V = VBase + (SIndex % NCount) / TCount; // Jungseong 0610 uint T = TBase + SIndex % TCount; // Jongsung 0611 QVector<uint> jamoList; 0612 jamoList.append(L); 0613 jamoList.append(V); 0614 if (T != TBase) { 0615 jamoList.append(T); 0616 } 0617 return jamoList; 0618 } 0619 0620 QStringList KCharSelectData::unihanInfo(uint c) 0621 { 0622 if (!openDataFile()) { 0623 return QStringList(); 0624 } 0625 0626 quint16 unicode = mapCodePointToDataBase(c); 0627 if (unicode == 0xFFFF) { 0628 return QStringList(); 0629 } 0630 0631 const char *data = dataFile.constData(); 0632 const uchar *udata = reinterpret_cast<const uchar *>(data); 0633 const quint32 offsetBegin = qFromLittleEndian<quint32>(udata + 36); 0634 const quint32 offsetEnd = dataFile.size(); 0635 0636 int min = 0; 0637 int mid; 0638 int max = ((offsetEnd - offsetBegin) / 30) - 1; 0639 0640 while (max >= min) { 0641 mid = (min + max) / 2; 0642 const quint16 midUnicode = qFromLittleEndian<quint16>(udata + offsetBegin + mid * 30); 0643 if (unicode > midUnicode) { 0644 min = mid + 1; 0645 } else if (unicode < midUnicode) { 0646 max = mid - 1; 0647 } else { 0648 QStringList res; 0649 res.reserve(7); 0650 for (int i = 0; i < 7; i++) { 0651 quint32 offset = qFromLittleEndian<quint32>(udata + offsetBegin + mid * 30 + 2 + i * 4); 0652 if (offset != 0) { 0653 res.append(QString::fromUtf8(data + offset)); 0654 } else { 0655 res.append(QString()); 0656 } 0657 } 0658 return res; 0659 } 0660 } 0661 0662 return QStringList(); 0663 } 0664 0665 QChar::Category KCharSelectData::category(uint c) 0666 { 0667 if (!openDataFile()) { 0668 return QChar::category(c); 0669 } 0670 0671 ushort unicode = mapCodePointToDataBase(c); 0672 if (unicode == 0xFFFF) { 0673 return QChar::category(c); 0674 } 0675 0676 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0677 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4); 0678 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8); 0679 0680 int min = 0; 0681 int mid; 0682 int max = ((offsetEnd - offsetBegin) / 6) - 1; 0683 0684 while (max >= min) { 0685 mid = (min + max) / 2; 0686 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6); 0687 if (unicode > midUnicode) { 0688 min = mid + 1; 0689 } else if (unicode < midUnicode) { 0690 max = mid - 1; 0691 } else { 0692 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2); 0693 uchar categoryCode = *(data + offset); 0694 Q_ASSERT(categoryCode > 0); 0695 categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1 0696 See QtBase commit d17c76feee9eece4 */ 0697 return QChar::Category(categoryCode); 0698 } 0699 } 0700 0701 return QChar::category(c); 0702 } 0703 0704 bool KCharSelectData::isPrint(uint c) 0705 { 0706 QChar::Category cat = category(c); 0707 return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned); 0708 } 0709 0710 bool KCharSelectData::isDisplayable(uint c) 0711 { 0712 // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames. 0713 // They should be seen as non-printable characters, as trying to display them leads 0714 // to a crash caused by a Qt "noBlockInString" assertion. 0715 if (c == 0xFDD0 || c == 0xFDD1) { 0716 return false; 0717 } 0718 0719 return !isIgnorable(c) && isPrint(c); 0720 } 0721 0722 bool KCharSelectData::isIgnorable(uint c) 0723 { 0724 /* 0725 * According to the Unicode standard, Default Ignorable Code Points 0726 * should be ignored unless explicitly supported. For example, U+202E 0727 * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying 0728 * it gives the undesired effect of all text being turned RTL. We do not 0729 * have a way to "explicitly" support it, so we will treat it as 0730 * non-printable. 0731 * 0732 * There is a list of these on 0733 * http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt under the 0734 * property Default_Ignorable_Code_Point. 0735 */ 0736 0737 // NOTE: not very nice to hardcode these here; is it worth it to modify 0738 // the binary data file to hold them? 0739 // clang-format off 0740 return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 || 0741 c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) || 0742 (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) || 0743 (c >= 0x2060 && c <= 0x206F) || c == 0x3164 || 0744 (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 || 0745 (c >= 0xFFF0 && c <= 0xFFF8); 0746 // clang-format on 0747 } 0748 0749 bool KCharSelectData::isCombining(uint c) 0750 { 0751 return section(c) == QCoreApplication::translate("KCharSelectData", "Combining Diacritics", "KCharSelect section name"); 0752 // FIXME: this is an imperfect test. There are many combining characters 0753 // that are outside of this section. See Grapheme_Extend in 0754 // http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt 0755 } 0756 0757 QString KCharSelectData::display(uint c, const QFont &font) 0758 { 0759 if (!isDisplayable(c)) { 0760 return QLatin1String("<b>") + QCoreApplication::translate("KCharSelectData", "Non-printable") + QLatin1String("</b>"); 0761 } else { 0762 QString s = QLatin1String("<font size=\"+4\" face=\"") + font.family() + QLatin1String("\">"); 0763 if (isCombining(c)) { 0764 s += displayCombining(c); 0765 } else { 0766 s += QLatin1String("&#") + QString::number(c) + QLatin1Char(';'); 0767 } 0768 s += QLatin1String("</font>"); 0769 return s; 0770 } 0771 } 0772 0773 QString KCharSelectData::displayCombining(uint c) 0774 { 0775 /* 0776 * The purpose of this is to make it easier to see how a combining 0777 * character affects the text around it. 0778 * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose, 0779 * as seen in pdfs from Unicode, but there seem to be a lot of alignment 0780 * problems with that. 0781 * 0782 * Eventually, it would be nice to determine whether the character 0783 * combines to the left or to the right, etc. 0784 */ 0785 QString s = QLatin1String(" &#") + QString::number(c) + QLatin1String("; ") + QLatin1String(" (ab&#") + QString::number(c) + QLatin1String(";c)"); 0786 return s; 0787 } 0788 0789 QString KCharSelectData::categoryText(QChar::Category category) 0790 { 0791 switch (category) { 0792 case QChar::Other_Control: 0793 return QCoreApplication::translate("KCharSelectData", "Other, Control"); 0794 case QChar::Other_Format: 0795 return QCoreApplication::translate("KCharSelectData", "Other, Format"); 0796 case QChar::Other_NotAssigned: 0797 return QCoreApplication::translate("KCharSelectData", "Other, Not Assigned"); 0798 case QChar::Other_PrivateUse: 0799 return QCoreApplication::translate("KCharSelectData", "Other, Private Use"); 0800 case QChar::Other_Surrogate: 0801 return QCoreApplication::translate("KCharSelectData", "Other, Surrogate"); 0802 case QChar::Letter_Lowercase: 0803 return QCoreApplication::translate("KCharSelectData", "Letter, Lowercase"); 0804 case QChar::Letter_Modifier: 0805 return QCoreApplication::translate("KCharSelectData", "Letter, Modifier"); 0806 case QChar::Letter_Other: 0807 return QCoreApplication::translate("KCharSelectData", "Letter, Other"); 0808 case QChar::Letter_Titlecase: 0809 return QCoreApplication::translate("KCharSelectData", "Letter, Titlecase"); 0810 case QChar::Letter_Uppercase: 0811 return QCoreApplication::translate("KCharSelectData", "Letter, Uppercase"); 0812 case QChar::Mark_SpacingCombining: 0813 return QCoreApplication::translate("KCharSelectData", "Mark, Spacing Combining"); 0814 case QChar::Mark_Enclosing: 0815 return QCoreApplication::translate("KCharSelectData", "Mark, Enclosing"); 0816 case QChar::Mark_NonSpacing: 0817 return QCoreApplication::translate("KCharSelectData", "Mark, Non-Spacing"); 0818 case QChar::Number_DecimalDigit: 0819 return QCoreApplication::translate("KCharSelectData", "Number, Decimal Digit"); 0820 case QChar::Number_Letter: 0821 return QCoreApplication::translate("KCharSelectData", "Number, Letter"); 0822 case QChar::Number_Other: 0823 return QCoreApplication::translate("KCharSelectData", "Number, Other"); 0824 case QChar::Punctuation_Connector: 0825 return QCoreApplication::translate("KCharSelectData", "Punctuation, Connector"); 0826 case QChar::Punctuation_Dash: 0827 return QCoreApplication::translate("KCharSelectData", "Punctuation, Dash"); 0828 case QChar::Punctuation_Close: 0829 return QCoreApplication::translate("KCharSelectData", "Punctuation, Close"); 0830 case QChar::Punctuation_FinalQuote: 0831 return QCoreApplication::translate("KCharSelectData", "Punctuation, Final Quote"); 0832 case QChar::Punctuation_InitialQuote: 0833 return QCoreApplication::translate("KCharSelectData", "Punctuation, Initial Quote"); 0834 case QChar::Punctuation_Other: 0835 return QCoreApplication::translate("KCharSelectData", "Punctuation, Other"); 0836 case QChar::Punctuation_Open: 0837 return QCoreApplication::translate("KCharSelectData", "Punctuation, Open"); 0838 case QChar::Symbol_Currency: 0839 return QCoreApplication::translate("KCharSelectData", "Symbol, Currency"); 0840 case QChar::Symbol_Modifier: 0841 return QCoreApplication::translate("KCharSelectData", "Symbol, Modifier"); 0842 case QChar::Symbol_Math: 0843 return QCoreApplication::translate("KCharSelectData", "Symbol, Math"); 0844 case QChar::Symbol_Other: 0845 return QCoreApplication::translate("KCharSelectData", "Symbol, Other"); 0846 case QChar::Separator_Line: 0847 return QCoreApplication::translate("KCharSelectData", "Separator, Line"); 0848 case QChar::Separator_Paragraph: 0849 return QCoreApplication::translate("KCharSelectData", "Separator, Paragraph"); 0850 case QChar::Separator_Space: 0851 return QCoreApplication::translate("KCharSelectData", "Separator, Space"); 0852 default: 0853 return QCoreApplication::translate("KCharSelectData", "Unknown"); 0854 } 0855 } 0856 0857 QVector<uint> KCharSelectData::find(const QString &needle) 0858 { 0859 QSet<uint> result; 0860 0861 QVector<uint> returnRes; 0862 QString simplified = needle.length() > 1 ? needle.simplified() : needle; 0863 QStringList searchStrings; 0864 0865 static const QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$")); 0866 if (octalExp.match(simplified).hasMatch()) { 0867 // search for C octal escaped UTF-8 0868 QByteArray utf8; 0869 int byte = -1; 0870 for (int i = 0; i <= simplified.length(); ++i) { 0871 int c = simplified.at(i).unicode(); 0872 if (c >= '0' && c <= '7') { 0873 byte = 8 * byte + c - '0'; 0874 } else if (byte == -1) { 0875 byte = 0; 0876 } else if (byte >= 0x00 && byte <= 0xFF) { 0877 utf8.append((char)byte); 0878 byte = 0; 0879 } 0880 } 0881 simplified = QString::fromUtf8(utf8); 0882 } 0883 0884 if (simplified.length() <= 2) { 0885 QVector<uint> ucs4 = simplified.toUcs4(); 0886 if (ucs4.size() == 1) { 0887 // search for hex representation of the character 0888 searchStrings = QStringList(formatCode(ucs4.at(0))); 0889 } else { 0890 searchStrings = splitString(simplified); 0891 } 0892 } else { 0893 searchStrings = splitString(simplified); 0894 } 0895 0896 if (searchStrings.isEmpty()) { 0897 return returnRes; 0898 } 0899 0900 static const QRegularExpression hexExp(QStringLiteral("^(?:|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$")); 0901 for (const QString &s : std::as_const(searchStrings)) { 0902 const QRegularExpressionMatch match = hexExp.match(s); 0903 if (match.hasMatch()) { 0904 const QString cap = match.captured(1); 0905 returnRes.append(cap.toInt(nullptr, 16)); 0906 // search for "1234" instead of "0x1234" 0907 if (s.length() == 6 || s.length() == 7) { 0908 searchStrings[searchStrings.indexOf(s)] = cap; 0909 } 0910 } 0911 // try to parse string as decimal number 0912 bool ok; 0913 int unicode = s.toInt(&ok); 0914 if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) { 0915 returnRes.append(unicode); 0916 } 0917 } 0918 0919 bool firstSubString = true; 0920 for (const QString &s : std::as_const(searchStrings)) { 0921 QSet<uint> partResult = getMatchingChars(s.toLower()); 0922 if (firstSubString) { 0923 result = partResult; 0924 firstSubString = false; 0925 } else { 0926 result = result.intersect(partResult); 0927 } 0928 } 0929 0930 // remove results found by matching the code point to prevent duplicate results 0931 // while letting these characters stay at the beginning 0932 for (uint c : std::as_const(returnRes)) { 0933 result.remove(c); 0934 } 0935 0936 QVector<uint> sortedResult; 0937 sortedResult.reserve(result.count()); 0938 for (auto c : std::as_const(result)) { 0939 sortedResult.append(c); 0940 } 0941 std::sort(sortedResult.begin(), sortedResult.end()); 0942 0943 returnRes += sortedResult; 0944 return returnRes; 0945 } 0946 0947 QSet<uint> KCharSelectData::getMatchingChars(const QString &s) 0948 { 0949 if (dataFile.isEmpty()) { 0950 return QSet<uint>(); 0951 } 0952 futureIndex.waitForFinished(); 0953 const Index index = futureIndex.result(); 0954 Index::const_iterator pos = index.lowerBound(s); 0955 QSet<uint> result; 0956 0957 while (pos != index.constEnd() && pos.key().startsWith(s)) { 0958 for (quint16 c : pos.value()) { 0959 result.insert(mapDataBaseToCodePoint(c)); 0960 } 0961 ++pos; 0962 } 0963 0964 return result; 0965 } 0966 0967 QStringList KCharSelectData::splitString(const QString &s) 0968 { 0969 QStringList result; 0970 int start = 0; 0971 int end = 0; 0972 int length = s.length(); 0973 while (end < length) { 0974 while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) { 0975 end++; 0976 } 0977 if (start != end) { 0978 result.append(s.mid(start, end - start)); 0979 } 0980 start = end; 0981 while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) { 0982 end++; 0983 start++; 0984 } 0985 } 0986 return result; 0987 } 0988 0989 void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s) 0990 { 0991 const QStringList strings = splitString(s); 0992 for (const QString &s : strings) { 0993 (*index)[s.toLower()].append(unicode); 0994 } 0995 } 0996 0997 Index KCharSelectData::createIndex(const QByteArray &dataFile) 0998 { 0999 Index i; 1000 1001 // character names 1002 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 1003 const char *data = dataFile.constData(); 1004 const quint32 nameOffsetBegin = qFromLittleEndian<quint32>(udata + 4); 1005 const quint32 nameOffsetEnd = qFromLittleEndian<quint32>(udata + 8); 1006 1007 int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1; 1008 1009 for (int pos = 0; pos <= max; pos++) { 1010 const quint16 unicode = qFromLittleEndian<quint16>(udata + nameOffsetBegin + pos * 6); 1011 quint32 offset = qFromLittleEndian<quint32>(udata + nameOffsetBegin + pos * 6 + 2); 1012 appendToIndex(&i, unicode, QString::fromUtf8(data + offset + 1)); 1013 } 1014 1015 // details 1016 const quint32 detailsOffsetBegin = qFromLittleEndian<quint32>(udata + 12); 1017 const quint32 detailsOffsetEnd = qFromLittleEndian<quint32>(udata + 16); 1018 1019 max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1; 1020 1021 for (int pos = 0; pos <= max; pos++) { 1022 const quint16 unicode = qFromLittleEndian<quint16>(udata + detailsOffsetBegin + pos * 27); 1023 1024 // aliases 1025 const quint8 aliasCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6); 1026 quint32 aliasOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 2); 1027 1028 for (int j = 0; j < aliasCount; j++) { 1029 appendToIndex(&i, unicode, QString::fromUtf8(data + aliasOffset)); 1030 aliasOffset += qstrlen(data + aliasOffset) + 1; 1031 } 1032 1033 // notes 1034 const quint8 notesCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11); 1035 quint32 notesOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 7); 1036 1037 for (int j = 0; j < notesCount; j++) { 1038 appendToIndex(&i, unicode, QString::fromUtf8(data + notesOffset)); 1039 notesOffset += qstrlen(data + notesOffset) + 1; 1040 } 1041 1042 // approximate equivalents 1043 const quint8 apprCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16); 1044 quint32 apprOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 12); 1045 1046 for (int j = 0; j < apprCount; j++) { 1047 appendToIndex(&i, unicode, QString::fromUtf8(data + apprOffset)); 1048 apprOffset += qstrlen(data + apprOffset) + 1; 1049 } 1050 1051 // equivalents 1052 const quint8 equivCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21); 1053 quint32 equivOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 17); 1054 1055 for (int j = 0; j < equivCount; j++) { 1056 appendToIndex(&i, unicode, QString::fromUtf8(data + equivOffset)); 1057 equivOffset += qstrlen(data + equivOffset) + 1; 1058 } 1059 1060 // see also - convert to string (hex) 1061 const quint8 seeAlsoCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26); 1062 quint32 seeAlsoOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 22); 1063 1064 for (int j = 0; j < seeAlsoCount; j++) { 1065 quint16 seeAlso = qFromLittleEndian<quint16>(udata + seeAlsoOffset); 1066 appendToIndex(&i, unicode, formatCode(seeAlso, 4, QString())); 1067 equivOffset += qstrlen(data + equivOffset) + 1; 1068 } 1069 } 1070 1071 // unihan data 1072 // temporary disabled due to the huge amount of data 1073 // const quint32 unihanOffsetBegin = qFromLittleEndian<quint32>(udata+36); 1074 // const quint32 unihanOffsetEnd = dataFile.size(); 1075 // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1; 1076 // 1077 // for (int pos = 0; pos <= max; pos++) { 1078 // const quint16 unicode = qFromLittleEndian<quint16>(udata + unihanOffsetBegin + pos*30); 1079 // for(int j = 0; j < 7; j++) { 1080 // quint32 offset = qFromLittleEndian<quint32>(udata + unihanOffsetBegin + pos*30 + 2 + j*4); 1081 // if(offset != 0) { 1082 // appendToIndex(&i, unicode, QString::fromUtf8(data + offset)); 1083 // } 1084 // } 1085 // } 1086 1087 return i; 1088 }