File indexing completed on 2024-04-28 03:59:01
0001 /* 0002 This file is part of the KDE libraries 0003 SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de> 0004 0005 SPDX-License-Identifier: LGPL-2.0-or-later 0006 */ 0007 0008 #include "kcharselectdata_p.h" 0009 0010 #include <QCoreApplication> 0011 #include <QFile> 0012 #include <QFutureInterface> 0013 #include <QRegularExpression> 0014 #include <QRunnable> 0015 #include <QStringList> 0016 #include <QThreadPool> 0017 #include <qendian.h> 0018 0019 #include <../test-config.h> 0020 #include <qstandardpaths.h> 0021 #include <string.h> 0022 0023 /* constants for hangul (de)composition, see UAX #15 */ 0024 #define SBase 0xAC00 0025 #define LBase 0x1100 0026 #define VBase 0x1161 0027 #define TBase 0x11A7 0028 #define LCount 19 0029 #define VCount 21 0030 #define TCount 28 0031 #define NCount (VCount * TCount) 0032 #define SCount (LCount * NCount) 0033 0034 class RunIndexCreation : public QFutureInterface<Index>, public QRunnable 0035 { 0036 public: 0037 RunIndexCreation(KCharSelectData *data, const QByteArray &dataFile) 0038 : m_data(data) 0039 , m_dataFile(dataFile) 0040 { 0041 } 0042 0043 QFuture<Index> start() 0044 { 0045 setRunnable(this); 0046 reportStarted(); 0047 QFuture<Index> f = this->future(); 0048 QThreadPool::globalInstance()->start(this); 0049 return f; 0050 } 0051 0052 void run() override 0053 { 0054 Index index = m_data->createIndex(m_dataFile); 0055 reportResult(index); 0056 reportFinished(nullptr); 0057 } 0058 0059 private: 0060 KCharSelectData *const m_data; 0061 const QByteArray m_dataFile; 0062 }; 0063 0064 // clang-format off 0065 static const char JAMO_L_TABLE[][4] = { 0066 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", 0067 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H" 0068 }; 0069 0070 static const char JAMO_V_TABLE[][4] = { 0071 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", 0072 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", 0073 "YU", "EU", "YI", "I" 0074 }; 0075 0076 static const char JAMO_T_TABLE[][4] = { 0077 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", 0078 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", 0079 "S", "SS", "NG", "J", "C", "K", "T", "P", "H" 0080 }; 0081 // clang-format on 0082 0083 bool KCharSelectData::openDataFile() 0084 { 0085 if (!dataFile.isEmpty()) { 0086 return true; 0087 } else { 0088 QFile file(QStringLiteral(":/kf6/kcharselect/kcharselect-data")); 0089 file.open(QIODevice::ReadOnly); 0090 dataFile = file.readAll(); 0091 file.close(); 0092 if (dataFile.size() < 40) { 0093 dataFile.clear(); 0094 return false; 0095 } 0096 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0097 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20); 0098 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24); 0099 uint blocks = (offsetEnd - offsetBegin) / 4; 0100 if (blocks <= 167) { // maximum possible number of blocks in BMP 0101 // no remapping 0102 remapType = -1; 0103 } else if (blocks >= 174 && blocks <= 180) { 0104 // remapping introduced in 5.25 0105 remapType = 0; 0106 } else { 0107 // unknown remapping, abort 0108 dataFile.clear(); 0109 return false; 0110 } 0111 futureIndex = (new RunIndexCreation(this, dataFile))->start(); 0112 return true; 0113 } 0114 } 0115 0116 // Temporary remapping code points <-> 16 bit database codes 0117 // See kcharselect-generate-datafile.py for details 0118 0119 quint16 KCharSelectData::mapCodePointToDataBase(uint code) const 0120 { 0121 if (remapType == 0) { 0122 if (code >= 0xE000 && code <= 0xEFFF) { 0123 return 0xFFFF; 0124 } 0125 if (code >= 0xF000 && code <= 0xFFFF) { 0126 return code - 0x1000; 0127 } 0128 if (code >= 0x1F000 && code <= 0x1FFFF) { 0129 return code - 0x10000; 0130 } 0131 } 0132 if (code >= 0x10000) { 0133 return 0xFFFF; 0134 } 0135 return code; 0136 } 0137 0138 uint KCharSelectData::mapDataBaseToCodePoint(quint16 code) const 0139 { 0140 if (remapType == 0) { 0141 if (code >= 0xE000 && code <= 0xEFFF) { 0142 return code + 0x1000; 0143 } 0144 if (code >= 0xF000) { 0145 return code + 0x10000; 0146 } 0147 } 0148 return code; 0149 } 0150 0151 quint32 KCharSelectData::getDetailIndex(uint c) const 0152 { 0153 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0154 // Convert from little-endian, so that this code works on PPC too. 0155 // http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=482286 0156 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 12); 0157 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 16); 0158 0159 int min = 0; 0160 int mid; 0161 int max = ((offsetEnd - offsetBegin) / 27) - 1; 0162 0163 quint16 unicode = mapCodePointToDataBase(c); 0164 if (unicode == 0xFFFF) { 0165 return 0; 0166 } 0167 0168 static quint16 most_recent_searched; 0169 static quint32 most_recent_result; 0170 0171 if (unicode == most_recent_searched) { 0172 return most_recent_result; 0173 } 0174 0175 most_recent_searched = unicode; 0176 0177 while (max >= min) { 0178 mid = (min + max) / 2; 0179 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 27); 0180 if (unicode > midUnicode) { 0181 min = mid + 1; 0182 } else if (unicode < midUnicode) { 0183 max = mid - 1; 0184 } else { 0185 most_recent_result = offsetBegin + mid * 27; 0186 0187 return most_recent_result; 0188 } 0189 } 0190 0191 most_recent_result = 0; 0192 return 0; 0193 } 0194 0195 QString KCharSelectData::formatCode(uint code, int length, const QString &prefix, int base) 0196 { 0197 QString s = QString::number(code, base).toUpper(); 0198 while (s.size() < length) { 0199 s.prepend(QLatin1Char('0')); 0200 } 0201 s.prepend(prefix); 0202 return s; 0203 } 0204 0205 QList<uint> KCharSelectData::blockContents(int block) 0206 { 0207 if (!openDataFile()) { 0208 return QList<uint>(); 0209 } 0210 0211 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0212 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20); 0213 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24); 0214 0215 int max = ((offsetEnd - offsetBegin) / 4) - 1; 0216 0217 QList<uint> res; 0218 0219 if (block > max) { 0220 return res; 0221 } 0222 0223 quint16 unicodeBegin = qFromLittleEndian<quint16>(data + offsetBegin + block * 4); 0224 quint16 unicodeEnd = qFromLittleEndian<quint16>(data + offsetBegin + block * 4 + 2); 0225 0226 while (unicodeBegin < unicodeEnd) { 0227 res.append(mapDataBaseToCodePoint(unicodeBegin)); 0228 unicodeBegin++; 0229 } 0230 res.append(mapDataBaseToCodePoint(unicodeBegin)); // Be careful when unicodeEnd==0xffff 0231 0232 return res; 0233 } 0234 0235 QList<int> KCharSelectData::sectionContents(int section) 0236 { 0237 section -= 1; 0238 if (!openDataFile()) { 0239 return QList<int>(); 0240 } 0241 0242 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0243 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28); 0244 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32); 0245 0246 int max = ((offsetEnd - offsetBegin) / 4) - 1; 0247 0248 QList<int> res; 0249 0250 if (section > max) { 0251 return res; 0252 } 0253 0254 for (int i = 0; i <= max; i++) { 0255 const quint16 currSection = qFromLittleEndian<quint16>(data + offsetBegin + i * 4); 0256 if (currSection == section || section < 0) { 0257 res.append(qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2)); 0258 } 0259 } 0260 0261 return res; 0262 } 0263 0264 QStringList KCharSelectData::sectionList() 0265 { 0266 if (!openDataFile()) { 0267 return QStringList(); 0268 } 0269 0270 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0271 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24); 0272 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28); 0273 0274 const char *data = dataFile.constData(); 0275 QStringList list; 0276 quint32 i = stringBegin; 0277 list.append(QCoreApplication::translate("KCharSelectData", "All", "KCharSelect section name")); 0278 while (i < stringEnd) { 0279 list.append(QCoreApplication::translate("KCharSelectData", data + i, "KCharSelect section name")); 0280 i += qstrlen(data + i) + 1; 0281 } 0282 0283 return list; 0284 } 0285 0286 QString KCharSelectData::block(uint c) 0287 { 0288 return blockName(blockIndex(c)); 0289 } 0290 0291 QString KCharSelectData::section(uint c) 0292 { 0293 return sectionName(sectionIndex(blockIndex(c))); 0294 } 0295 0296 QString KCharSelectData::name(uint c) 0297 { 0298 if (!openDataFile()) { 0299 return QString(); 0300 } 0301 0302 if ((c & 0xFFFE) == 0xFFFE || (c >= 0xFDD0 && c <= 0xFDEF)) { 0303 return QCoreApplication::translate("KCharSelectData", "<noncharacter>"); 0304 } else if ((c >= 0x3400 && c <= 0x4DBF) || (c >= 0x4E00 && c <= 0x9FFF) || (c >= 0x20000 && c <= 0x2F7FF)) { 0305 return QLatin1String("CJK UNIFIED IDEOGRAPH-") + formatCode(c, 4, QString()); 0306 } else if (c >= 0xAC00 && c <= 0xD7AF) { 0307 /* compute hangul syllable name as per UAX #15 */ 0308 int SIndex = c - SBase; 0309 int LIndex; 0310 int VIndex; 0311 int TIndex; 0312 0313 if (SIndex < 0 || SIndex >= SCount) { 0314 return QString(); 0315 } 0316 0317 LIndex = SIndex / NCount; 0318 VIndex = (SIndex % NCount) / TCount; 0319 TIndex = SIndex % TCount; 0320 0321 return QLatin1String("HANGUL SYLLABLE ") + QLatin1String(JAMO_L_TABLE[LIndex]) + QLatin1String(JAMO_V_TABLE[VIndex]) 0322 + QLatin1String(JAMO_T_TABLE[TIndex]); 0323 } else if (c >= 0xD800 && c <= 0xDB7F) { 0324 return QCoreApplication::translate("KCharSelectData", "<Non Private Use High Surrogate>"); 0325 } else if (c >= 0xDB80 && c <= 0xDBFF) { 0326 return QCoreApplication::translate("KCharSelectData", "<Private Use High Surrogate>"); 0327 } else if (c >= 0xDC00 && c <= 0xDFFF) { 0328 return QCoreApplication::translate("KCharSelectData", "<Low Surrogate>"); 0329 } else if ((c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) { 0330 return QCoreApplication::translate("KCharSelectData", "<Private Use>"); 0331 } else if ((c >= 0xF900 && c <= 0xFAFF) || (c >= 0x2F800 && c <= 0x2FFFF)) { 0332 return QLatin1String("CJK COMPATIBILITY IDEOGRAPH-") + formatCode(c, 4, QString()); 0333 } 0334 quint16 unicode = mapCodePointToDataBase(c); 0335 if (unicode == 0xFFFF) { 0336 return QLatin1String("NON-BMP-CHARACTER-") + formatCode(c, 4, QString()); 0337 } else { 0338 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0339 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4); 0340 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8); 0341 0342 int min = 0; 0343 int mid; 0344 int max = ((offsetEnd - offsetBegin) / 6) - 1; 0345 QString s; 0346 0347 while (max >= min) { 0348 mid = (min + max) / 2; 0349 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6); 0350 if (unicode > midUnicode) { 0351 min = mid + 1; 0352 } else if (unicode < midUnicode) { 0353 max = mid - 1; 0354 } else { 0355 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2); 0356 s = QString::fromUtf8(dataFile.constData() + offset + 1); 0357 break; 0358 } 0359 } 0360 0361 if (s.isNull()) { 0362 return QCoreApplication::translate("KCharSelectData", "<not assigned>"); 0363 } else { 0364 return s; 0365 } 0366 } 0367 } 0368 0369 int KCharSelectData::blockIndex(uint c) 0370 { 0371 if (!openDataFile()) { 0372 return 0; 0373 } 0374 0375 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0376 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 20); 0377 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 24); 0378 const quint16 unicode = mapCodePointToDataBase(c); 0379 if (unicode == 0xFFFF) { 0380 return 0; 0381 } 0382 0383 int max = ((offsetEnd - offsetBegin) / 4) - 1; 0384 0385 int i = 0; 0386 0387 while (unicode > qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) && i < max) { 0388 i++; 0389 } 0390 0391 return i; 0392 } 0393 0394 int KCharSelectData::sectionIndex(int block) 0395 { 0396 if (!openDataFile()) { 0397 return 0; 0398 } 0399 0400 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0401 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 28); 0402 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 32); 0403 0404 int max = ((offsetEnd - offsetBegin) / 4) - 1; 0405 0406 for (int i = 0; i <= max; i++) { 0407 if (qFromLittleEndian<quint16>(data + offsetBegin + i * 4 + 2) == block) { 0408 return qFromLittleEndian<quint16>(data + offsetBegin + i * 4) + 1; 0409 } 0410 } 0411 0412 return 0; 0413 } 0414 0415 QString KCharSelectData::blockName(int index) 0416 { 0417 if (!openDataFile()) { 0418 return QString(); 0419 } 0420 0421 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0422 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 16); 0423 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 20); 0424 0425 quint32 i = stringBegin; 0426 int currIndex = 0; 0427 0428 const char *data = dataFile.constData(); 0429 while (i < stringEnd && currIndex < index) { 0430 i += qstrlen(data + i) + 1; 0431 currIndex++; 0432 } 0433 0434 return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode block name"); 0435 } 0436 0437 QString KCharSelectData::sectionName(int index) 0438 { 0439 if (index == 0) { 0440 return QCoreApplication::translate("KCharSelectData", "All", "KCharselect unicode section name"); 0441 } 0442 if (!openDataFile()) { 0443 return QString(); 0444 } 0445 0446 index -= 1; 0447 0448 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0449 const quint32 stringBegin = qFromLittleEndian<quint32>(udata + 24); 0450 const quint32 stringEnd = qFromLittleEndian<quint32>(udata + 28); 0451 0452 quint32 i = stringBegin; 0453 int currIndex = 0; 0454 0455 const char *data = dataFile.constData(); 0456 while (i < stringEnd && currIndex < index) { 0457 i += qstrlen(data + i) + 1; 0458 currIndex++; 0459 } 0460 0461 return QCoreApplication::translate("KCharSelectData", data + i, "KCharselect unicode section name"); 0462 } 0463 0464 QStringList KCharSelectData::aliases(uint c) 0465 { 0466 if (!openDataFile()) { 0467 return QStringList(); 0468 } 0469 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0470 const int detailIndex = getDetailIndex(c); 0471 if (detailIndex == 0) { 0472 return QStringList(); 0473 } 0474 0475 const quint8 count = *(quint8 *)(udata + detailIndex + 6); 0476 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 2); 0477 0478 QStringList aliases; 0479 aliases.reserve(count); 0480 0481 const char *data = dataFile.constData(); 0482 for (int i = 0; i < count; i++) { 0483 aliases.append(QString::fromUtf8(data + offset)); 0484 offset += qstrlen(data + offset) + 1; 0485 } 0486 return aliases; 0487 } 0488 0489 QStringList KCharSelectData::notes(uint c) 0490 { 0491 if (!openDataFile()) { 0492 return QStringList(); 0493 } 0494 const int detailIndex = getDetailIndex(c); 0495 if (detailIndex == 0) { 0496 return QStringList(); 0497 } 0498 0499 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0500 const quint8 count = *(quint8 *)(udata + detailIndex + 11); 0501 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 7); 0502 0503 QStringList notes; 0504 notes.reserve(count); 0505 0506 const char *data = dataFile.constData(); 0507 for (int i = 0; i < count; i++) { 0508 notes.append(QString::fromUtf8(data + offset)); 0509 offset += qstrlen(data + offset) + 1; 0510 } 0511 0512 return notes; 0513 } 0514 0515 QList<uint> KCharSelectData::seeAlso(uint c) 0516 { 0517 if (!openDataFile()) { 0518 return QList<uint>(); 0519 } 0520 const int detailIndex = getDetailIndex(c); 0521 if (detailIndex == 0) { 0522 return QList<uint>(); 0523 } 0524 0525 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0526 const quint8 count = *(quint8 *)(udata + detailIndex + 26); 0527 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 22); 0528 0529 QList<uint> seeAlso; 0530 seeAlso.reserve(count); 0531 0532 for (int i = 0; i < count; i++) { 0533 seeAlso.append(mapDataBaseToCodePoint(qFromLittleEndian<quint16>(udata + offset))); 0534 offset += 2; 0535 } 0536 0537 return seeAlso; 0538 } 0539 0540 QStringList KCharSelectData::equivalents(uint c) 0541 { 0542 if (!openDataFile()) { 0543 return QStringList(); 0544 } 0545 const int detailIndex = getDetailIndex(c); 0546 if (detailIndex == 0) { 0547 return QStringList(); 0548 } 0549 0550 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0551 const quint8 count = *(quint8 *)(udata + detailIndex + 21); 0552 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 17); 0553 0554 QStringList equivalents; 0555 equivalents.reserve(count); 0556 0557 const char *data = dataFile.constData(); 0558 for (int i = 0; i < count; i++) { 0559 equivalents.append(QString::fromUtf8(data + offset)); 0560 offset += qstrlen(data + offset) + 1; 0561 } 0562 0563 return equivalents; 0564 } 0565 0566 QStringList KCharSelectData::approximateEquivalents(uint c) 0567 { 0568 if (!openDataFile()) { 0569 return QStringList(); 0570 } 0571 const int detailIndex = getDetailIndex(c); 0572 if (detailIndex == 0) { 0573 return QStringList(); 0574 } 0575 0576 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0577 const quint8 count = *(quint8 *)(udata + detailIndex + 16); 0578 quint32 offset = qFromLittleEndian<quint32>(udata + detailIndex + 12); 0579 0580 QStringList approxEquivalents; 0581 approxEquivalents.reserve(count); 0582 0583 const char *data = dataFile.constData(); 0584 for (int i = 0; i < count; i++) { 0585 approxEquivalents.append(QString::fromUtf8(data + offset)); 0586 offset += qstrlen(data + offset) + 1; 0587 } 0588 0589 return approxEquivalents; 0590 } 0591 0592 QList<uint> KCharSelectData::decomposition(uint c) 0593 { 0594 // for now, only decompose Hangul Syllable into Hangul Jamo 0595 uint SIndex = c - SBase; 0596 if (SIndex >= SCount) { 0597 return QList<uint>(); 0598 } 0599 0600 uint L = LBase + SIndex / NCount; // Choseong 0601 uint V = VBase + (SIndex % NCount) / TCount; // Jungseong 0602 uint T = TBase + SIndex % TCount; // Jongsung 0603 QList<uint> jamoList; 0604 jamoList.append(L); 0605 jamoList.append(V); 0606 if (T != TBase) { 0607 jamoList.append(T); 0608 } 0609 return jamoList; 0610 } 0611 0612 QStringList KCharSelectData::unihanInfo(uint c) 0613 { 0614 if (!openDataFile()) { 0615 return QStringList(); 0616 } 0617 0618 quint16 unicode = mapCodePointToDataBase(c); 0619 if (unicode == 0xFFFF) { 0620 return QStringList(); 0621 } 0622 0623 const char *data = dataFile.constData(); 0624 const uchar *udata = reinterpret_cast<const uchar *>(data); 0625 const quint32 offsetBegin = qFromLittleEndian<quint32>(udata + 36); 0626 const quint32 offsetEnd = dataFile.size(); 0627 0628 int min = 0; 0629 int mid; 0630 int max = ((offsetEnd - offsetBegin) / 30) - 1; 0631 0632 while (max >= min) { 0633 mid = (min + max) / 2; 0634 const quint16 midUnicode = qFromLittleEndian<quint16>(udata + offsetBegin + mid * 30); 0635 if (unicode > midUnicode) { 0636 min = mid + 1; 0637 } else if (unicode < midUnicode) { 0638 max = mid - 1; 0639 } else { 0640 QStringList res; 0641 res.reserve(7); 0642 for (int i = 0; i < 7; i++) { 0643 quint32 offset = qFromLittleEndian<quint32>(udata + offsetBegin + mid * 30 + 2 + i * 4); 0644 if (offset != 0) { 0645 res.append(QString::fromUtf8(data + offset)); 0646 } else { 0647 res.append(QString()); 0648 } 0649 } 0650 return res; 0651 } 0652 } 0653 0654 return QStringList(); 0655 } 0656 0657 QChar::Category KCharSelectData::category(uint c) 0658 { 0659 if (!openDataFile()) { 0660 return QChar::category(c); 0661 } 0662 0663 ushort unicode = mapCodePointToDataBase(c); 0664 if (unicode == 0xFFFF) { 0665 return QChar::category(c); 0666 } 0667 0668 const uchar *data = reinterpret_cast<const uchar *>(dataFile.constData()); 0669 const quint32 offsetBegin = qFromLittleEndian<quint32>(data + 4); 0670 const quint32 offsetEnd = qFromLittleEndian<quint32>(data + 8); 0671 0672 int min = 0; 0673 int mid; 0674 int max = ((offsetEnd - offsetBegin) / 6) - 1; 0675 0676 while (max >= min) { 0677 mid = (min + max) / 2; 0678 const quint16 midUnicode = qFromLittleEndian<quint16>(data + offsetBegin + mid * 6); 0679 if (unicode > midUnicode) { 0680 min = mid + 1; 0681 } else if (unicode < midUnicode) { 0682 max = mid - 1; 0683 } else { 0684 quint32 offset = qFromLittleEndian<quint32>(data + offsetBegin + mid * 6 + 2); 0685 uchar categoryCode = *(data + offset); 0686 Q_ASSERT(categoryCode > 0); 0687 categoryCode--; /* Qt5 changed QChar::Category enum to start from 0 instead of 1 0688 See QtBase commit d17c76feee9eece4 */ 0689 return QChar::Category(categoryCode); 0690 } 0691 } 0692 0693 return QChar::category(c); 0694 } 0695 0696 bool KCharSelectData::isPrint(uint c) 0697 { 0698 QChar::Category cat = category(c); 0699 return !(cat == QChar::Other_Control || cat == QChar::Other_NotAssigned); 0700 } 0701 0702 bool KCharSelectData::isDisplayable(uint c) 0703 { 0704 // Qt internally uses U+FDD0 and U+FDD1 to mark the beginning and the end of frames. 0705 // They should be seen as non-printable characters, as trying to display them leads 0706 // to a crash caused by a Qt "noBlockInString" assertion. 0707 if (c == 0xFDD0 || c == 0xFDD1) { 0708 return false; 0709 } 0710 0711 return !isIgnorable(c) && isPrint(c); 0712 } 0713 0714 bool KCharSelectData::isIgnorable(uint c) 0715 { 0716 /* 0717 * According to the Unicode standard, Default Ignorable Code Points 0718 * should be ignored unless explicitly supported. For example, U+202E 0719 * RIGHT-TO-LEFT-OVERRIDE ir printable according to Qt, but displaying 0720 * it gives the undesired effect of all text being turned RTL. We do not 0721 * have a way to "explicitly" support it, so we will treat it as 0722 * non-printable. 0723 * 0724 * There is a list of these on 0725 * http://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt under the 0726 * property Default_Ignorable_Code_Point. 0727 */ 0728 0729 // NOTE: not very nice to hardcode these here; is it worth it to modify 0730 // the binary data file to hold them? 0731 // clang-format off 0732 return c == 0x00AD || c == 0x034F || c == 0x115F || c == 0x1160 || 0733 c == 0x17B4 || c == 0x17B5 || (c >= 0x180B && c <= 0x180D) || 0734 (c >= 0x200B && c <= 0x200F) || (c >= 0x202A && c <= 0x202E) || 0735 (c >= 0x2060 && c <= 0x206F) || c == 0x3164 || 0736 (c >= 0xFE00 && c <= 0xFE0F) || c == 0xFEFF || c == 0xFFA0 || 0737 (c >= 0xFFF0 && c <= 0xFFF8); 0738 // clang-format on 0739 } 0740 0741 bool KCharSelectData::isCombining(uint c) 0742 { 0743 return section(c) == QCoreApplication::translate("KCharSelectData", "Combining Diacritics", "KCharSelect section name"); 0744 // FIXME: this is an imperfect test. There are many combining characters 0745 // that are outside of this section. See Grapheme_Extend in 0746 // http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt 0747 } 0748 0749 QString KCharSelectData::display(uint c, const QFont &font) 0750 { 0751 if (!isDisplayable(c)) { 0752 return QLatin1String("<b>") + QCoreApplication::translate("KCharSelectData", "Non-printable") + QLatin1String("</b>"); 0753 } else { 0754 QString s = QLatin1String("<font size=\"+4\" face=\"") + font.family() + QLatin1String("\">"); 0755 if (isCombining(c)) { 0756 s += displayCombining(c); 0757 } else { 0758 s += QLatin1String("&#") + QString::number(c) + QLatin1Char(';'); 0759 } 0760 s += QLatin1String("</font>"); 0761 return s; 0762 } 0763 } 0764 0765 QString KCharSelectData::displayCombining(uint c) 0766 { 0767 /* 0768 * The purpose of this is to make it easier to see how a combining 0769 * character affects the text around it. 0770 * The initial plan was to use U+25CC DOTTED CIRCLE for this purpose, 0771 * as seen in pdfs from Unicode, but there seem to be a lot of alignment 0772 * problems with that. 0773 * 0774 * Eventually, it would be nice to determine whether the character 0775 * combines to the left or to the right, etc. 0776 */ 0777 QString s = QLatin1String(" &#") + QString::number(c) + QLatin1String("; ") + QLatin1String(" (ab&#") + QString::number(c) + QLatin1String(";c)"); 0778 return s; 0779 } 0780 0781 QString KCharSelectData::categoryText(QChar::Category category) 0782 { 0783 switch (category) { 0784 case QChar::Other_Control: 0785 return QCoreApplication::translate("KCharSelectData", "Other, Control"); 0786 case QChar::Other_Format: 0787 return QCoreApplication::translate("KCharSelectData", "Other, Format"); 0788 case QChar::Other_NotAssigned: 0789 return QCoreApplication::translate("KCharSelectData", "Other, Not Assigned"); 0790 case QChar::Other_PrivateUse: 0791 return QCoreApplication::translate("KCharSelectData", "Other, Private Use"); 0792 case QChar::Other_Surrogate: 0793 return QCoreApplication::translate("KCharSelectData", "Other, Surrogate"); 0794 case QChar::Letter_Lowercase: 0795 return QCoreApplication::translate("KCharSelectData", "Letter, Lowercase"); 0796 case QChar::Letter_Modifier: 0797 return QCoreApplication::translate("KCharSelectData", "Letter, Modifier"); 0798 case QChar::Letter_Other: 0799 return QCoreApplication::translate("KCharSelectData", "Letter, Other"); 0800 case QChar::Letter_Titlecase: 0801 return QCoreApplication::translate("KCharSelectData", "Letter, Titlecase"); 0802 case QChar::Letter_Uppercase: 0803 return QCoreApplication::translate("KCharSelectData", "Letter, Uppercase"); 0804 case QChar::Mark_SpacingCombining: 0805 return QCoreApplication::translate("KCharSelectData", "Mark, Spacing Combining"); 0806 case QChar::Mark_Enclosing: 0807 return QCoreApplication::translate("KCharSelectData", "Mark, Enclosing"); 0808 case QChar::Mark_NonSpacing: 0809 return QCoreApplication::translate("KCharSelectData", "Mark, Non-Spacing"); 0810 case QChar::Number_DecimalDigit: 0811 return QCoreApplication::translate("KCharSelectData", "Number, Decimal Digit"); 0812 case QChar::Number_Letter: 0813 return QCoreApplication::translate("KCharSelectData", "Number, Letter"); 0814 case QChar::Number_Other: 0815 return QCoreApplication::translate("KCharSelectData", "Number, Other"); 0816 case QChar::Punctuation_Connector: 0817 return QCoreApplication::translate("KCharSelectData", "Punctuation, Connector"); 0818 case QChar::Punctuation_Dash: 0819 return QCoreApplication::translate("KCharSelectData", "Punctuation, Dash"); 0820 case QChar::Punctuation_Close: 0821 return QCoreApplication::translate("KCharSelectData", "Punctuation, Close"); 0822 case QChar::Punctuation_FinalQuote: 0823 return QCoreApplication::translate("KCharSelectData", "Punctuation, Final Quote"); 0824 case QChar::Punctuation_InitialQuote: 0825 return QCoreApplication::translate("KCharSelectData", "Punctuation, Initial Quote"); 0826 case QChar::Punctuation_Other: 0827 return QCoreApplication::translate("KCharSelectData", "Punctuation, Other"); 0828 case QChar::Punctuation_Open: 0829 return QCoreApplication::translate("KCharSelectData", "Punctuation, Open"); 0830 case QChar::Symbol_Currency: 0831 return QCoreApplication::translate("KCharSelectData", "Symbol, Currency"); 0832 case QChar::Symbol_Modifier: 0833 return QCoreApplication::translate("KCharSelectData", "Symbol, Modifier"); 0834 case QChar::Symbol_Math: 0835 return QCoreApplication::translate("KCharSelectData", "Symbol, Math"); 0836 case QChar::Symbol_Other: 0837 return QCoreApplication::translate("KCharSelectData", "Symbol, Other"); 0838 case QChar::Separator_Line: 0839 return QCoreApplication::translate("KCharSelectData", "Separator, Line"); 0840 case QChar::Separator_Paragraph: 0841 return QCoreApplication::translate("KCharSelectData", "Separator, Paragraph"); 0842 case QChar::Separator_Space: 0843 return QCoreApplication::translate("KCharSelectData", "Separator, Space"); 0844 default: 0845 return QCoreApplication::translate("KCharSelectData", "Unknown"); 0846 } 0847 } 0848 0849 QList<uint> KCharSelectData::find(const QString &needle) 0850 { 0851 QSet<uint> result; 0852 0853 QList<uint> returnRes; 0854 QString simplified = needle.length() > 1 ? needle.simplified() : needle; 0855 QStringList searchStrings; 0856 0857 static const QRegularExpression octalExp(QStringLiteral("^\\\\[0-7][0-7\\\\]*$")); 0858 if (octalExp.match(simplified).hasMatch()) { 0859 // search for C octal escaped UTF-8 0860 QByteArray utf8; 0861 int byte = -1; 0862 for (int i = 0; i <= simplified.length(); ++i) { 0863 int c = simplified.at(i).unicode(); 0864 if (c >= '0' && c <= '7') { 0865 byte = 8 * byte + c - '0'; 0866 } else if (byte == -1) { 0867 byte = 0; 0868 } else if (byte >= 0x00 && byte <= 0xFF) { 0869 utf8.append((char)byte); 0870 byte = 0; 0871 } 0872 } 0873 simplified = QString::fromUtf8(utf8); 0874 } 0875 0876 if (simplified.length() <= 2) { 0877 QList<uint> ucs4 = simplified.toUcs4(); 0878 if (ucs4.size() == 1) { 0879 // search for hex representation of the character 0880 searchStrings = QStringList(formatCode(ucs4.at(0))); 0881 } else { 0882 searchStrings = splitString(simplified); 0883 } 0884 } else { 0885 searchStrings = splitString(simplified); 0886 } 0887 0888 if (searchStrings.isEmpty()) { 0889 return returnRes; 0890 } 0891 0892 static const QRegularExpression hexExp(QStringLiteral("^(?:|u\\+|U\\+|0x|0X)([A-Fa-f0-9]{4,5})$")); 0893 for (const QString &s : std::as_const(searchStrings)) { 0894 const QRegularExpressionMatch match = hexExp.match(s); 0895 if (match.hasMatch()) { 0896 const QString cap = match.captured(1); 0897 returnRes.append(cap.toInt(nullptr, 16)); 0898 // search for "1234" instead of "0x1234" 0899 if (s.length() == 6 || s.length() == 7) { 0900 searchStrings[searchStrings.indexOf(s)] = cap; 0901 } 0902 } 0903 // try to parse string as decimal number 0904 bool ok; 0905 int unicode = s.toInt(&ok); 0906 if (ok && unicode >= 0 && unicode <= QChar::LastValidCodePoint) { 0907 returnRes.append(unicode); 0908 } 0909 } 0910 0911 bool firstSubString = true; 0912 for (const QString &s : std::as_const(searchStrings)) { 0913 QSet<uint> partResult = getMatchingChars(s.toLower()); 0914 if (firstSubString) { 0915 result = partResult; 0916 firstSubString = false; 0917 } else { 0918 result = result.intersect(partResult); 0919 } 0920 } 0921 0922 // remove results found by matching the code point to prevent duplicate results 0923 // while letting these characters stay at the beginning 0924 for (uint c : std::as_const(returnRes)) { 0925 result.remove(c); 0926 } 0927 0928 QList<uint> sortedResult; 0929 sortedResult.reserve(result.count()); 0930 for (auto c : std::as_const(result)) { 0931 sortedResult.append(c); 0932 } 0933 std::sort(sortedResult.begin(), sortedResult.end()); 0934 0935 returnRes += sortedResult; 0936 return returnRes; 0937 } 0938 0939 QSet<uint> KCharSelectData::getMatchingChars(const QString &s) 0940 { 0941 if (dataFile.isEmpty()) { 0942 return QSet<uint>(); 0943 } 0944 futureIndex.waitForFinished(); 0945 const Index index = futureIndex.result(); 0946 Index::const_iterator pos = index.lowerBound(s); 0947 QSet<uint> result; 0948 0949 while (pos != index.constEnd() && pos.key().startsWith(s)) { 0950 for (quint16 c : pos.value()) { 0951 result.insert(mapDataBaseToCodePoint(c)); 0952 } 0953 ++pos; 0954 } 0955 0956 return result; 0957 } 0958 0959 QStringList KCharSelectData::splitString(const QString &s) 0960 { 0961 QStringList result; 0962 int start = 0; 0963 int end = 0; 0964 int length = s.length(); 0965 while (end < length) { 0966 while (end < length && (s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) { 0967 end++; 0968 } 0969 if (start != end) { 0970 result.append(s.mid(start, end - start)); 0971 } 0972 start = end; 0973 while (end < length && !(s[end].isLetterOrNumber() || s[end] == QLatin1Char('+'))) { 0974 end++; 0975 start++; 0976 } 0977 } 0978 return result; 0979 } 0980 0981 void KCharSelectData::appendToIndex(Index *index, quint16 unicode, const QString &s) 0982 { 0983 const QStringList strings = splitString(s); 0984 for (const QString &s : strings) { 0985 (*index)[s.toLower()].append(unicode); 0986 } 0987 } 0988 0989 Index KCharSelectData::createIndex(const QByteArray &dataFile) 0990 { 0991 Index i; 0992 0993 // character names 0994 const uchar *udata = reinterpret_cast<const uchar *>(dataFile.constData()); 0995 const char *data = dataFile.constData(); 0996 const quint32 nameOffsetBegin = qFromLittleEndian<quint32>(udata + 4); 0997 const quint32 nameOffsetEnd = qFromLittleEndian<quint32>(udata + 8); 0998 0999 int max = ((nameOffsetEnd - nameOffsetBegin) / 6) - 1; 1000 1001 for (int pos = 0; pos <= max; pos++) { 1002 const quint16 unicode = qFromLittleEndian<quint16>(udata + nameOffsetBegin + pos * 6); 1003 quint32 offset = qFromLittleEndian<quint32>(udata + nameOffsetBegin + pos * 6 + 2); 1004 appendToIndex(&i, unicode, QString::fromUtf8(data + offset + 1)); 1005 } 1006 1007 // details 1008 const quint32 detailsOffsetBegin = qFromLittleEndian<quint32>(udata + 12); 1009 const quint32 detailsOffsetEnd = qFromLittleEndian<quint32>(udata + 16); 1010 1011 max = ((detailsOffsetEnd - detailsOffsetBegin) / 27) - 1; 1012 1013 for (int pos = 0; pos <= max; pos++) { 1014 const quint16 unicode = qFromLittleEndian<quint16>(udata + detailsOffsetBegin + pos * 27); 1015 1016 // aliases 1017 const quint8 aliasCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 6); 1018 quint32 aliasOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 2); 1019 1020 for (int j = 0; j < aliasCount; j++) { 1021 appendToIndex(&i, unicode, QString::fromUtf8(data + aliasOffset)); 1022 aliasOffset += qstrlen(data + aliasOffset) + 1; 1023 } 1024 1025 // notes 1026 const quint8 notesCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 11); 1027 quint32 notesOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 7); 1028 1029 for (int j = 0; j < notesCount; j++) { 1030 appendToIndex(&i, unicode, QString::fromUtf8(data + notesOffset)); 1031 notesOffset += qstrlen(data + notesOffset) + 1; 1032 } 1033 1034 // approximate equivalents 1035 const quint8 apprCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 16); 1036 quint32 apprOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 12); 1037 1038 for (int j = 0; j < apprCount; j++) { 1039 appendToIndex(&i, unicode, QString::fromUtf8(data + apprOffset)); 1040 apprOffset += qstrlen(data + apprOffset) + 1; 1041 } 1042 1043 // equivalents 1044 const quint8 equivCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 21); 1045 quint32 equivOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 17); 1046 1047 for (int j = 0; j < equivCount; j++) { 1048 appendToIndex(&i, unicode, QString::fromUtf8(data + equivOffset)); 1049 equivOffset += qstrlen(data + equivOffset) + 1; 1050 } 1051 1052 // see also - convert to string (hex) 1053 const quint8 seeAlsoCount = *(quint8 *)(udata + detailsOffsetBegin + pos * 27 + 26); 1054 quint32 seeAlsoOffset = qFromLittleEndian<quint32>(udata + detailsOffsetBegin + pos * 27 + 22); 1055 1056 for (int j = 0; j < seeAlsoCount; j++) { 1057 quint16 seeAlso = qFromLittleEndian<quint16>(udata + seeAlsoOffset); 1058 appendToIndex(&i, unicode, formatCode(seeAlso, 4, QString())); 1059 equivOffset += qstrlen(data + equivOffset) + 1; 1060 } 1061 } 1062 1063 // unihan data 1064 // temporary disabled due to the huge amount of data 1065 // const quint32 unihanOffsetBegin = qFromLittleEndian<quint32>(udata+36); 1066 // const quint32 unihanOffsetEnd = dataFile.size(); 1067 // max = ((unihanOffsetEnd - unihanOffsetBegin) / 30) - 1; 1068 // 1069 // for (int pos = 0; pos <= max; pos++) { 1070 // const quint16 unicode = qFromLittleEndian<quint16>(udata + unihanOffsetBegin + pos*30); 1071 // for(int j = 0; j < 7; j++) { 1072 // quint32 offset = qFromLittleEndian<quint32>(udata + unihanOffsetBegin + pos*30 + 2 + j*4); 1073 // if(offset != 0) { 1074 // appendToIndex(&i, unicode, QString::fromUtf8(data + offset)); 1075 // } 1076 // } 1077 // } 1078 1079 return i; 1080 }