File indexing completed on 2024-05-05 05:54:13
0001 /* 0002 This file is part of Konsole, a terminal emulator for KDE. 0003 0004 SPDX-FileCopyrightText: 2018 Mariusz Glebocki <mglb@arccos-1.net> 0005 0006 SPDX-License-Identifier: GPL-2.0-or-later 0007 */ 0008 0009 #include "template.h" 0010 #include <QCommandLineParser> 0011 #include <QCoreApplication> 0012 #include <QEventLoop> 0013 #include <QFile> 0014 #include <QFileInfo> 0015 #include <QLoggingCategory> 0016 #include <QMap> 0017 #include <QRegularExpression> 0018 #include <QRegularExpressionMatch> 0019 #include <QString> 0020 #include <QTextStream> 0021 0022 #include <KIO/Job> 0023 0024 static constexpr unsigned int CODE_POINTS_NUM = 0x110000; 0025 static constexpr unsigned int LAST_CODE_POINT = CODE_POINTS_NUM - 1; 0026 0027 struct UcdEntry { 0028 struct { 0029 uint first; 0030 uint last; 0031 } cp; 0032 QStringList fields; 0033 }; 0034 0035 class UcdParserBase 0036 { 0037 public: 0038 ~UcdParserBase() 0039 { 0040 _source->close(); 0041 } 0042 0043 bool hasNext() 0044 { 0045 bool hadNext = _hasNext; 0046 if (!_nextFetched) { 0047 _hasNext = fetchNext(); 0048 _nextFetched = true; 0049 } 0050 return hadNext; 0051 } 0052 0053 protected: 0054 UcdParserBase(QIODevice *source, UcdEntry *entry) 0055 : _source(source) 0056 , _nextFetched(false) 0057 , _hasNext(true) 0058 , _lineNo(0) 0059 , _entry(entry) 0060 { 0061 Q_ASSERT(_source); 0062 Q_ASSERT(_entry); 0063 } 0064 0065 bool fetchNext() 0066 { 0067 Q_ASSERT(_source->isOpen()); 0068 if (!_source->isOpen()) 0069 return false; 0070 0071 static const QRegularExpression ENTRY_RE = QRegularExpression(QStringLiteral( 0072 // Match 1: "cp1" - first CP / "cp2" (optional) - last CP 0073 R"#((?:^(?<cp1>[[:xdigit:]]+)(?:\.\.(?<cp2>[[:xdigit:]]+))?[ \t]*;)#" 0074 // Match 1: "field0" - first data field" 0075 // "udRangeInd" (UnicodeData.txt only) - if present, the line is either first or last line of a range 0076 R"#([ \t]*(?<field0>[^#;\n]*?(?:, (?<udRangeInd>First|Last)>)?)[ \t]*(?:;|(?:\#.*)?$))|)#" 0077 // Match 2..n: "field" - n-th field 0078 R"#((?:\G(?<=;)[ \t]*(?<field>[^#;\n]*?)[ \t]*(?:;|(?:#.*)?$)))#")); 0079 static const QRegularExpression UD_RANGE_IND_RE(QStringLiteral(", (First|Last)")); 0080 static const QRegularExpression COMMENT_RE(QStringLiteral("^[ \t]*(#.*)?$")); 0081 0082 QString line; 0083 bool ok; 0084 _entry->fields.clear(); 0085 while (!_source->atEnd()) { 0086 line = QString::fromUtf8(_source->readLine()); 0087 _lineNo++; 0088 auto mit = ENTRY_RE.globalMatch(line); 0089 if (!mit.hasNext()) { 0090 // Do not complain about comments and empty lines 0091 if (!COMMENT_RE.match(line).hasMatch()) 0092 qDebug() << QStringLiteral("Line %1: does not match - skipping").arg(_lineNo); 0093 continue; 0094 } 0095 0096 auto match = mit.next(); 0097 _entry->cp.first = match.captured(QStringLiteral("cp1")).toUInt(&ok, 16); 0098 if (!ok) { 0099 qDebug() << QStringLiteral("Line %d Invalid cp1 - skipping").arg(_lineNo); 0100 continue; 0101 } 0102 _entry->cp.last = match.captured(QStringLiteral("cp2")).toUInt(&ok, 16); 0103 if (!ok) { 0104 _entry->cp.last = _entry->cp.first; 0105 } 0106 QString field0 = match.captured(QStringLiteral("field0")); 0107 if (field0.isNull()) { 0108 qDebug() << QStringLiteral("Line %d: Missing field0 - skipping").arg(_lineNo); 0109 continue; 0110 } 0111 if (!match.captured(QStringLiteral("udRangeInd")).isNull()) { 0112 if (match.captured(QStringLiteral("udRangeInd")) == QStringLiteral("First")) { 0113 // Fetch next valid line, as it pairs with the current one to form a range 0114 QRegularExpressionMatch nlMatch; 0115 int firstLineNo = _lineNo; 0116 while (!_source->atEnd() && !nlMatch.hasMatch()) { 0117 line = QString::fromUtf8(_source->readLine()); 0118 _lineNo++; 0119 nlMatch = ENTRY_RE.match(line); 0120 if (!nlMatch.hasMatch()) { 0121 qDebug() << QStringLiteral("Line %d: does not match - skipping").arg(_lineNo); 0122 } 0123 } 0124 if (nlMatch.hasMatch()) { 0125 _entry->cp.last = nlMatch.captured(QStringLiteral("cp1")).toUInt(&ok, 16); 0126 if (!ok) { 0127 qDebug() << QStringLiteral("Line %1-%2: Missing or invalid second cp1 (\"Last\" entry) - skipping").arg(firstLineNo).arg(_lineNo); 0128 continue; 0129 } 0130 } 0131 } 0132 field0.remove(UD_RANGE_IND_RE); 0133 } 0134 _entry->fields.append(field0); 0135 0136 while (mit.hasNext()) { 0137 _entry->fields.append(mit.next().captured(QStringLiteral("field"))); 0138 } 0139 0140 return !_source->atEnd(); 0141 } 0142 return false; 0143 } 0144 0145 QIODevice *_source; 0146 bool _nextFetched; 0147 bool _hasNext; 0148 0149 private: 0150 int _lineNo; 0151 UcdEntry *_entry; 0152 }; 0153 0154 template<class EntryType> 0155 class UcdParser : public UcdParserBase 0156 { 0157 public: 0158 static_assert(std::is_base_of<UcdEntry, EntryType>::value, "'EntryType' has to be derived from UcdParser::Entry"); 0159 0160 UcdParser(QIODevice *source) 0161 : UcdParserBase(source, &_typedEntry) 0162 { 0163 } 0164 0165 inline const EntryType &next() 0166 { 0167 if (!_nextFetched) 0168 fetchNext(); 0169 _nextFetched = false; 0170 return _typedEntry; 0171 } 0172 0173 private: 0174 EntryType _typedEntry; 0175 }; 0176 0177 class KIODevice : public QIODevice 0178 { 0179 public: 0180 enum Error { 0181 NoError, 0182 UnknownError, 0183 TimeoutError, 0184 UnknownHostError, 0185 MalformedUrlError, 0186 NotFoundError, 0187 }; 0188 0189 KIODevice(const QUrl &url) 0190 : _url(url) 0191 , _job(nullptr) 0192 , _error(NoError) 0193 { 0194 } 0195 0196 ~KIODevice() 0197 { 0198 close(); 0199 } 0200 0201 bool open() 0202 { 0203 if (_job) 0204 return false; 0205 0206 _job = KIO::storedGet(_url); 0207 QObject::connect(_job, &KIO::StoredTransferJob::result, _job, [&](KJob *) { 0208 if (_job->isErrorPage()) 0209 _eventLoop.exit(KIO::ERR_DOES_NOT_EXIST); 0210 else if (_job->error() != KJob::NoError) 0211 _eventLoop.exit(_job->error()); 0212 else 0213 _data = _job->data(); 0214 0215 _eventLoop.exit(KJob::NoError); 0216 }); 0217 0218 _eventLoop.exec(); 0219 switch (_job->error()) { 0220 case KJob::NoError: 0221 _error = NoError; 0222 setErrorString(QStringLiteral("")); 0223 QIODevice::open(QIODevice::ReadOnly | QIODevice::Unbuffered); 0224 break; 0225 case KJob::KilledJobError: 0226 _error = TimeoutError; 0227 break; 0228 case KIO::ERR_UNKNOWN_HOST: 0229 _error = UnknownHostError; 0230 break; 0231 case KIO::ERR_DOES_NOT_EXIST: 0232 _error = NotFoundError; 0233 break; 0234 case KIO::ERR_MALFORMED_URL: 0235 _error = MalformedUrlError; 0236 break; 0237 default: 0238 _error = UnknownError; 0239 break; 0240 } 0241 if (_error != NoError) { 0242 setErrorString(QStringLiteral("KIO: ") + _job->errorString()); 0243 delete _job; 0244 _job = nullptr; 0245 _data.clear(); 0246 } 0247 return _error == NoError; 0248 } 0249 bool open(OpenMode mode) override 0250 { 0251 Q_ASSERT(mode == QIODevice::ReadOnly); 0252 return open(); 0253 } 0254 void close() override 0255 { 0256 if (_job) { 0257 delete _job; 0258 _job = nullptr; 0259 _error = NoError; 0260 setErrorString(QStringLiteral("")); 0261 _data.clear(); 0262 QIODevice::close(); 0263 } 0264 } 0265 0266 qint64 size() const override 0267 { 0268 return _data.size(); 0269 } 0270 0271 int error() const 0272 { 0273 return _error; 0274 } 0275 void unsetError() 0276 { 0277 _error = NoError; 0278 } 0279 0280 protected: 0281 qint64 writeData(const char *, qint64) override 0282 { 0283 return -1; 0284 } 0285 qint64 readData(char *data, qint64 maxSize) override 0286 { 0287 Q_UNUSED(maxSize); 0288 Q_ASSERT(_job); 0289 Q_ASSERT(_job->error() == NoError); 0290 Q_ASSERT(data != nullptr); 0291 if (maxSize == 0 || pos() >= _data.length()) { 0292 return 0; 0293 } else if (pos() < _data.length()) { 0294 qint64 bytesToCopy = qMin(maxSize, _data.length() - pos()); 0295 memcpy(data, _data.data() + pos(), bytesToCopy); 0296 return bytesToCopy; 0297 } else { 0298 return -1; 0299 } 0300 } 0301 0302 private: 0303 QUrl _url; 0304 KIO::StoredTransferJob *_job; 0305 Error _error; 0306 QEventLoop _eventLoop; 0307 QByteArray _data; 0308 }; 0309 0310 struct CategoryProperty { 0311 enum Flag : uint32_t { 0312 Invalid = 0, 0313 #define CATEGORY_PROPERTY_VALUE(val, sym, intVal) sym = intVal, 0314 #include "properties.h" 0315 }; 0316 enum Group : uint32_t { 0317 #define CATEGORY_PROPERTY_GROUP(val, sym, intVal) sym = intVal, 0318 #include "properties.h" 0319 }; 0320 0321 CategoryProperty(uint32_t value = Unassigned) 0322 : _value(value) 0323 { 0324 } 0325 CategoryProperty(const QString &string) 0326 : _value(fromString(string)) 0327 { 0328 } 0329 operator uint32_t &() 0330 { 0331 return _value; 0332 } 0333 operator const uint32_t &() const 0334 { 0335 return _value; 0336 } 0337 bool isValid() const 0338 { 0339 return _value != Invalid; 0340 } 0341 0342 private: 0343 static uint32_t fromString(const QString &string) 0344 { 0345 static const QMap<QString, uint32_t> map = { 0346 #define CATEGORY_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym}, 0347 #include "properties.h" 0348 }; 0349 return map.contains(string) ? map[string] : uint8_t(Invalid); 0350 } 0351 uint32_t _value; 0352 }; 0353 0354 struct EastAsianWidthProperty { 0355 enum Value : uint8_t { 0356 Invalid = 0x80, 0357 #define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) sym = intVal, 0358 #include "properties.h" 0359 }; 0360 0361 EastAsianWidthProperty(uint8_t value = Neutral) 0362 : _value(value) 0363 { 0364 } 0365 EastAsianWidthProperty(const QString &string) 0366 : _value(fromString(string)) 0367 { 0368 } 0369 operator uint8_t &() 0370 { 0371 return _value; 0372 } 0373 operator const uint8_t &() const 0374 { 0375 return _value; 0376 } 0377 bool isValid() const 0378 { 0379 return _value != Invalid; 0380 } 0381 0382 private: 0383 static uint8_t fromString(const QString &string) 0384 { 0385 static const QMap<QString, Value> map = { 0386 #define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), Value::sym}, 0387 #include "properties.h" 0388 }; 0389 return map.contains(string) ? map[string] : Invalid; 0390 } 0391 uint8_t _value; 0392 }; 0393 0394 struct EmojiProperty { 0395 enum Flag : uint8_t { 0396 Invalid = 0x80, 0397 #define EMOJI_PROPERTY_VALUE(val, sym, intVal) sym = intVal, 0398 #include "properties.h" 0399 }; 0400 0401 EmojiProperty(uint8_t value = None) 0402 : _value(value) 0403 { 0404 } 0405 EmojiProperty(const QString &string) 0406 : _value(fromString(string)) 0407 { 0408 } 0409 operator uint8_t &() 0410 { 0411 return _value; 0412 } 0413 operator const uint8_t &() const 0414 { 0415 return _value; 0416 } 0417 bool isValid() const 0418 { 0419 return !(_value & Invalid); 0420 } 0421 0422 private: 0423 static uint8_t fromString(const QString &string) 0424 { 0425 static const QMap<QString, uint8_t> map = { 0426 #define EMOJI_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym}, 0427 #include "properties.h" 0428 }; 0429 return map.contains(string) ? map[string] : uint8_t(Invalid); 0430 } 0431 uint8_t _value; 0432 }; 0433 0434 struct CharacterWidth { 0435 enum Width : int8_t { 0436 Invalid = SCHAR_MIN, 0437 _VALID_START = -3, 0438 Ambiguous = -2, 0439 NonPrintable = -1, 0440 // 0 0441 // 1 0442 Unassigned = 1, 0443 // 2 0444 _VALID_END = 3, 0445 }; 0446 0447 CharacterWidth(const CharacterWidth &other) 0448 : _width(other._width) 0449 { 0450 } 0451 CharacterWidth(int8_t width = Invalid) 0452 : _width(width) 0453 { 0454 } 0455 CharacterWidth &operator=(const CharacterWidth &other) 0456 { 0457 _width = other._width; 0458 return *this; 0459 } 0460 int operator=(const int8_t width) 0461 { 0462 _width = width; 0463 return _width; 0464 } 0465 int width() const 0466 { 0467 return _width; 0468 } 0469 operator int() const 0470 { 0471 return width(); 0472 } 0473 0474 const QString toString() const 0475 { 0476 switch (_width) { 0477 case Ambiguous: 0478 return QStringLiteral("Ambiguous"); 0479 case NonPrintable: 0480 return QStringLiteral("NonPrintable"); 0481 case 0: 0482 return QStringLiteral("0"); 0483 case 1: 0484 return QStringLiteral("1"); 0485 case 2: 0486 return QStringLiteral("2"); 0487 default: 0488 case Invalid: 0489 return QStringLiteral("Invalid"); 0490 } 0491 } 0492 0493 bool isValid() const 0494 { 0495 return (_width > _VALID_START && _width < _VALID_END); 0496 }; 0497 0498 private: 0499 int8_t _width; 0500 }; 0501 0502 struct CharacterProperties { 0503 CategoryProperty category; 0504 EastAsianWidthProperty eastAsianWidth; 0505 EmojiProperty emoji; 0506 CharacterWidth customWidth; 0507 // For debug purposes in "details" output generator 0508 uint8_t widthFromPropsRule; 0509 }; 0510 0511 struct UnicodeDataEntry : public UcdEntry { 0512 enum FieldId { 0513 NameId = 0, 0514 CategoryId = 1, 0515 }; 0516 CategoryProperty category() const 0517 { 0518 return CategoryProperty(this->fields.value(CategoryId)); 0519 } 0520 }; 0521 0522 struct EastAsianWidthEntry : public UcdEntry { 0523 enum FieldId { 0524 WidthId = 0, 0525 }; 0526 EastAsianWidthProperty eastAsianWidth() const 0527 { 0528 return EastAsianWidthProperty(this->fields.value(WidthId)); 0529 } 0530 }; 0531 0532 struct EmojiDataEntry : public UcdEntry { 0533 enum FieldId { 0534 EmojiId = 0, 0535 }; 0536 EmojiProperty emoji() const 0537 { 0538 return EmojiProperty(this->fields.value(EmojiId)); 0539 } 0540 }; 0541 0542 struct GenericWidthEntry : public UcdEntry { 0543 enum FieldId { 0544 WidthId = 0, 0545 }; 0546 CharacterWidth width() const 0547 { 0548 bool ok; 0549 CharacterWidth w = this->fields.value(WidthId).toInt(&ok, 10); 0550 return (ok && w.isValid()) ? w : CharacterWidth::Invalid; 0551 } 0552 }; 0553 0554 struct WidthsRange { 0555 struct { 0556 uint first; 0557 uint last; 0558 } cp; 0559 CharacterWidth width; 0560 }; 0561 0562 QVector<WidthsRange> rangesFromWidths(const QVector<CharacterWidth> &widths, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM}) 0563 { 0564 QVector<WidthsRange> ranges; 0565 0566 if (ucsRange.second >= CODE_POINTS_NUM) 0567 ucsRange.second = widths.size() - 1; 0568 0569 uint first = ucsRange.first; 0570 for (uint cp = first + 1; cp <= uint(ucsRange.second); ++cp) { 0571 if (widths[first] != widths[cp]) { 0572 ranges.append({{first, cp - 1}, widths[cp - 1]}); 0573 first = cp; 0574 } 0575 } 0576 ranges.append({{first, uint(ucsRange.second)}, widths[ucsRange.second]}); 0577 0578 return ranges; 0579 } 0580 0581 // Real ranges look like this (each continuous letter sequence is a range): 0582 // 0583 // D D D D D D D D 8 ranges 0584 // C C C C C C CC C CC 9 ranges 0585 // BBB BBB B B BBB BBBBBB 6 ranges 0586 // A A A A 4 ranges 0587 // ∑: 27 ranges 0588 // 0589 // To reduce total ranges count, the holes in groups can be filled with ranges 0590 // from groups above them: 0591 // 0592 // D D D D D D D D 8 ranges 0593 // CCC C CCCCC CCCCCCC 4 ranges 0594 // BBBBBBB BBBBBBB BBBBBBBBBBBBBBBB 3 ranges 0595 // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 1 ranges 0596 // ∑: 16 ranges 0597 // 0598 // First range is always without change. Last range (A) can be dropped 0599 // (it always contains everything). Search should be done in order: D, C, B (A). 0600 // For simplicity the function returns all ranges, including first and last. 0601 QMap<CharacterWidth, QVector<QPair<uint, uint>>> 0602 mergedRangesFromWidths(const QVector<CharacterWidth> &widths, const QVector<CharacterWidth> widthsSortOrder, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM}) 0603 { 0604 if (ucsRange.second >= CODE_POINTS_NUM) 0605 ucsRange.second = widths.size() - 1; 0606 QVector<WidthsRange> ranges = rangesFromWidths(widths, ucsRange); 0607 QMap<CharacterWidth, QVector<QPair<uint, uint>>> mergedRanges; 0608 0609 int cmwi; // Currently Merged Width Index 0610 int sri = -1; // Start Range Index (for current width) 0611 int cri; // Current Range Index 0612 0613 // First width ranges are without change. Last one has one range spanning everything, so we can skip this 0614 for (cmwi = 1; cmwi < widthsSortOrder.size() - 1; ++cmwi) { 0615 const CharacterWidth &cmw = widthsSortOrder[cmwi]; // Currently Merged Width 0616 for (cri = 0; cri < ranges.size(); ++cri) { 0617 WidthsRange &cr = ranges[cri]; // Current Range 0618 if (cr.width == cmw) { 0619 // Range is suitable for merge 0620 if (sri < 0) { 0621 // First one, just remember it 0622 sri = cri; 0623 } else { 0624 // Merge 0625 ranges[sri].cp.last = cr.cp.last; 0626 cr.width = CharacterWidth::Invalid; 0627 } 0628 } else { 0629 // Current range has another width - can we continue merging? 0630 if (sri >= 0) { 0631 const int crwi = widthsSortOrder.indexOf(cr.width); // Current Range Width Index 0632 if (!(crwi < cmwi && crwi >= 0)) { 0633 // current range is not above currently merged width - stop merging 0634 sri = -1; 0635 } 0636 } 0637 } 0638 } 0639 } 0640 0641 for (const auto &range : std::as_const(ranges)) { 0642 if (range.width.isValid() && range.width != widthsSortOrder.last()) 0643 mergedRanges[range.width].append({range.cp.first, range.cp.last}); 0644 } 0645 mergedRanges[widthsSortOrder.last()].append({ucsRange.first, ucsRange.second}); 0646 0647 return mergedRanges; 0648 } 0649 0650 namespace generators 0651 { 0652 using GeneratorFunc = bool (*)(QTextStream &, const QVector<CharacterProperties> &, const QVector<CharacterWidth> &, const QMap<QString, QString> &); 0653 0654 bool code(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args) 0655 { 0656 static constexpr int DIRECT_LUT_SIZE = 256; 0657 0658 Q_UNUSED(props); 0659 QTextStream eout(stderr, QIODevice::WriteOnly); 0660 0661 if (args.value(QStringLiteral("param")).isEmpty()) { 0662 eout << QStringLiteral("Template file not specified.") << Qt::endl << Qt::endl; 0663 return false; 0664 } 0665 QFile templateFile(args.value(QStringLiteral("param"))); 0666 if (!templateFile.open(QIODevice::ReadOnly)) { 0667 eout << QStringLiteral("Could not open file ") << templateFile.fileName() << ": " << templateFile.errorString(); 0668 exit(1); 0669 } 0670 0671 const QString templateText = QString::fromUtf8(templateFile.readAll()); 0672 templateFile.close(); 0673 0674 Var::Map data = { 0675 {QStringLiteral("gen-file-warning"), QStringLiteral("THIS IS A GENERATED FILE. DO NOT EDIT.")}, 0676 {QStringLiteral("cmdline"), args.value(QStringLiteral("cmdline"))}, 0677 {QStringLiteral("direct-lut"), Var::Vector(DIRECT_LUT_SIZE)}, 0678 {QStringLiteral("direct-lut-size"), DIRECT_LUT_SIZE}, 0679 {QStringLiteral("ranges-luts"), Var::Vector()}, 0680 {QStringLiteral("ranges-lut-list"), Var::Vector()}, 0681 {QStringLiteral("ranges-lut-list-size"), 0}, 0682 }; 0683 0684 // Fill direct-lut with widths of 0x00-0xFF 0685 for (unsigned i = 0; i < DIRECT_LUT_SIZE; ++i) { 0686 Q_ASSERT(widths[i].isValid()); 0687 data[QStringLiteral("direct-lut")].vec[i] = int(widths[i]); 0688 } 0689 0690 static const QVector<CharacterWidth> widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1}; 0691 const QMap<CharacterWidth, QVector<QPair<uint, uint>>> mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder, {DIRECT_LUT_SIZE, CODE_POINTS_NUM}); 0692 0693 // Find last non-empty ranges lut 0694 int lastWidthId = 0; 0695 for (int wi = widthsSortOrder.size() - 1; wi > 0; --wi) { 0696 if (mergedRanges.contains(widthsSortOrder[wi])) { 0697 lastWidthId = wi; 0698 break; 0699 } 0700 } 0701 // Create ranges-luts for all widths except last non-empty one and empty ones 0702 for (int wi = 0; lastWidthId != 0 && wi < lastWidthId; ++wi) { 0703 const CharacterWidth width = widthsSortOrder[wi]; 0704 auto currentMergedRangesIt = mergedRanges.find(width); 0705 if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty()) 0706 continue; 0707 const int size = mergedRanges[width].size(); 0708 const QString name = QString(QStringLiteral("LUT_%1")).arg(width.toString().toUpper()); 0709 data[QStringLiteral("ranges-luts")].vec.append(Var::Map{ 0710 {QStringLiteral("name"), name}, 0711 {QStringLiteral("ranges"), Var::Vector()}, 0712 {QStringLiteral("size"), size}, 0713 }); 0714 data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{ 0715 {QStringLiteral("width"), int(width)}, 0716 {QStringLiteral("name"), name}, 0717 {QStringLiteral("size"), size}, 0718 }); 0719 auto ¤tLut = data[QStringLiteral("ranges-luts")].vec.last()[QStringLiteral("ranges")].vec; 0720 for (const auto &range : *currentMergedRangesIt) { 0721 Q_ASSERT(range.first <= LAST_CODE_POINT); 0722 Q_ASSERT(range.second <= LAST_CODE_POINT); 0723 currentLut.append(Var(Var::Map{{QStringLiteral("first"), range.first}, {QStringLiteral("last"), range.second}})); 0724 } 0725 } 0726 data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{ 0727 {QStringLiteral("width"), widthsSortOrder[lastWidthId].width()}, 0728 {QStringLiteral("name"), QStringLiteral("nullptr")}, 0729 {QStringLiteral("size"), 1}, 0730 }); 0731 data[QStringLiteral("ranges-lut-list-size")] = mergedRanges.size(); 0732 0733 Template t(templateText); 0734 t.parse(); 0735 out << t.generate(data); 0736 0737 return true; 0738 } 0739 0740 bool list(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args) 0741 { 0742 Q_UNUSED(props); 0743 0744 out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n"); 0745 for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) { 0746 out << QString::asprintf("%06X ; %2d\n", cp, int(widths[cp])); 0747 } 0748 0749 return true; 0750 } 0751 0752 bool ranges(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args) 0753 { 0754 Q_UNUSED(props); 0755 const auto ranges = rangesFromWidths(widths); 0756 0757 out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n"); 0758 for (const WidthsRange &range : ranges) { 0759 if (range.cp.first != range.cp.last) 0760 out << QString::asprintf("%06X..%06X ; %2d\n", range.cp.first, range.cp.last, int(range.width)); 0761 else 0762 out << QString::asprintf("%06X ; %2d\n", range.cp.first, int(range.width)); 0763 } 0764 0765 return true; 0766 } 0767 0768 bool compactRanges(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args) 0769 { 0770 Q_UNUSED(props); 0771 static const QVector<CharacterWidth> widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1}; 0772 const auto mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder); 0773 0774 out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n"); 0775 for (const int width : std::as_const(widthsSortOrder)) { 0776 const auto currentMergedRangesIt = mergedRanges.find(width); 0777 if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty()) 0778 continue; 0779 for (const auto &range : currentMergedRangesIt.value()) { 0780 if (range.first != range.second) 0781 out << QString::asprintf("%06X..%06X ; %2d\n", range.first, range.second, int(width)); 0782 else 0783 out << QString::asprintf("%06X ; %2d\n", range.first, int(width)); 0784 } 0785 } 0786 0787 return true; 0788 } 0789 0790 bool details(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args) 0791 { 0792 out.setFieldAlignment(QTextStream::AlignLeft); 0793 0794 out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n"); 0795 out << QString::asprintf("#%-5s ; %-4s ; %-8s ; %-3s ; %-2s ; %-4s ; %-4s\n", "CP", "Wdth", "Cat", "EAW", "EM", "CstW", "Rule"); 0796 QMap<CharacterWidth, uint> widthStats; 0797 for (uint cp = 0; cp <= LAST_CODE_POINT; ++cp) { 0798 out << QString::asprintf("%06X ; %4d ; %08X ; %02X ; %02X ; %4d ; %d\n", 0799 cp, 0800 int8_t(widths[cp]), 0801 uint32_t(props[cp].category), 0802 uint8_t(props[cp].eastAsianWidth), 0803 uint8_t(props[cp].emoji), 0804 int8_t(props[cp].customWidth), 0805 props[cp].widthFromPropsRule); 0806 if (!widthStats.contains(widths[cp])) 0807 widthStats.insert(widths[cp], 0); 0808 widthStats[widths[cp]]++; 0809 } 0810 QMap<CharacterWidth, uint> rangesStats; 0811 const auto ranges = rangesFromWidths(widths); 0812 for (const auto &range : ranges) { 0813 if (!rangesStats.contains(range.width)) 0814 rangesStats.insert(range.width, 0); 0815 rangesStats[range.width]++; 0816 } 0817 out << QStringLiteral("# STATS") << Qt::endl; 0818 out << QStringLiteral("#") << Qt::endl; 0819 out << QStringLiteral("# Characters count for each width:") << Qt::endl; 0820 for (auto wi = widthStats.constBegin(); wi != widthStats.constEnd(); ++wi) { 0821 out << QString::asprintf("# %2d: %7d\n", int(wi.key()), widthStats[wi.key()]); 0822 } 0823 out << QStringLiteral("#") << Qt::endl; 0824 out << QStringLiteral("# Ranges count for each width:") << Qt::endl; 0825 int howmany = 0; 0826 for (auto wi = rangesStats.constBegin(); wi != rangesStats.constEnd(); ++wi) { 0827 if (howmany >= 20) 0828 break; 0829 howmany++; 0830 out << QString::asprintf("# %2d: %7d\n", int(wi.key()), rangesStats[wi.key()]); 0831 } 0832 0833 return true; 0834 } 0835 } // namespace generators 0836 0837 template<class EntryType> 0838 static void processInputFiles(QVector<CharacterProperties> &props, 0839 const QStringList &files, 0840 const QString &fileTypeName, 0841 void (*cb)(CharacterProperties &prop, const EntryType &entry)) 0842 { 0843 static const QRegularExpression PROTOCOL_RE(QStringLiteral(R"#(^[a-z]+://)#")); 0844 for (const QString &fileName : files) { 0845 qInfo().noquote() << QStringLiteral("Parsing as %1: %2").arg(fileTypeName).arg(fileName); 0846 QSharedPointer<QIODevice> source = nullptr; 0847 if (PROTOCOL_RE.match(fileName).hasMatch()) { 0848 source.reset(new KIODevice(QUrl(fileName))); 0849 } else { 0850 source.reset(new QFile(fileName)); 0851 } 0852 0853 if (!source->open(QIODevice::ReadOnly)) { 0854 qCritical() << QStringLiteral("Could not open %1: %2").arg(fileName).arg(source->errorString()); 0855 exit(1); 0856 } 0857 UcdParser<EntryType> p(source.data()); 0858 while (p.hasNext()) { 0859 const auto &e = p.next(); 0860 for (uint cp = e.cp.first; cp <= e.cp.last; ++cp) { 0861 cb(props[cp], e); 0862 } 0863 } 0864 } 0865 } 0866 0867 static const QString escapeCmdline(const QStringList &args) 0868 { 0869 static QString cmdline = QString(); 0870 if (!cmdline.isEmpty()) 0871 return cmdline; 0872 0873 QTextStream stream(&cmdline, QIODevice::WriteOnly); 0874 0875 // basename for command name 0876 stream << QFileInfo(args[0]).baseName(); 0877 for (auto it = args.begin() + 1; it != args.end(); ++it) { 0878 if (!it->startsWith(QLatin1Char('-'))) 0879 stream << QStringLiteral(" \"") << QString(*it).replace(QRegularExpression(QStringLiteral(R"(["`$\\])")), QStringLiteral(R"(\\\1)")) << '"'; 0880 else 0881 stream << ' ' << *it; 0882 } 0883 stream.flush(); 0884 return cmdline; 0885 } 0886 0887 enum ConvertOptions { 0888 AmbiguousWidthOpt = 0, 0889 EmojiOpt = 1, 0890 }; 0891 0892 // Character width assignment 0893 // 0894 // Rules (from highest to lowest priority): 0895 // 0896 // * Local overlay 0897 // * (not implemented) Character unique properties described in The Unicode Standard, Version 10.0 0898 // * Unicode category Cc, Cs: -1 0899 // * Emoji: 2 0900 // * Unicode category Mn, Me, Cf: 0 0901 // * East Asian Width W, F: 2 0902 // * East Asian Width H, N, Na: 1 0903 // * East Asian Width A: (varies) 0904 // * Unassigned/Undefined/Private Use: 1 0905 // 0906 // The list is loosely based on character width implementations in Vim 8.1 0907 // and glibc 2.27. There are a few cases which could look better 0908 // (decomposed Hangul, emoji with modifiers, etc) with different widths, 0909 // but interactive terminal programs (at least vim, zsh, everything based 0910 // on glibc's wcwidth) would see their width as it is implemented now. 0911 static inline CharacterWidth widthFromProps(const CharacterProperties &props, uint cp, const QMap<ConvertOptions, int> &convertOpts) 0912 { 0913 CharacterWidth cw; 0914 auto &widthFromPropsRule = const_cast<uint8_t &>(props.widthFromPropsRule); 0915 if (props.customWidth.isValid()) { 0916 widthFromPropsRule = 1; 0917 cw = props.customWidth; 0918 0919 } else if ((CategoryProperty::Control | CategoryProperty::Surrogate) & props.category) { 0920 widthFromPropsRule = 2; 0921 cw = CharacterWidth::NonPrintable; 0922 0923 } else if (convertOpts[EmojiOpt] & props.emoji && !(EmojiProperty::EmojiComponent & props.emoji)) { 0924 widthFromPropsRule = 3; 0925 cw = 2; 0926 0927 } else if ((CategoryProperty::NonspacingMark | CategoryProperty::EnclosingMark | CategoryProperty::Format) & props.category) { 0928 widthFromPropsRule = 4; 0929 cw = 0; 0930 0931 } else if ((EastAsianWidthProperty::Wide | EastAsianWidthProperty::Fullwidth) & props.eastAsianWidth) { 0932 widthFromPropsRule = 5; 0933 cw = 2; 0934 0935 } else if ((EastAsianWidthProperty::Halfwidth | EastAsianWidthProperty::Neutral | EastAsianWidthProperty::Narrow) & props.eastAsianWidth) { 0936 widthFromPropsRule = 6; 0937 cw = 1; 0938 0939 } else if ((CategoryProperty::Unassigned | CategoryProperty::PrivateUse) & props.category) { 0940 widthFromPropsRule = 7; 0941 cw = CharacterWidth::Unassigned; 0942 0943 } else if ((EastAsianWidthProperty::Ambiguous)&props.eastAsianWidth) { 0944 widthFromPropsRule = 8; 0945 cw = convertOpts[AmbiguousWidthOpt]; 0946 0947 } else if (!props.category.isValid()) { 0948 widthFromPropsRule = 9; 0949 qWarning() << QStringLiteral("Code point U+%1 has invalid category - this should not happen. Assuming \"unassigned\"").arg(cp, 4, 16, QLatin1Char('0')); 0950 cw = CharacterWidth::Unassigned; 0951 0952 } else { 0953 widthFromPropsRule = 10; 0954 qWarning() 0955 << QStringLiteral("Code point U+%1 not classified - this should not happen. Assuming non-printable character").arg(cp, 4, 16, QLatin1Char('0')); 0956 cw = CharacterWidth::NonPrintable; 0957 } 0958 0959 return cw; 0960 } 0961 0962 int main(int argc, char *argv[]) 0963 { 0964 static const QMap<QString, generators::GeneratorFunc> GENERATOR_FUNCS_MAP = { 0965 {QStringLiteral("code"), generators::code}, 0966 {QStringLiteral("compact-ranges"), generators::compactRanges}, 0967 {QStringLiteral("ranges"), generators::ranges}, 0968 {QStringLiteral("list"), generators::list}, 0969 {QStringLiteral("details"), generators::details}, 0970 {QStringLiteral("dummy"), 0971 [](QTextStream &, const QVector<CharacterProperties> &, const QVector<CharacterWidth> &, const QMap<QString, QString> &) -> bool { 0972 return true; 0973 }}, 0974 }; 0975 qSetMessagePattern(QStringLiteral("%{message}")); 0976 0977 QCoreApplication app(argc, argv); 0978 QCommandLineParser parser; 0979 parser.setApplicationDescription(QStringLiteral("\nUCD files to characters widths converter.\n")); 0980 parser.addHelpOption(); 0981 parser.addOptions({ 0982 {{QStringLiteral("U"), QStringLiteral("unicode-data")}, QStringLiteral("Path or URL to UnicodeData.txt."), QStringLiteral("URL|file")}, 0983 {{QStringLiteral("A"), QStringLiteral("east-asian-width")}, QStringLiteral("Path or URL to EastAsianWidth.txt."), QStringLiteral("URL|file")}, 0984 {{QStringLiteral("E"), QStringLiteral("emoji-data")}, QStringLiteral("Path or URL to emoji-data.txt."), QStringLiteral("URL|file")}, 0985 {{QStringLiteral("W"), QStringLiteral("generic-width")}, 0986 QStringLiteral("Path or URL to generic file with width data. Accepts output from compact-ranges, ranges, list and details generator."), 0987 QStringLiteral("URL|file")}, 0988 0989 {QStringLiteral("ambiguous-width"), 0990 QStringLiteral("Ambiguous characters width."), 0991 QStringLiteral("separate|1|2"), 0992 QString(QStringLiteral("%1")).arg(CharacterWidth::Ambiguous)}, 0993 {QStringLiteral("emoji"), 0994 QStringLiteral("Which emoji emoji subset is treated as emoji."), 0995 QStringLiteral("all|presentation"), 0996 QStringLiteral("presentation")}, 0997 0998 {{QStringLiteral("g"), QStringLiteral("generator")}, 0999 QStringLiteral("Output generator (use \"-\" to list available generators). The code generator requires path to a template file."), 1000 QStringLiteral("generator[:template]"), 1001 QStringLiteral("details")}, 1002 }); 1003 parser.addPositionalArgument(QStringLiteral("output"), QStringLiteral("Output file (leave empty for stdout).")); 1004 parser.process(app); 1005 1006 const QStringList unicodeDataFiles = parser.values(QStringLiteral("unicode-data")); 1007 const QStringList eastAsianWidthFiles = parser.values(QStringLiteral("east-asian-width")); 1008 const QStringList emojiDataFiles = parser.values(QStringLiteral("emoji-data")); 1009 const QStringList genericWidthFiles = parser.values(QStringLiteral("generic-width")); 1010 const QString ambiguousWidthStr = parser.value(QStringLiteral("ambiguous-width")); 1011 const QString emojiStr = parser.value(QStringLiteral("emoji")); 1012 const QString generator = parser.value(QStringLiteral("generator")); 1013 const QString outputFileName = parser.positionalArguments().value(0); 1014 1015 QTextStream eout(stderr, QIODevice::WriteOnly); 1016 if (unicodeDataFiles.isEmpty() && eastAsianWidthFiles.isEmpty() && emojiDataFiles.isEmpty() && genericWidthFiles.isEmpty()) { 1017 eout << QStringLiteral("Input files not specified.") << Qt::endl << Qt::endl; 1018 parser.showHelp(1); 1019 } 1020 1021 static QMap<ConvertOptions, int> convertOpts = { 1022 {AmbiguousWidthOpt, CharacterWidth::Ambiguous}, 1023 {EmojiOpt, EmojiProperty::EmojiPresentation}, 1024 }; 1025 1026 if (emojiStr == QStringLiteral("presentation")) 1027 convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation; 1028 else if (emojiStr == QStringLiteral("all")) 1029 convertOpts[EmojiOpt] = EmojiProperty::Emoji; 1030 else { 1031 convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation; 1032 qWarning() << QStringLiteral("invalid emoji option value: %1. Assuming \"presentation\".").arg(emojiStr); 1033 } 1034 1035 if (ambiguousWidthStr == QStringLiteral("separate")) 1036 convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous; 1037 else if (ambiguousWidthStr == QStringLiteral("1")) 1038 convertOpts[AmbiguousWidthOpt] = 1; 1039 else if (ambiguousWidthStr == QStringLiteral("2")) 1040 convertOpts[AmbiguousWidthOpt] = 2; 1041 else { 1042 convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous; 1043 qWarning() << QStringLiteral("Invalid ambiguous-width option value: %1. Assuming \"separate\".").arg(emojiStr); 1044 } 1045 1046 const int sepPos = generator.indexOf(QLatin1Char(':')); 1047 const auto generatorName = generator.left(sepPos); 1048 const auto generatorParam = sepPos >= 0 ? generator.mid(sepPos + 1) : QString(); 1049 1050 if (!GENERATOR_FUNCS_MAP.contains(generatorName)) { 1051 int status = 0; 1052 if (generatorName != QStringLiteral("-")) { 1053 status = 1; 1054 eout << QStringLiteral("Invalid output generator. Available generators:") << Qt::endl; 1055 } 1056 1057 for (auto it = GENERATOR_FUNCS_MAP.constBegin(); it != GENERATOR_FUNCS_MAP.constEnd(); ++it) { 1058 eout << it.key() << Qt::endl; 1059 } 1060 exit(status); 1061 } 1062 auto generatorFunc = GENERATOR_FUNCS_MAP[generatorName]; 1063 1064 QFile outFile; 1065 if (!outputFileName.isEmpty()) { 1066 outFile.setFileName(outputFileName); 1067 if (!outFile.open(QIODevice::WriteOnly)) { 1068 eout << QStringLiteral("Could not open file ") << outputFileName << QStringLiteral(": ") << outFile.errorString() << Qt::endl; 1069 exit(1); 1070 } 1071 } else { 1072 outFile.open(stdout, QIODevice::WriteOnly); 1073 } 1074 QTextStream out(&outFile); 1075 1076 QVector<CharacterProperties> props(CODE_POINTS_NUM); 1077 1078 processInputFiles<UnicodeDataEntry>(props, 1079 unicodeDataFiles, 1080 QStringLiteral("UnicodeData.txt"), 1081 [](CharacterProperties &prop, const UnicodeDataEntry &entry) { 1082 prop.category = entry.category(); 1083 }); 1084 1085 processInputFiles<EastAsianWidthEntry>(props, 1086 eastAsianWidthFiles, 1087 QStringLiteral("EastAsianWidth.txt"), 1088 [](CharacterProperties &prop, const EastAsianWidthEntry &entry) { 1089 prop.eastAsianWidth = entry.eastAsianWidth(); 1090 }); 1091 1092 processInputFiles<EmojiDataEntry>(props, emojiDataFiles, QStringLiteral("emoji-data.txt"), [](CharacterProperties &prop, const EmojiDataEntry &entry) { 1093 prop.emoji |= entry.emoji(); 1094 }); 1095 1096 processInputFiles<GenericWidthEntry>(props, 1097 genericWidthFiles, 1098 QStringLiteral("generic width data"), 1099 [](CharacterProperties &prop, const GenericWidthEntry &entry) { 1100 prop.customWidth = entry.width(); 1101 }); 1102 1103 qInfo() << "Generating character width data"; 1104 QVector<CharacterWidth> widths(CODE_POINTS_NUM); 1105 widths[0] = 0; // NULL character always has width 0 1106 for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) { 1107 widths[cp] = widthFromProps(props[cp], cp, convertOpts); 1108 } 1109 1110 const QMap<QString, QString> generatorArgs = { 1111 {QStringLiteral("cmdline"), escapeCmdline(app.arguments())}, 1112 {QStringLiteral("param"), generatorParam}, 1113 {QStringLiteral("output"), outputFileName.isEmpty() ? QStringLiteral("<stdout>") : outputFileName}, 1114 }; 1115 1116 qInfo() << "Generating output"; 1117 if (!generatorFunc(out, props, widths, generatorArgs)) { 1118 parser.showHelp(1); 1119 } 1120 1121 return 0; 1122 }