File indexing completed on 2024-05-05 05:54:13

0001 /*
0002     This file is part of Konsole, a terminal emulator for KDE.
0003 
0004     SPDX-FileCopyrightText: 2018 Mariusz Glebocki <mglb@arccos-1.net>
0005 
0006     SPDX-License-Identifier: GPL-2.0-or-later
0007 */
0008 
0009 #include "template.h"
0010 #include <QCommandLineParser>
0011 #include <QCoreApplication>
0012 #include <QEventLoop>
0013 #include <QFile>
0014 #include <QFileInfo>
0015 #include <QLoggingCategory>
0016 #include <QMap>
0017 #include <QRegularExpression>
0018 #include <QRegularExpressionMatch>
0019 #include <QString>
0020 #include <QTextStream>
0021 
0022 #include <KIO/Job>
0023 
// Number of code points handled by this tool (values 0x000000..0x10FFFF).
static constexpr unsigned int CODE_POINTS_NUM = 0x110000;
// Highest handled code point value.
static constexpr unsigned int LAST_CODE_POINT = CODE_POINTS_NUM - 1;
0026 
// One record produced by the parser: a code point range and its data fields.
struct UcdEntry {
    // Code point range of the record. Both ends are inclusive; the parser
    // sets last == first for records that name a single code point.
    struct {
        uint first;
        uint last;
    } cp;
    // Data fields that follow the code point column, in the order they
    // appear in the line (index 0 is the first field after the code point).
    QStringList fields;
};
0034 
0035 class UcdParserBase
0036 {
0037 public:
0038     ~UcdParserBase()
0039     {
0040         _source->close();
0041     }
0042 
0043     bool hasNext()
0044     {
0045         bool hadNext = _hasNext;
0046         if (!_nextFetched) {
0047             _hasNext = fetchNext();
0048             _nextFetched = true;
0049         }
0050         return hadNext;
0051     }
0052 
0053 protected:
0054     UcdParserBase(QIODevice *source, UcdEntry *entry)
0055         : _source(source)
0056         , _nextFetched(false)
0057         , _hasNext(true)
0058         , _lineNo(0)
0059         , _entry(entry)
0060     {
0061         Q_ASSERT(_source);
0062         Q_ASSERT(_entry);
0063     }
0064 
0065     bool fetchNext()
0066     {
0067         Q_ASSERT(_source->isOpen());
0068         if (!_source->isOpen())
0069             return false;
0070 
0071         static const QRegularExpression ENTRY_RE = QRegularExpression(QStringLiteral(
0072             // Match 1: "cp1" - first CP / "cp2" (optional) - last CP
0073             R"#((?:^(?<cp1>[[:xdigit:]]+)(?:\.\.(?<cp2>[[:xdigit:]]+))?[ \t]*;)#"
0074             // Match 1: "field0" - first data field"
0075             //          "udRangeInd" (UnicodeData.txt only) - if present, the line is either first or last line of a range
0076             R"#([ \t]*(?<field0>[^#;\n]*?(?:, (?<udRangeInd>First|Last)>)?)[ \t]*(?:;|(?:\#.*)?$))|)#"
0077             // Match 2..n: "field" - n-th field
0078             R"#((?:\G(?<=;)[ \t]*(?<field>[^#;\n]*?)[ \t]*(?:;|(?:#.*)?$)))#"));
0079         static const QRegularExpression UD_RANGE_IND_RE(QStringLiteral(", (First|Last)"));
0080         static const QRegularExpression COMMENT_RE(QStringLiteral("^[ \t]*(#.*)?$"));
0081 
0082         QString line;
0083         bool ok;
0084         _entry->fields.clear();
0085         while (!_source->atEnd()) {
0086             line = QString::fromUtf8(_source->readLine());
0087             _lineNo++;
0088             auto mit = ENTRY_RE.globalMatch(line);
0089             if (!mit.hasNext()) {
0090                 // Do not complain about comments and empty lines
0091                 if (!COMMENT_RE.match(line).hasMatch())
0092                     qDebug() << QStringLiteral("Line %1: does not match - skipping").arg(_lineNo);
0093                 continue;
0094             }
0095 
0096             auto match = mit.next();
0097             _entry->cp.first = match.captured(QStringLiteral("cp1")).toUInt(&ok, 16);
0098             if (!ok) {
0099                 qDebug() << QStringLiteral("Line %d Invalid cp1 - skipping").arg(_lineNo);
0100                 continue;
0101             }
0102             _entry->cp.last = match.captured(QStringLiteral("cp2")).toUInt(&ok, 16);
0103             if (!ok) {
0104                 _entry->cp.last = _entry->cp.first;
0105             }
0106             QString field0 = match.captured(QStringLiteral("field0"));
0107             if (field0.isNull()) {
0108                 qDebug() << QStringLiteral("Line %d: Missing field0 - skipping").arg(_lineNo);
0109                 continue;
0110             }
0111             if (!match.captured(QStringLiteral("udRangeInd")).isNull()) {
0112                 if (match.captured(QStringLiteral("udRangeInd")) == QStringLiteral("First")) {
0113                     // Fetch next valid line, as it pairs with the current one to form a range
0114                     QRegularExpressionMatch nlMatch;
0115                     int firstLineNo = _lineNo;
0116                     while (!_source->atEnd() && !nlMatch.hasMatch()) {
0117                         line = QString::fromUtf8(_source->readLine());
0118                         _lineNo++;
0119                         nlMatch = ENTRY_RE.match(line);
0120                         if (!nlMatch.hasMatch()) {
0121                             qDebug() << QStringLiteral("Line %d: does not match - skipping").arg(_lineNo);
0122                         }
0123                     }
0124                     if (nlMatch.hasMatch()) {
0125                         _entry->cp.last = nlMatch.captured(QStringLiteral("cp1")).toUInt(&ok, 16);
0126                         if (!ok) {
0127                             qDebug() << QStringLiteral("Line %1-%2: Missing or invalid second cp1 (\"Last\" entry) - skipping").arg(firstLineNo).arg(_lineNo);
0128                             continue;
0129                         }
0130                     }
0131                 }
0132                 field0.remove(UD_RANGE_IND_RE);
0133             }
0134             _entry->fields.append(field0);
0135 
0136             while (mit.hasNext()) {
0137                 _entry->fields.append(mit.next().captured(QStringLiteral("field")));
0138             }
0139 
0140             return !_source->atEnd();
0141         }
0142         return false;
0143     }
0144 
0145     QIODevice *_source;
0146     bool _nextFetched;
0147     bool _hasNext;
0148 
0149 private:
0150     int _lineNo;
0151     UcdEntry *_entry;
0152 };
0153 
0154 template<class EntryType>
0155 class UcdParser : public UcdParserBase
0156 {
0157 public:
0158     static_assert(std::is_base_of<UcdEntry, EntryType>::value, "'EntryType' has to be derived from UcdParser::Entry");
0159 
0160     UcdParser(QIODevice *source)
0161         : UcdParserBase(source, &_typedEntry)
0162     {
0163     }
0164 
0165     inline const EntryType &next()
0166     {
0167         if (!_nextFetched)
0168             fetchNext();
0169         _nextFetched = false;
0170         return _typedEntry;
0171     }
0172 
0173 private:
0174     EntryType _typedEntry;
0175 };
0176 
0177 class KIODevice : public QIODevice
0178 {
0179 public:
0180     enum Error {
0181         NoError,
0182         UnknownError,
0183         TimeoutError,
0184         UnknownHostError,
0185         MalformedUrlError,
0186         NotFoundError,
0187     };
0188 
0189     KIODevice(const QUrl &url)
0190         : _url(url)
0191         , _job(nullptr)
0192         , _error(NoError)
0193     {
0194     }
0195 
0196     ~KIODevice()
0197     {
0198         close();
0199     }
0200 
0201     bool open()
0202     {
0203         if (_job)
0204             return false;
0205 
0206         _job = KIO::storedGet(_url);
0207         QObject::connect(_job, &KIO::StoredTransferJob::result, _job, [&](KJob *) {
0208             if (_job->isErrorPage())
0209                 _eventLoop.exit(KIO::ERR_DOES_NOT_EXIST);
0210             else if (_job->error() != KJob::NoError)
0211                 _eventLoop.exit(_job->error());
0212             else
0213                 _data = _job->data();
0214 
0215             _eventLoop.exit(KJob::NoError);
0216         });
0217 
0218         _eventLoop.exec();
0219         switch (_job->error()) {
0220         case KJob::NoError:
0221             _error = NoError;
0222             setErrorString(QStringLiteral(""));
0223             QIODevice::open(QIODevice::ReadOnly | QIODevice::Unbuffered);
0224             break;
0225         case KJob::KilledJobError:
0226             _error = TimeoutError;
0227             break;
0228         case KIO::ERR_UNKNOWN_HOST:
0229             _error = UnknownHostError;
0230             break;
0231         case KIO::ERR_DOES_NOT_EXIST:
0232             _error = NotFoundError;
0233             break;
0234         case KIO::ERR_MALFORMED_URL:
0235             _error = MalformedUrlError;
0236             break;
0237         default:
0238             _error = UnknownError;
0239             break;
0240         }
0241         if (_error != NoError) {
0242             setErrorString(QStringLiteral("KIO: ") + _job->errorString());
0243             delete _job;
0244             _job = nullptr;
0245             _data.clear();
0246         }
0247         return _error == NoError;
0248     }
0249     bool open(OpenMode mode) override
0250     {
0251         Q_ASSERT(mode == QIODevice::ReadOnly);
0252         return open();
0253     }
0254     void close() override
0255     {
0256         if (_job) {
0257             delete _job;
0258             _job = nullptr;
0259             _error = NoError;
0260             setErrorString(QStringLiteral(""));
0261             _data.clear();
0262             QIODevice::close();
0263         }
0264     }
0265 
0266     qint64 size() const override
0267     {
0268         return _data.size();
0269     }
0270 
0271     int error() const
0272     {
0273         return _error;
0274     }
0275     void unsetError()
0276     {
0277         _error = NoError;
0278     }
0279 
0280 protected:
0281     qint64 writeData(const char *, qint64) override
0282     {
0283         return -1;
0284     }
0285     qint64 readData(char *data, qint64 maxSize) override
0286     {
0287         Q_UNUSED(maxSize);
0288         Q_ASSERT(_job);
0289         Q_ASSERT(_job->error() == NoError);
0290         Q_ASSERT(data != nullptr);
0291         if (maxSize == 0 || pos() >= _data.length()) {
0292             return 0;
0293         } else if (pos() < _data.length()) {
0294             qint64 bytesToCopy = qMin(maxSize, _data.length() - pos());
0295             memcpy(data, _data.data() + pos(), bytesToCopy);
0296             return bytesToCopy;
0297         } else {
0298             return -1;
0299         }
0300     }
0301 
0302 private:
0303     QUrl _url;
0304     KIO::StoredTransferJob *_job;
0305     Error _error;
0306     QEventLoop _eventLoop;
0307     QByteArray _data;
0308 };
0309 
0310 struct CategoryProperty {
0311     enum Flag : uint32_t {
0312         Invalid = 0,
0313 #define CATEGORY_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
0314 #include "properties.h"
0315     };
0316     enum Group : uint32_t {
0317 #define CATEGORY_PROPERTY_GROUP(val, sym, intVal) sym = intVal,
0318 #include "properties.h"
0319     };
0320 
0321     CategoryProperty(uint32_t value = Unassigned)
0322         : _value(value)
0323     {
0324     }
0325     CategoryProperty(const QString &string)
0326         : _value(fromString(string))
0327     {
0328     }
0329     operator uint32_t &()
0330     {
0331         return _value;
0332     }
0333     operator const uint32_t &() const
0334     {
0335         return _value;
0336     }
0337     bool isValid() const
0338     {
0339         return _value != Invalid;
0340     }
0341 
0342 private:
0343     static uint32_t fromString(const QString &string)
0344     {
0345         static const QMap<QString, uint32_t> map = {
0346 #define CATEGORY_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym},
0347 #include "properties.h"
0348         };
0349         return map.contains(string) ? map[string] : uint8_t(Invalid);
0350     }
0351     uint32_t _value;
0352 };
0353 
0354 struct EastAsianWidthProperty {
0355     enum Value : uint8_t {
0356         Invalid = 0x80,
0357 #define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
0358 #include "properties.h"
0359     };
0360 
0361     EastAsianWidthProperty(uint8_t value = Neutral)
0362         : _value(value)
0363     {
0364     }
0365     EastAsianWidthProperty(const QString &string)
0366         : _value(fromString(string))
0367     {
0368     }
0369     operator uint8_t &()
0370     {
0371         return _value;
0372     }
0373     operator const uint8_t &() const
0374     {
0375         return _value;
0376     }
0377     bool isValid() const
0378     {
0379         return _value != Invalid;
0380     }
0381 
0382 private:
0383     static uint8_t fromString(const QString &string)
0384     {
0385         static const QMap<QString, Value> map = {
0386 #define EAST_ASIAN_WIDTH_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), Value::sym},
0387 #include "properties.h"
0388         };
0389         return map.contains(string) ? map[string] : Invalid;
0390     }
0391     uint8_t _value;
0392 };
0393 
0394 struct EmojiProperty {
0395     enum Flag : uint8_t {
0396         Invalid = 0x80,
0397 #define EMOJI_PROPERTY_VALUE(val, sym, intVal) sym = intVal,
0398 #include "properties.h"
0399     };
0400 
0401     EmojiProperty(uint8_t value = None)
0402         : _value(value)
0403     {
0404     }
0405     EmojiProperty(const QString &string)
0406         : _value(fromString(string))
0407     {
0408     }
0409     operator uint8_t &()
0410     {
0411         return _value;
0412     }
0413     operator const uint8_t &() const
0414     {
0415         return _value;
0416     }
0417     bool isValid() const
0418     {
0419         return !(_value & Invalid);
0420     }
0421 
0422 private:
0423     static uint8_t fromString(const QString &string)
0424     {
0425         static const QMap<QString, uint8_t> map = {
0426 #define EMOJI_PROPERTY_VALUE(val, sym, intVal) {QStringLiteral(#val), sym},
0427 #include "properties.h"
0428         };
0429         return map.contains(string) ? map[string] : uint8_t(Invalid);
0430     }
0431     uint8_t _value;
0432 };
0433 
0434 struct CharacterWidth {
0435     enum Width : int8_t {
0436         Invalid = SCHAR_MIN,
0437         _VALID_START = -3,
0438         Ambiguous = -2,
0439         NonPrintable = -1,
0440         // 0
0441         // 1
0442         Unassigned = 1,
0443         // 2
0444         _VALID_END = 3,
0445     };
0446 
0447     CharacterWidth(const CharacterWidth &other)
0448         : _width(other._width)
0449     {
0450     }
0451     CharacterWidth(int8_t width = Invalid)
0452         : _width(width)
0453     {
0454     }
0455     CharacterWidth &operator=(const CharacterWidth &other)
0456     {
0457         _width = other._width;
0458         return *this;
0459     }
0460     int operator=(const int8_t width)
0461     {
0462         _width = width;
0463         return _width;
0464     }
0465     int width() const
0466     {
0467         return _width;
0468     }
0469     operator int() const
0470     {
0471         return width();
0472     }
0473 
0474     const QString toString() const
0475     {
0476         switch (_width) {
0477         case Ambiguous:
0478             return QStringLiteral("Ambiguous");
0479         case NonPrintable:
0480             return QStringLiteral("NonPrintable");
0481         case 0:
0482             return QStringLiteral("0");
0483         case 1:
0484             return QStringLiteral("1");
0485         case 2:
0486             return QStringLiteral("2");
0487         default:
0488         case Invalid:
0489             return QStringLiteral("Invalid");
0490         }
0491     }
0492 
0493     bool isValid() const
0494     {
0495         return (_width > _VALID_START && _width < _VALID_END);
0496     };
0497 
0498 private:
0499     int8_t _width;
0500 };
0501 
0502 struct CharacterProperties {
0503     CategoryProperty category;
0504     EastAsianWidthProperty eastAsianWidth;
0505     EmojiProperty emoji;
0506     CharacterWidth customWidth;
0507     // For debug purposes in "details" output generator
0508     uint8_t widthFromPropsRule;
0509 };
0510 
0511 struct UnicodeDataEntry : public UcdEntry {
0512     enum FieldId {
0513         NameId = 0,
0514         CategoryId = 1,
0515     };
0516     CategoryProperty category() const
0517     {
0518         return CategoryProperty(this->fields.value(CategoryId));
0519     }
0520 };
0521 
0522 struct EastAsianWidthEntry : public UcdEntry {
0523     enum FieldId {
0524         WidthId = 0,
0525     };
0526     EastAsianWidthProperty eastAsianWidth() const
0527     {
0528         return EastAsianWidthProperty(this->fields.value(WidthId));
0529     }
0530 };
0531 
0532 struct EmojiDataEntry : public UcdEntry {
0533     enum FieldId {
0534         EmojiId = 0,
0535     };
0536     EmojiProperty emoji() const
0537     {
0538         return EmojiProperty(this->fields.value(EmojiId));
0539     }
0540 };
0541 
0542 struct GenericWidthEntry : public UcdEntry {
0543     enum FieldId {
0544         WidthId = 0,
0545     };
0546     CharacterWidth width() const
0547     {
0548         bool ok;
0549         CharacterWidth w = this->fields.value(WidthId).toInt(&ok, 10);
0550         return (ok && w.isValid()) ? w : CharacterWidth::Invalid;
0551     }
0552 };
0553 
// Run of consecutive code points sharing one width.
struct WidthsRange {
    // First and last code point of the run, both inclusive
    struct {
        uint first;
        uint last;
    } cp;
    // Width shared by every code point of the run
    CharacterWidth width;
};
0561 
0562 QVector<WidthsRange> rangesFromWidths(const QVector<CharacterWidth> &widths, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM})
0563 {
0564     QVector<WidthsRange> ranges;
0565 
0566     if (ucsRange.second >= CODE_POINTS_NUM)
0567         ucsRange.second = widths.size() - 1;
0568 
0569     uint first = ucsRange.first;
0570     for (uint cp = first + 1; cp <= uint(ucsRange.second); ++cp) {
0571         if (widths[first] != widths[cp]) {
0572             ranges.append({{first, cp - 1}, widths[cp - 1]});
0573             first = cp;
0574         }
0575     }
0576     ranges.append({{first, uint(ucsRange.second)}, widths[ucsRange.second]});
0577 
0578     return ranges;
0579 }
0580 
0581 // Real ranges look like this (each continuous letter sequence is a range):
0582 //
0583 //     D    D D D   D D        D D                   8 ranges
0584 //         C C   C C C C     CC C CC                 9 ranges
0585 //  BBB BBB       B     B BBB       BBBBBB           6 ranges
0586 // A           A         A                A          4 ranges
0587 //                                               ∑: 27 ranges
0588 //
0589 // To reduce total ranges count, the holes in groups can be filled with ranges
0590 // from groups above them:
0591 //
0592 //     D    D D D   D D        D D                   8 ranges
0593 //         CCC   C CCCCC     CCCCCCC                 4 ranges
0594 //  BBBBBBB       BBBBBBB BBBBBBBBBBBBBBBB           3 ranges
0595 // AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA          1 ranges
0596 //                                               ∑: 16 ranges
0597 //
0598 // First range is always without change. Last range (A) can be dropped
0599 // (it always contains everything). Search should be done in order: D, C, B (A).
0600 // For simplicity the function returns all ranges, including first and last.
0601 QMap<CharacterWidth, QVector<QPair<uint, uint>>>
0602 mergedRangesFromWidths(const QVector<CharacterWidth> &widths, const QVector<CharacterWidth> widthsSortOrder, QPair<uint, uint> ucsRange = {0, CODE_POINTS_NUM})
0603 {
0604     if (ucsRange.second >= CODE_POINTS_NUM)
0605         ucsRange.second = widths.size() - 1;
0606     QVector<WidthsRange> ranges = rangesFromWidths(widths, ucsRange);
0607     QMap<CharacterWidth, QVector<QPair<uint, uint>>> mergedRanges;
0608 
0609     int cmwi; // Currently Merged Width Index
0610     int sri = -1; // Start Range Index (for current width)
0611     int cri; // Current Range Index
0612 
0613     // First width ranges are without change. Last one has one range spanning everything, so we can skip this
0614     for (cmwi = 1; cmwi < widthsSortOrder.size() - 1; ++cmwi) {
0615         const CharacterWidth &cmw = widthsSortOrder[cmwi]; // Currently Merged Width
0616         for (cri = 0; cri < ranges.size(); ++cri) {
0617             WidthsRange &cr = ranges[cri]; // Current Range
0618             if (cr.width == cmw) {
0619                 // Range is suitable for merge
0620                 if (sri < 0) {
0621                     // First one, just remember it
0622                     sri = cri;
0623                 } else {
0624                     // Merge
0625                     ranges[sri].cp.last = cr.cp.last;
0626                     cr.width = CharacterWidth::Invalid;
0627                 }
0628             } else {
0629                 // Current range has another width - can we continue merging?
0630                 if (sri >= 0) {
0631                     const int crwi = widthsSortOrder.indexOf(cr.width); // Current Range Width Index
0632                     if (!(crwi < cmwi && crwi >= 0)) {
0633                         // current range is not above currently merged width - stop merging
0634                         sri = -1;
0635                     }
0636                 }
0637             }
0638         }
0639     }
0640 
0641     for (const auto &range : std::as_const(ranges)) {
0642         if (range.width.isValid() && range.width != widthsSortOrder.last())
0643             mergedRanges[range.width].append({range.cp.first, range.cp.last});
0644     }
0645     mergedRanges[widthsSortOrder.last()].append({ucsRange.first, ucsRange.second});
0646 
0647     return mergedRanges;
0648 }
0649 
0650 namespace generators
0651 {
// Common signature of the output generators below: output stream, properties
// and widths indexed by code point, string arguments. The generators return
// true on success.
using GeneratorFunc = bool (*)(QTextStream &, const QVector<CharacterProperties> &, const QVector<CharacterWidth> &, const QMap<QString, QString> &);
0653 
0654 bool code(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
0655 {
0656     static constexpr int DIRECT_LUT_SIZE = 256;
0657 
0658     Q_UNUSED(props);
0659     QTextStream eout(stderr, QIODevice::WriteOnly);
0660 
0661     if (args.value(QStringLiteral("param")).isEmpty()) {
0662         eout << QStringLiteral("Template file not specified.") << Qt::endl << Qt::endl;
0663         return false;
0664     }
0665     QFile templateFile(args.value(QStringLiteral("param")));
0666     if (!templateFile.open(QIODevice::ReadOnly)) {
0667         eout << QStringLiteral("Could not open file ") << templateFile.fileName() << ": " << templateFile.errorString();
0668         exit(1);
0669     }
0670 
0671     const QString templateText = QString::fromUtf8(templateFile.readAll());
0672     templateFile.close();
0673 
0674     Var::Map data = {
0675         {QStringLiteral("gen-file-warning"), QStringLiteral("THIS IS A GENERATED FILE. DO NOT EDIT.")},
0676         {QStringLiteral("cmdline"), args.value(QStringLiteral("cmdline"))},
0677         {QStringLiteral("direct-lut"), Var::Vector(DIRECT_LUT_SIZE)},
0678         {QStringLiteral("direct-lut-size"), DIRECT_LUT_SIZE},
0679         {QStringLiteral("ranges-luts"), Var::Vector()},
0680         {QStringLiteral("ranges-lut-list"), Var::Vector()},
0681         {QStringLiteral("ranges-lut-list-size"), 0},
0682     };
0683 
0684     // Fill direct-lut with widths of 0x00-0xFF
0685     for (unsigned i = 0; i < DIRECT_LUT_SIZE; ++i) {
0686         Q_ASSERT(widths[i].isValid());
0687         data[QStringLiteral("direct-lut")].vec[i] = int(widths[i]);
0688     }
0689 
0690     static const QVector<CharacterWidth> widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1};
0691     const QMap<CharacterWidth, QVector<QPair<uint, uint>>> mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder, {DIRECT_LUT_SIZE, CODE_POINTS_NUM});
0692 
0693     // Find last non-empty ranges lut
0694     int lastWidthId = 0;
0695     for (int wi = widthsSortOrder.size() - 1; wi > 0; --wi) {
0696         if (mergedRanges.contains(widthsSortOrder[wi])) {
0697             lastWidthId = wi;
0698             break;
0699         }
0700     }
0701     // Create ranges-luts for all widths except last non-empty one and empty ones
0702     for (int wi = 0; lastWidthId != 0 && wi < lastWidthId; ++wi) {
0703         const CharacterWidth width = widthsSortOrder[wi];
0704         auto currentMergedRangesIt = mergedRanges.find(width);
0705         if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty())
0706             continue;
0707         const int size = mergedRanges[width].size();
0708         const QString name = QString(QStringLiteral("LUT_%1")).arg(width.toString().toUpper());
0709         data[QStringLiteral("ranges-luts")].vec.append(Var::Map{
0710             {QStringLiteral("name"), name},
0711             {QStringLiteral("ranges"), Var::Vector()},
0712             {QStringLiteral("size"), size},
0713         });
0714         data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{
0715             {QStringLiteral("width"), int(width)},
0716             {QStringLiteral("name"), name},
0717             {QStringLiteral("size"), size},
0718         });
0719         auto &currentLut = data[QStringLiteral("ranges-luts")].vec.last()[QStringLiteral("ranges")].vec;
0720         for (const auto &range : *currentMergedRangesIt) {
0721             Q_ASSERT(range.first <= LAST_CODE_POINT);
0722             Q_ASSERT(range.second <= LAST_CODE_POINT);
0723             currentLut.append(Var(Var::Map{{QStringLiteral("first"), range.first}, {QStringLiteral("last"), range.second}}));
0724         }
0725     }
0726     data[QStringLiteral("ranges-lut-list")].vec.append(Var::Map{
0727         {QStringLiteral("width"), widthsSortOrder[lastWidthId].width()},
0728         {QStringLiteral("name"), QStringLiteral("nullptr")},
0729         {QStringLiteral("size"), 1},
0730     });
0731     data[QStringLiteral("ranges-lut-list-size")] = mergedRanges.size();
0732 
0733     Template t(templateText);
0734     t.parse();
0735     out << t.generate(data);
0736 
0737     return true;
0738 }
0739 
0740 bool list(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
0741 {
0742     Q_UNUSED(props);
0743 
0744     out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
0745     for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) {
0746         out << QString::asprintf("%06X ; %2d\n", cp, int(widths[cp]));
0747     }
0748 
0749     return true;
0750 }
0751 
0752 bool ranges(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
0753 {
0754     Q_UNUSED(props);
0755     const auto ranges = rangesFromWidths(widths);
0756 
0757     out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
0758     for (const WidthsRange &range : ranges) {
0759         if (range.cp.first != range.cp.last)
0760             out << QString::asprintf("%06X..%06X ; %2d\n", range.cp.first, range.cp.last, int(range.width));
0761         else
0762             out << QString::asprintf("%06X         ; %2d\n", range.cp.first, int(range.width));
0763     }
0764 
0765     return true;
0766 }
0767 
0768 bool compactRanges(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
0769 {
0770     Q_UNUSED(props);
0771     static const QVector<CharacterWidth> widthsSortOrder = {CharacterWidth::NonPrintable, 2, CharacterWidth::Ambiguous, 0, 1};
0772     const auto mergedRanges = mergedRangesFromWidths(widths, widthsSortOrder);
0773 
0774     out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
0775     for (const int width : std::as_const(widthsSortOrder)) {
0776         const auto currentMergedRangesIt = mergedRanges.find(width);
0777         if (currentMergedRangesIt == mergedRanges.end() || currentMergedRangesIt.value().isEmpty())
0778             continue;
0779         for (const auto &range : currentMergedRangesIt.value()) {
0780             if (range.first != range.second)
0781                 out << QString::asprintf("%06X..%06X ; %2d\n", range.first, range.second, int(width));
0782             else
0783                 out << QString::asprintf("%06X         ; %2d\n", range.first, int(width));
0784         }
0785     }
0786 
0787     return true;
0788 }
0789 
0790 bool details(QTextStream &out, const QVector<CharacterProperties> &props, const QVector<CharacterWidth> &widths, const QMap<QString, QString> &args)
0791 {
0792     out.setFieldAlignment(QTextStream::AlignLeft);
0793 
0794     out << QStringLiteral("# generated with: ") << args.value(QStringLiteral("cmdline")) << QStringLiteral("\n");
0795     out << QString::asprintf("#%-5s ; %-4s ; %-8s ; %-3s ; %-2s ; %-4s ; %-4s\n", "CP", "Wdth", "Cat", "EAW", "EM", "CstW", "Rule");
0796     QMap<CharacterWidth, uint> widthStats;
0797     for (uint cp = 0; cp <= LAST_CODE_POINT; ++cp) {
0798         out << QString::asprintf("%06X ; %4d ; %08X ;  %02X ; %02X ; %4d ; %d\n",
0799                                  cp,
0800                                  int8_t(widths[cp]),
0801                                  uint32_t(props[cp].category),
0802                                  uint8_t(props[cp].eastAsianWidth),
0803                                  uint8_t(props[cp].emoji),
0804                                  int8_t(props[cp].customWidth),
0805                                  props[cp].widthFromPropsRule);
0806         if (!widthStats.contains(widths[cp]))
0807             widthStats.insert(widths[cp], 0);
0808         widthStats[widths[cp]]++;
0809     }
0810     QMap<CharacterWidth, uint> rangesStats;
0811     const auto ranges = rangesFromWidths(widths);
0812     for (const auto &range : ranges) {
0813         if (!rangesStats.contains(range.width))
0814             rangesStats.insert(range.width, 0);
0815         rangesStats[range.width]++;
0816     }
0817     out << QStringLiteral("# STATS") << Qt::endl;
0818     out << QStringLiteral("#") << Qt::endl;
0819     out << QStringLiteral("# Characters count for each width:") << Qt::endl;
0820     for (auto wi = widthStats.constBegin(); wi != widthStats.constEnd(); ++wi) {
0821         out << QString::asprintf("# %2d: %7d\n", int(wi.key()), widthStats[wi.key()]);
0822     }
0823     out << QStringLiteral("#") << Qt::endl;
0824     out << QStringLiteral("# Ranges count for each width:") << Qt::endl;
0825     int howmany = 0;
0826     for (auto wi = rangesStats.constBegin(); wi != rangesStats.constEnd(); ++wi) {
0827         if (howmany >= 20)
0828             break;
0829         howmany++;
0830         out << QString::asprintf("# %2d: %7d\n", int(wi.key()), rangesStats[wi.key()]);
0831     }
0832 
0833     return true;
0834 }
0835 } // namespace generators
0836 
0837 template<class EntryType>
0838 static void processInputFiles(QVector<CharacterProperties> &props,
0839                               const QStringList &files,
0840                               const QString &fileTypeName,
0841                               void (*cb)(CharacterProperties &prop, const EntryType &entry))
0842 {
0843     static const QRegularExpression PROTOCOL_RE(QStringLiteral(R"#(^[a-z]+://)#"));
0844     for (const QString &fileName : files) {
0845         qInfo().noquote() << QStringLiteral("Parsing as %1: %2").arg(fileTypeName).arg(fileName);
0846         QSharedPointer<QIODevice> source = nullptr;
0847         if (PROTOCOL_RE.match(fileName).hasMatch()) {
0848             source.reset(new KIODevice(QUrl(fileName)));
0849         } else {
0850             source.reset(new QFile(fileName));
0851         }
0852 
0853         if (!source->open(QIODevice::ReadOnly)) {
0854             qCritical() << QStringLiteral("Could not open %1: %2").arg(fileName).arg(source->errorString());
0855             exit(1);
0856         }
0857         UcdParser<EntryType> p(source.data());
0858         while (p.hasNext()) {
0859             const auto &e = p.next();
0860             for (uint cp = e.cp.first; cp <= e.cp.last; ++cp) {
0861                 cb(props[cp], e);
0862             }
0863         }
0864     }
0865 }
0866 
0867 static const QString escapeCmdline(const QStringList &args)
0868 {
0869     static QString cmdline = QString();
0870     if (!cmdline.isEmpty())
0871         return cmdline;
0872 
0873     QTextStream stream(&cmdline, QIODevice::WriteOnly);
0874 
0875     // basename for command name
0876     stream << QFileInfo(args[0]).baseName();
0877     for (auto it = args.begin() + 1; it != args.end(); ++it) {
0878         if (!it->startsWith(QLatin1Char('-')))
0879             stream << QStringLiteral(" \"") << QString(*it).replace(QRegularExpression(QStringLiteral(R"(["`$\\])")), QStringLiteral(R"(\\\1)")) << '"';
0880         else
0881             stream << ' ' << *it;
0882     }
0883     stream.flush();
0884     return cmdline;
0885 }
0886 
// Keys of the conversion options map handed to widthFromProps().
enum ConvertOptions {
    // Width assigned to characters whose East Asian Width is "Ambiguous".
    AmbiguousWidthOpt = 0,
    // Emoji property mask selecting which characters are treated as emoji.
    EmojiOpt = 1,
};
0891 
0892 // Character width assignment
0893 //
0894 // Rules (from highest to lowest priority):
0895 //
0896 // * Local overlay
0897 // * (not implemented) Character unique properties described in The Unicode Standard, Version 10.0
0898 // * Unicode category Cc, Cs: -1
0899 // * Emoji: 2
0900 // * Unicode category Mn, Me, Cf: 0
0901 // * East Asian Width W, F: 2
0902 // * East Asian Width H, N, Na: 1
0903 // * East Asian Width A: (varies)
0904 // * Unassigned/Undefined/Private Use: 1
0905 //
0906 // The list is loosely based on character width implementations in Vim 8.1
0907 // and glibc 2.27. There are a few cases which could look better
0908 // (decomposed Hangul, emoji with modifiers, etc) with different widths,
0909 // but interactive terminal programs (at least vim, zsh, everything based
0910 // on glibc's wcwidth) would see their width as it is implemented now.
0911 static inline CharacterWidth widthFromProps(const CharacterProperties &props, uint cp, const QMap<ConvertOptions, int> &convertOpts)
0912 {
0913     CharacterWidth cw;
0914     auto &widthFromPropsRule = const_cast<uint8_t &>(props.widthFromPropsRule);
0915     if (props.customWidth.isValid()) {
0916         widthFromPropsRule = 1;
0917         cw = props.customWidth;
0918 
0919     } else if ((CategoryProperty::Control | CategoryProperty::Surrogate) & props.category) {
0920         widthFromPropsRule = 2;
0921         cw = CharacterWidth::NonPrintable;
0922 
0923     } else if (convertOpts[EmojiOpt] & props.emoji && !(EmojiProperty::EmojiComponent & props.emoji)) {
0924         widthFromPropsRule = 3;
0925         cw = 2;
0926 
0927     } else if ((CategoryProperty::NonspacingMark | CategoryProperty::EnclosingMark | CategoryProperty::Format) & props.category) {
0928         widthFromPropsRule = 4;
0929         cw = 0;
0930 
0931     } else if ((EastAsianWidthProperty::Wide | EastAsianWidthProperty::Fullwidth) & props.eastAsianWidth) {
0932         widthFromPropsRule = 5;
0933         cw = 2;
0934 
0935     } else if ((EastAsianWidthProperty::Halfwidth | EastAsianWidthProperty::Neutral | EastAsianWidthProperty::Narrow) & props.eastAsianWidth) {
0936         widthFromPropsRule = 6;
0937         cw = 1;
0938 
0939     } else if ((CategoryProperty::Unassigned | CategoryProperty::PrivateUse) & props.category) {
0940         widthFromPropsRule = 7;
0941         cw = CharacterWidth::Unassigned;
0942 
0943     } else if ((EastAsianWidthProperty::Ambiguous)&props.eastAsianWidth) {
0944         widthFromPropsRule = 8;
0945         cw = convertOpts[AmbiguousWidthOpt];
0946 
0947     } else if (!props.category.isValid()) {
0948         widthFromPropsRule = 9;
0949         qWarning() << QStringLiteral("Code point U+%1 has invalid category - this should not happen. Assuming \"unassigned\"").arg(cp, 4, 16, QLatin1Char('0'));
0950         cw = CharacterWidth::Unassigned;
0951 
0952     } else {
0953         widthFromPropsRule = 10;
0954         qWarning()
0955             << QStringLiteral("Code point U+%1 not classified - this should not happen. Assuming non-printable character").arg(cp, 4, 16, QLatin1Char('0'));
0956         cw = CharacterWidth::NonPrintable;
0957     }
0958 
0959     return cw;
0960 }
0961 
0962 int main(int argc, char *argv[])
0963 {
0964     static const QMap<QString, generators::GeneratorFunc> GENERATOR_FUNCS_MAP = {
0965         {QStringLiteral("code"), generators::code},
0966         {QStringLiteral("compact-ranges"), generators::compactRanges},
0967         {QStringLiteral("ranges"), generators::ranges},
0968         {QStringLiteral("list"), generators::list},
0969         {QStringLiteral("details"), generators::details},
0970         {QStringLiteral("dummy"),
0971          [](QTextStream &, const QVector<CharacterProperties> &, const QVector<CharacterWidth> &, const QMap<QString, QString> &) -> bool {
0972              return true;
0973          }},
0974     };
0975     qSetMessagePattern(QStringLiteral("%{message}"));
0976 
0977     QCoreApplication app(argc, argv);
0978     QCommandLineParser parser;
0979     parser.setApplicationDescription(QStringLiteral("\nUCD files to characters widths converter.\n"));
0980     parser.addHelpOption();
0981     parser.addOptions({
0982         {{QStringLiteral("U"), QStringLiteral("unicode-data")}, QStringLiteral("Path or URL to UnicodeData.txt."), QStringLiteral("URL|file")},
0983         {{QStringLiteral("A"), QStringLiteral("east-asian-width")}, QStringLiteral("Path or URL to EastAsianWidth.txt."), QStringLiteral("URL|file")},
0984         {{QStringLiteral("E"), QStringLiteral("emoji-data")}, QStringLiteral("Path or URL to emoji-data.txt."), QStringLiteral("URL|file")},
0985         {{QStringLiteral("W"), QStringLiteral("generic-width")},
0986          QStringLiteral("Path or URL to generic file with width data. Accepts output from compact-ranges, ranges, list and details generator."),
0987          QStringLiteral("URL|file")},
0988 
0989         {QStringLiteral("ambiguous-width"),
0990          QStringLiteral("Ambiguous characters width."),
0991          QStringLiteral("separate|1|2"),
0992          QString(QStringLiteral("%1")).arg(CharacterWidth::Ambiguous)},
0993         {QStringLiteral("emoji"),
0994          QStringLiteral("Which emoji emoji subset is treated as emoji."),
0995          QStringLiteral("all|presentation"),
0996          QStringLiteral("presentation")},
0997 
0998         {{QStringLiteral("g"), QStringLiteral("generator")},
0999          QStringLiteral("Output generator (use \"-\" to list available generators). The code generator requires path to a template file."),
1000          QStringLiteral("generator[:template]"),
1001          QStringLiteral("details")},
1002     });
1003     parser.addPositionalArgument(QStringLiteral("output"), QStringLiteral("Output file (leave empty for stdout)."));
1004     parser.process(app);
1005 
1006     const QStringList unicodeDataFiles = parser.values(QStringLiteral("unicode-data"));
1007     const QStringList eastAsianWidthFiles = parser.values(QStringLiteral("east-asian-width"));
1008     const QStringList emojiDataFiles = parser.values(QStringLiteral("emoji-data"));
1009     const QStringList genericWidthFiles = parser.values(QStringLiteral("generic-width"));
1010     const QString ambiguousWidthStr = parser.value(QStringLiteral("ambiguous-width"));
1011     const QString emojiStr = parser.value(QStringLiteral("emoji"));
1012     const QString generator = parser.value(QStringLiteral("generator"));
1013     const QString outputFileName = parser.positionalArguments().value(0);
1014 
1015     QTextStream eout(stderr, QIODevice::WriteOnly);
1016     if (unicodeDataFiles.isEmpty() && eastAsianWidthFiles.isEmpty() && emojiDataFiles.isEmpty() && genericWidthFiles.isEmpty()) {
1017         eout << QStringLiteral("Input files not specified.") << Qt::endl << Qt::endl;
1018         parser.showHelp(1);
1019     }
1020 
1021     static QMap<ConvertOptions, int> convertOpts = {
1022         {AmbiguousWidthOpt, CharacterWidth::Ambiguous},
1023         {EmojiOpt, EmojiProperty::EmojiPresentation},
1024     };
1025 
1026     if (emojiStr == QStringLiteral("presentation"))
1027         convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation;
1028     else if (emojiStr == QStringLiteral("all"))
1029         convertOpts[EmojiOpt] = EmojiProperty::Emoji;
1030     else {
1031         convertOpts[EmojiOpt] = EmojiProperty::EmojiPresentation;
1032         qWarning() << QStringLiteral("invalid emoji option value: %1. Assuming \"presentation\".").arg(emojiStr);
1033     }
1034 
1035     if (ambiguousWidthStr == QStringLiteral("separate"))
1036         convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous;
1037     else if (ambiguousWidthStr == QStringLiteral("1"))
1038         convertOpts[AmbiguousWidthOpt] = 1;
1039     else if (ambiguousWidthStr == QStringLiteral("2"))
1040         convertOpts[AmbiguousWidthOpt] = 2;
1041     else {
1042         convertOpts[AmbiguousWidthOpt] = CharacterWidth::Ambiguous;
1043         qWarning() << QStringLiteral("Invalid ambiguous-width option value: %1. Assuming \"separate\".").arg(emojiStr);
1044     }
1045 
1046     const int sepPos = generator.indexOf(QLatin1Char(':'));
1047     const auto generatorName = generator.left(sepPos);
1048     const auto generatorParam = sepPos >= 0 ? generator.mid(sepPos + 1) : QString();
1049 
1050     if (!GENERATOR_FUNCS_MAP.contains(generatorName)) {
1051         int status = 0;
1052         if (generatorName != QStringLiteral("-")) {
1053             status = 1;
1054             eout << QStringLiteral("Invalid output generator. Available generators:") << Qt::endl;
1055         }
1056 
1057         for (auto it = GENERATOR_FUNCS_MAP.constBegin(); it != GENERATOR_FUNCS_MAP.constEnd(); ++it) {
1058             eout << it.key() << Qt::endl;
1059         }
1060         exit(status);
1061     }
1062     auto generatorFunc = GENERATOR_FUNCS_MAP[generatorName];
1063 
1064     QFile outFile;
1065     if (!outputFileName.isEmpty()) {
1066         outFile.setFileName(outputFileName);
1067         if (!outFile.open(QIODevice::WriteOnly)) {
1068             eout << QStringLiteral("Could not open file ") << outputFileName << QStringLiteral(": ") << outFile.errorString() << Qt::endl;
1069             exit(1);
1070         }
1071     } else {
1072         outFile.open(stdout, QIODevice::WriteOnly);
1073     }
1074     QTextStream out(&outFile);
1075 
1076     QVector<CharacterProperties> props(CODE_POINTS_NUM);
1077 
1078     processInputFiles<UnicodeDataEntry>(props,
1079                                         unicodeDataFiles,
1080                                         QStringLiteral("UnicodeData.txt"),
1081                                         [](CharacterProperties &prop, const UnicodeDataEntry &entry) {
1082                                             prop.category = entry.category();
1083                                         });
1084 
1085     processInputFiles<EastAsianWidthEntry>(props,
1086                                            eastAsianWidthFiles,
1087                                            QStringLiteral("EastAsianWidth.txt"),
1088                                            [](CharacterProperties &prop, const EastAsianWidthEntry &entry) {
1089                                                prop.eastAsianWidth = entry.eastAsianWidth();
1090                                            });
1091 
1092     processInputFiles<EmojiDataEntry>(props, emojiDataFiles, QStringLiteral("emoji-data.txt"), [](CharacterProperties &prop, const EmojiDataEntry &entry) {
1093         prop.emoji |= entry.emoji();
1094     });
1095 
1096     processInputFiles<GenericWidthEntry>(props,
1097                                          genericWidthFiles,
1098                                          QStringLiteral("generic width data"),
1099                                          [](CharacterProperties &prop, const GenericWidthEntry &entry) {
1100                                              prop.customWidth = entry.width();
1101                                          });
1102 
1103     qInfo() << "Generating character width data";
1104     QVector<CharacterWidth> widths(CODE_POINTS_NUM);
1105     widths[0] = 0; // NULL character always has width 0
1106     for (uint cp = 1; cp <= LAST_CODE_POINT; ++cp) {
1107         widths[cp] = widthFromProps(props[cp], cp, convertOpts);
1108     }
1109 
1110     const QMap<QString, QString> generatorArgs = {
1111         {QStringLiteral("cmdline"), escapeCmdline(app.arguments())},
1112         {QStringLiteral("param"), generatorParam},
1113         {QStringLiteral("output"), outputFileName.isEmpty() ? QStringLiteral("<stdout>") : outputFileName},
1114     };
1115 
1116     qInfo() << "Generating output";
1117     if (!generatorFunc(out, props, widths, generatorArgs)) {
1118         parser.showHelp(1);
1119     }
1120 
1121     return 0;
1122 }