File indexing completed on 2024-04-21 15:07:18

0001 /**
0002  * parsetrigrams.cpp
0003  *
0004  * Parse a set of trigram files into a QMap, and serialize to stdout.
0005  * Note: we allow this data to be read into QHash. We use QMap here
0006  * to get deterministic output from run to run.
0007  *
0008  * SPDX-FileCopyrightText: 2006 Jacob Rideout <kde@jacobrideout.net>
0009  *
0010  * SPDX-License-Identifier: LGPL-2.1-or-later
0011  */
0012 
0013 #include <QDataStream>
0014 #include <QDir>
0015 #include <QFile>
0016 #include <QMap>
0017 #include <QRegularExpression>
0018 #include <QString>
0019 #include <QTextStream>
0020 
0021 int main(int argc, char **argv)
0022 {
0023     if (argc < 2) {
0024         return 1;
0025     }
0026 
0027     QFile sout;
0028     sout.open(stdout, QIODevice::WriteOnly);
0029     QDataStream out(&sout);
0030 
0031     QString path = QLatin1String(argv[1]);
0032     QDir td(path);
0033 
0034     /*
0035      * We use QMap (instead of QHash) here to get deterministic output
0036      * from run to run.
0037      */
0038     QMap<QString, QMap<QString, int>> models;
0039 
0040     const QRegularExpression rx(QStringLiteral("(?:.{3})\\s+(.*)"));
0041     const QStringList files = td.entryList(QDir::Files);
0042     for (const QString &fname : files) {
0043         QFile fin(td.filePath(fname));
0044         fin.open(QFile::ReadOnly | QFile::Text);
0045         QTextStream stream(&fin);
0046 
0047         // Not needed with Qt6, UTF-8 is the default
0048 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
0049         stream.setCodec("UTF-8");
0050 #endif
0051         while (!stream.atEnd()) {
0052             QString line = stream.readLine();
0053             const QRegularExpressionMatch match = rx.match(line);
0054             if (match.hasMatch()) {
0055 #if QT_VERSION >= QT_VERSION_CHECK(6, 0, 0)
0056                 models[fname][line.left(3)] = match.capturedView(1).toInt();
0057 #else
0058                 models[fname][line.left(3)] = match.capturedRef(1).toInt();
0059 #endif
0060             }
0061         }
0062     }
0063 
0064     out << models;
0065 }