File indexing completed on 2024-07-14 14:39:54

0001 /**
0002  * parsetrigrams.cpp
0003  *
0004  * Parse a corpus of data and generate trigrams
0005  *
0006  * SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org>
0007  *
0008  * SPDX-License-Identifier: LGPL-2.1-or-later
0009  */
0010 
0011 #include "guesslanguage.h"
0012 
0013 #include <QDebug>
0014 #include <QFile>
0015 #include <QHash>
0016 #include <QString>
0017 
0018 int main(int argc, char *argv[])
0019 {
0020     if (argc < 3) {
0021         qWarning() << argv[0] << "corpus.txt outfile.trigram";
0022         return -1;
0023     }
0024 
0025     QFile file(QString::fromLocal8Bit(argv[1]));
0026     if (!file.open(QIODevice::ReadOnly | QFile::Text)) {
0027         qWarning() << "Unable to open corpus:" << argv[1];
0028         return -1;
0029     }
0030     QTextStream stream(&file);
0031     // Not needed with Qt6, UTF-8 is the default
0032 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
0033     stream.setCodec("UTF-8");
0034 #endif
0035 
0036     QFile outFile(QString::fromLocal8Bit(argv[2]));
0037     if (!outFile.open(QIODevice::WriteOnly)) {
0038         qWarning() << "Unable to open output file" << argv[2];
0039         return -1;
0040     }
0041 
0042     QHash<QString, int> model;
0043     qDebug() << "Reading in" << file.size() << "bytes";
0044     QString trigram = stream.read(3);
0045     QString contents = stream.readAll();
0046     qDebug() << "finished reading!";
0047     qDebug() << "Building model...";
0048     for (int i = 0; i < contents.size(); i++) {
0049         if (!contents[i].isPrint()) {
0050             continue;
0051         }
0052         model[trigram]++;
0053         trigram[0] = trigram[1];
0054         trigram[1] = trigram[2];
0055         trigram[2] = contents[i];
0056     }
0057     qDebug() << "model built!";
0058 
0059     qDebug() << "Sorting...";
0060     QMultiMap<int, QString> orderedTrigrams;
0061 
0062     for (auto it = model.cbegin(); it != model.cend(); ++it) {
0063         const QString data = it.key();
0064         Q_ASSERT(data.size() >= 3);
0065         bool hasTwoSpaces = ((data.size() > 1 && data[0].isSpace() && data[1].isSpace()) //
0066                              || (data.size() > 2 && data[1].isSpace() && data[2].isSpace()));
0067 
0068         if (!hasTwoSpaces) {
0069             orderedTrigrams.insert(it.value(), data);
0070         }
0071     }
0072 
0073     qDebug() << "Sorted!";
0074 
0075     qDebug() << "Weeding out...";
0076 
0077     auto i = orderedTrigrams.begin();
0078     while (orderedTrigrams.size() > Sonnet::MAXGRAMS) {
0079         i = orderedTrigrams.erase(i);
0080     }
0081     qDebug() << "Weeded!";
0082 
0083     qDebug() << "Storing...";
0084     i = orderedTrigrams.end();
0085     int count = 0;
0086     QTextStream outStream(&outFile);
0087     // Not needed with Qt6, UTF-8 is the default
0088 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0)
0089     outStream.setCodec("UTF-8");
0090 #endif
0091 
0092     while (i != orderedTrigrams.begin()) {
0093         --i;
0094         outStream << *i << "\t\t\t" << count++ << '\n';
0095     }
0096 }