File indexing completed on 2024-04-21 04:00:53

0001 /**
0002  * parsetrigrams.cpp
0003  *
0004  * Parse a corpus of data and generate trigrams
0005  *
0006  * SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org>
0007  *
0008  * SPDX-License-Identifier: LGPL-2.1-or-later
0009  */
0010 
0011 #include "guesslanguage.h"
0012 
0013 #include <QDebug>
0014 #include <QFile>
0015 #include <QHash>
0016 #include <QString>
0017 
0018 int main(int argc, char *argv[])
0019 {
0020     if (argc < 3) {
0021         qWarning() << argv[0] << "corpus.txt outfile.trigram";
0022         return -1;
0023     }
0024 
0025     QFile file(QString::fromLocal8Bit(argv[1]));
0026     if (!file.open(QIODevice::ReadOnly | QFile::Text)) {
0027         qWarning() << "Unable to open corpus:" << argv[1];
0028         return -1;
0029     }
0030     QTextStream stream(&file);
0031 
0032     QFile outFile(QString::fromLocal8Bit(argv[2]));
0033     if (!outFile.open(QIODevice::WriteOnly)) {
0034         qWarning() << "Unable to open output file" << argv[2];
0035         return -1;
0036     }
0037 
0038     QHash<QString, int> model;
0039     qDebug() << "Reading in" << file.size() << "bytes";
0040     QString trigram = stream.read(3);
0041     QString contents = stream.readAll();
0042     qDebug() << "finished reading!";
0043     qDebug() << "Building model...";
0044     for (int i = 0; i < contents.size(); i++) {
0045         if (!contents[i].isPrint()) {
0046             continue;
0047         }
0048         model[trigram]++;
0049         trigram[0] = trigram[1];
0050         trigram[1] = trigram[2];
0051         trigram[2] = contents[i];
0052     }
0053     qDebug() << "model built!";
0054 
0055     qDebug() << "Sorting...";
0056     QMultiMap<int, QString> orderedTrigrams;
0057 
0058     for (auto it = model.cbegin(); it != model.cend(); ++it) {
0059         const QString data = it.key();
0060         Q_ASSERT(data.size() >= 3);
0061         bool hasTwoSpaces = ((data.size() > 1 && data[0].isSpace() && data[1].isSpace()) //
0062                              || (data.size() > 2 && data[1].isSpace() && data[2].isSpace()));
0063 
0064         if (!hasTwoSpaces) {
0065             orderedTrigrams.insert(it.value(), data);
0066         }
0067     }
0068 
0069     qDebug() << "Sorted!";
0070 
0071     qDebug() << "Weeding out...";
0072 
0073     auto i = orderedTrigrams.begin();
0074     while (orderedTrigrams.size() > Sonnet::MAXGRAMS) {
0075         i = orderedTrigrams.erase(i);
0076     }
0077     qDebug() << "Weeded!";
0078 
0079     qDebug() << "Storing...";
0080     i = orderedTrigrams.end();
0081     int count = 0;
0082     QTextStream outStream(&outFile);
0083 
0084     while (i != orderedTrigrams.begin()) {
0085         --i;
0086         outStream << *i << "\t\t\t" << count++ << '\n';
0087     }
0088 }