File indexing completed on 2025-03-16 03:48:09
0001 /** 0002 * parsetrigrams.cpp 0003 * 0004 * Parse a corpus of data and generate trigrams 0005 * 0006 * SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org> 0007 * 0008 * SPDX-License-Identifier: LGPL-2.1-or-later 0009 */ 0010 0011 #include "guesslanguage.h" 0012 0013 #include <QDebug> 0014 #include <QFile> 0015 #include <QHash> 0016 #include <QString> 0017 0018 int main(int argc, char *argv[]) 0019 { 0020 if (argc < 3) { 0021 qWarning() << argv[0] << "corpus.txt outfile.trigram"; 0022 return -1; 0023 } 0024 0025 QFile file(QString::fromLocal8Bit(argv[1])); 0026 if (!file.open(QIODevice::ReadOnly | QFile::Text)) { 0027 qWarning() << "Unable to open corpus:" << argv[1]; 0028 return -1; 0029 } 0030 QTextStream stream(&file); 0031 0032 QFile outFile(QString::fromLocal8Bit(argv[2])); 0033 if (!outFile.open(QIODevice::WriteOnly)) { 0034 qWarning() << "Unable to open output file" << argv[2]; 0035 return -1; 0036 } 0037 0038 QHash<QString, int> model; 0039 qDebug() << "Reading in" << file.size() << "bytes"; 0040 QString trigram = stream.read(3); 0041 QString contents = stream.readAll(); 0042 qDebug() << "finished reading!"; 0043 qDebug() << "Building model..."; 0044 for (int i = 0; i < contents.size(); i++) { 0045 if (!contents[i].isPrint()) { 0046 continue; 0047 } 0048 model[trigram]++; 0049 trigram[0] = trigram[1]; 0050 trigram[1] = trigram[2]; 0051 trigram[2] = contents[i]; 0052 } 0053 qDebug() << "model built!"; 0054 0055 qDebug() << "Sorting..."; 0056 QMultiMap<int, QString> orderedTrigrams; 0057 0058 for (auto it = model.cbegin(); it != model.cend(); ++it) { 0059 const QString data = it.key(); 0060 Q_ASSERT(data.size() >= 3); 0061 bool hasTwoSpaces = ((data.size() > 1 && data[0].isSpace() && data[1].isSpace()) // 0062 || (data.size() > 2 && data[1].isSpace() && data[2].isSpace())); 0063 0064 if (!hasTwoSpaces) { 0065 orderedTrigrams.insert(it.value(), data); 0066 } 0067 } 0068 0069 qDebug() << "Sorted!"; 0070 0071 qDebug() << "Weeding out..."; 0072 0073 auto i = orderedTrigrams.begin(); 0074 while (orderedTrigrams.size() > Sonnet::MAXGRAMS) { 0075 i = orderedTrigrams.erase(i); 0076 } 0077 qDebug() << "Weeded!"; 0078 0079 qDebug() << "Storing..."; 0080 i = orderedTrigrams.end(); 0081 int count = 0; 0082 QTextStream outStream(&outFile); 0083 0084 while (i != orderedTrigrams.begin()) { 0085 --i; 0086 outStream << *i << "\t\t\t" << count++ << '\n'; 0087 } 0088 }