File indexing completed on 2025-04-27 10:16:11
0001 /** 0002 * parsetrigrams.cpp 0003 * 0004 * Parse a corpus of data and generate trigrams 0005 * 0006 * SPDX-FileCopyrightText: 2013 Martin Sandsmark <martin.sandsmark@kde.org> 0007 * 0008 * SPDX-License-Identifier: LGPL-2.1-or-later 0009 */ 0010 0011 #include "guesslanguage.h" 0012 0013 #include <QDebug> 0014 #include <QFile> 0015 #include <QHash> 0016 #include <QString> 0017 0018 int main(int argc, char *argv[]) 0019 { 0020 if (argc < 3) { 0021 qWarning() << argv[0] << "corpus.txt outfile.trigram"; 0022 return -1; 0023 } 0024 0025 QFile file(QString::fromLocal8Bit(argv[1])); 0026 if (!file.open(QIODevice::ReadOnly | QFile::Text)) { 0027 qWarning() << "Unable to open corpus:" << argv[1]; 0028 return -1; 0029 } 0030 QTextStream stream(&file); 0031 // Not needed with Qt6, UTF-8 is the default 0032 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0) 0033 stream.setCodec("UTF-8"); 0034 #endif 0035 0036 QFile outFile(QString::fromLocal8Bit(argv[2])); 0037 if (!outFile.open(QIODevice::WriteOnly)) { 0038 qWarning() << "Unable to open output file" << argv[2]; 0039 return -1; 0040 } 0041 0042 QHash<QString, int> model; 0043 qDebug() << "Reading in" << file.size() << "bytes"; 0044 QString trigram = stream.read(3); 0045 QString contents = stream.readAll(); 0046 qDebug() << "finished reading!"; 0047 qDebug() << "Building model..."; 0048 for (int i = 0; i < contents.size(); i++) { 0049 if (!contents[i].isPrint()) { 0050 continue; 0051 } 0052 model[trigram]++; 0053 trigram[0] = trigram[1]; 0054 trigram[1] = trigram[2]; 0055 trigram[2] = contents[i]; 0056 } 0057 qDebug() << "model built!"; 0058 0059 qDebug() << "Sorting..."; 0060 QMultiMap<int, QString> orderedTrigrams; 0061 0062 for (auto it = model.cbegin(); it != model.cend(); ++it) { 0063 const QString data = it.key(); 0064 Q_ASSERT(data.size() >= 3); 0065 bool hasTwoSpaces = ((data.size() > 1 && data[0].isSpace() && data[1].isSpace()) // 0066 || (data.size() > 2 && data[1].isSpace() && data[2].isSpace())); 0067 0068 if (!hasTwoSpaces) { 0069 orderedTrigrams.insert(it.value(), data); 0070 } 0071 } 0072 0073 qDebug() << "Sorted!"; 0074 0075 qDebug() << "Weeding out..."; 0076 0077 auto i = orderedTrigrams.begin(); 0078 while (orderedTrigrams.size() > Sonnet::MAXGRAMS) { 0079 i = orderedTrigrams.erase(i); 0080 } 0081 qDebug() << "Weeded!"; 0082 0083 qDebug() << "Storing..."; 0084 i = orderedTrigrams.end(); 0085 int count = 0; 0086 QTextStream outStream(&outFile); 0087 // Not needed with Qt6, UTF-8 is the default 0088 #if QT_VERSION < QT_VERSION_CHECK(6, 0, 0) 0089 outStream.setCodec("UTF-8"); 0090 #endif 0091 0092 while (i != orderedTrigrams.begin()) { 0093 --i; 0094 outStream << *i << "\t\t\t" << count++ << '\n'; 0095 } 0096 }