File indexing completed on 2024-04-28 03:40:32

0001 /***************************************************************************
0002  *   Copyright (C) 2002 by Gunnar Schmi Dt <kmouth@schmi-dt.de             *
0003  *             (C) 2015 by Jeremy Whiting <jpwhiting@kde.org>              *
0004  *                                                                         *
0005  *   This program is free software; you can redistribute it and/or modify  *
0006  *   it under the terms of the GNU General Public License as published by  *
0007  *   the Free Software Foundation; either version 2 of the License, or     *
0008  *   (at your option) any later version.                                   *
0009  *                                                                         *
0010  *   This program is distributed in the hope that it will be useful,       *
0011  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0012  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0013  *   GNU General Public License for more details.                          *
0014  *                                                                         *
0015  *   You should have received a copy of the GNU General Public License     *
0016  *   along with this program; if not, write to the                         *
0017  *   Free Software Foundation, Inc.,                                       *
0018  *   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.          *
0019  ***************************************************************************/
0020 
0021 #include "wordlist.h"
0022 
0023 #include <QApplication>
0024 #include <QDir>
0025 #include <QList>
0026 #include <QProgressDialog>
0027 #include <QRegularExpression>
0028 #include <QStandardPaths>
0029 #include <QTextStream>
0030 
0031 #include <KLocalizedString>
0032 
0033 namespace WordList
0034 {
0035 void addWords(WordMap &map, const QString &line);
0036 
0037 XMLReader::XMLReader()
0038 {
0039 }
0040 
0041 XMLReader::~XMLReader()
0042 {
0043 }
0044 
0045 bool XMLReader::read(QIODevice *device)
0046 {
0047     xml.setDevice(device);
0048 
0049     list.clear();
0050 
0051     while (xml.readNextStartElement()) {
0052         const QString word = xml.name().toString();
0053         addWords(list, word);
0054     }
0055 
0056     return !xml.error();
0057 }
0058 
0059 QString XMLReader::errorString() const
0060 {
0061     return QLatin1String("%1\nLine %2, column %3").arg(xml.errorString()).arg(xml.lineNumber()).arg(xml.columnNumber());
0062 }
0063 
0064 WordMap XMLReader::getList()
0065 {
0066     return list;
0067 }
0068 
0069 /***************************************************************************/
0070 
0071 QProgressDialog *progressDialog()
0072 {
0073     QProgressDialog *pdlg = new QProgressDialog(i18n("Creating Word List"), i18n("Parsing the KDE documentation..."), 0, 100);
0074     pdlg->setCancelButton(nullptr);
0075     pdlg->setAutoReset(false);
0076     pdlg->setAutoClose(false);
0077     return pdlg;
0078 }
0079 
0080 bool saveWordList(const WordMap &map, const QString &filename)
0081 {
0082     QFile file(filename);
0083     if (!file.open(QIODevice::WriteOnly))
0084         return false;
0085 
0086     QTextStream stream(&file);
0087     stream.setEncoding(QStringConverter::Utf8);
0088     stream << "WPDictFile\n";
0089     WordMap::ConstIterator it;
0090     for (it = map.constBegin(); it != map.constEnd(); ++it)
0091         stream << it.key() << "\t" << it.value() << "\t2\n";
0092     file.close();
0093     return true;
0094 }
0095 
0096 /***************************************************************************/
0097 
0098 void addWords(WordMap &map, const QString &line)
0099 {
0100     const QStringList words = line.split(QRegularExpression(QStringLiteral("\\W")));
0101 
0102     QStringList::ConstIterator it;
0103     for (it = words.constBegin(); it != words.constEnd(); ++it) {
0104         if (!(*it).contains(QRegularExpression(QStringLiteral("\\d|_")))) {
0105             QString key = (*it).toLower();
0106             if (map.contains(key))
0107                 map[key] += 1;
0108             else
0109                 map[key] = 1;
0110         }
0111     }
0112 }
0113 
0114 void addWords(WordMap &map, const WordMap &add)
0115 {
0116     WordList::WordMap::ConstIterator it;
0117     for (it = add.constBegin(); it != add.constEnd(); ++it)
0118         if (map.contains(it.key()))
0119             map[it.key()] += it.value();
0120         else
0121             map[it.key()] = it.value();
0122 }
0123 
0124 void addWordsFromFile(WordMap &map, const QString &filename, QStringConverter::Encoding encoding)
0125 {
0126     QFile xmlfile(filename);
0127     XMLReader reader;
0128 
0129     if (reader.read(&xmlfile)) // try to load the file as an xml-file
0130         addWords(map, reader.getList());
0131     else {
0132         QFile wpdfile(filename);
0133         if (wpdfile.open(QIODevice::ReadOnly)) {
0134             QTextStream stream(&wpdfile);
0135             stream.setEncoding(QStringConverter::Utf8);
0136 
0137             if (!stream.atEnd()) {
0138                 QString line = stream.readLine();
0139                 if (line == QLatin1String("WPDictFile")) { // Contains the file a weighted word list?
0140                     // We can assume that weighted word lists are always UTF8 coded.
0141                     while (!stream.atEnd()) {
0142                         QString s = stream.readLine();
0143                         if (!(s.isNull() || s.isEmpty())) {
0144                             QStringList list = s.split(QLatin1Char('\t'));
0145                             bool ok;
0146                             int weight = list[1].toInt(&ok);
0147                             if (ok && (weight > 0)) {
0148                                 if (map.contains(list[0]))
0149                                     map[list[0]] += weight;
0150                                 else
0151                                     map[list[0]] = weight;
0152                             }
0153                         }
0154                     }
0155                 } else { // Count the words in an ordinary text file
0156                     QFile file(filename);
0157                     if (file.open(QIODevice::ReadOnly)) {
0158                         QTextStream plainstream(&file);
0159                         plainstream.setEncoding(encoding);
0160                         while (!plainstream.atEnd())
0161                             addWords(map, plainstream.readLine());
0162                     }
0163                 }
0164             }
0165         }
0166     }
0167 }
0168 
0169 WordMap parseFiles(const QStringList &files, QStringConverter::Encoding encoding, QProgressDialog *pdlg)
0170 {
0171     int progress = 0;
0172     int steps = files.count();
0173     int percent = 0;
0174 
0175     WordMap map;
0176     QStringList::ConstIterator it;
0177     for (progress = 1, it = files.constBegin(); it != files.constEnd(); ++progress, ++it) {
0178         addWordsFromFile(map, *it, encoding);
0179 
0180         if (steps != 0 && progress * 100 / steps > percent) {
0181             percent = progress * 100 / steps;
0182             pdlg->setValue(percent);
0183             qApp->processEvents(QEventLoop::AllEvents, 20);
0184         }
0185     }
0186     return map;
0187 }
0188 
0189 WordMap mergeFiles(const QMap<QString, int> &files, QProgressDialog *pdlg)
0190 {
0191     pdlg->setLabelText(i18n("Merging dictionaries..."));
0192     pdlg->show();
0193     qApp->processEvents(QEventLoop::AllEvents, 20);
0194 
0195     int progress = 0;
0196     int steps = files.count();
0197     int percent = 0;
0198     float totalWeight = 0;
0199     long long maxWeight = 0;
0200 
0201     QMap<QString, float> map;
0202     QMap<QString, int>::ConstIterator it;
0203     for (progress = 1, it = files.constBegin(); it != files.constEnd(); ++progress, ++it) {
0204         WordMap fileMap;
0205         addWordsFromFile(fileMap, it.key(), QStringConverter::Utf8);
0206 
0207         long long weight = 0;
0208         WordMap::ConstIterator iter;
0209         for (iter = fileMap.constBegin(); iter != fileMap.constEnd(); ++iter)
0210             weight += iter.value();
0211         float factor = 1.0 * it.value() / weight;
0212         totalWeight += it.value();
0213         if (weight > maxWeight)
0214             maxWeight = weight;
0215 
0216         for (iter = fileMap.constBegin(); iter != fileMap.constEnd(); ++iter)
0217             if (map.contains(iter.key()))
0218                 map[iter.key()] += iter.value() * factor;
0219             else
0220                 map[iter.key()] = iter.value() * factor;
0221 
0222         if (steps != 0 && progress * 100 / steps > percent) {
0223             percent = progress * 100 / steps;
0224             pdlg->setValue(percent);
0225             qApp->processEvents(QEventLoop::AllEvents, 20);
0226         }
0227     }
0228 
0229     float factor;
0230     if (1.0 * maxWeight * totalWeight > 1000000000)
0231         factor = 1000000000 / totalWeight;
0232     else
0233         factor = 1.0 * maxWeight;
0234 
0235     WordMap resultMap;
0236     QMap<QString, float>::ConstIterator iter;
0237     for (iter = map.constBegin(); iter != map.constEnd(); ++iter)
0238         resultMap[iter.key()] = (int)(factor * iter.value() + 0.5);
0239 
0240     return resultMap;
0241 }
0242 
0243 WordMap parseKDEDoc(QString language, QProgressDialog *pdlg)
0244 {
0245     pdlg->setLabelText(i18n("Parsing the KDE documentation..."));
0246     pdlg->show();
0247     qApp->processEvents(QEventLoop::AllEvents, 20);
0248 
0249     QStringList files = QStandardPaths::locateAll(QStandardPaths::GenericDataLocation, QStringLiteral("html/") + language + QStringLiteral("/*.docbook"));
0250     if ((files.count() == 0) && (language.length() == 5)) {
0251         language = language.left(2);
0252         files = QStandardPaths::locateAll(QStandardPaths::GenericDataLocation, QStringLiteral("html/") + language + QStringLiteral("/*.docbook"));
0253     }
0254 
0255     return parseFiles(files, QStringConverter::Utf8, pdlg);
0256 }
0257 
0258 WordMap parseFile(const QString &filename, QStringConverter::Encoding encoding, QProgressDialog *pdlg)
0259 {
0260     pdlg->setLabelText(i18n("Parsing file..."));
0261     pdlg->show();
0262     qApp->processEvents(QEventLoop::AllEvents, 20);
0263 
0264     QStringList files;
0265     files.append(filename);
0266 
0267     return parseFiles(files, encoding, pdlg);
0268 }
0269 
0270 WordMap parseDir(const QString &directory, QStringConverter::Encoding encoding, QProgressDialog *pdlg)
0271 {
0272     pdlg->setLabelText(i18n("Parsing directory..."));
0273     pdlg->show();
0274     qApp->processEvents(QEventLoop::AllEvents, 20);
0275 
0276     QStringList directories;
0277     directories += directory;
0278     QStringList files;
0279     int dirNdx = 0;
0280     while (dirNdx < directories.count()) {
0281         QDir dir(directories.at(dirNdx));
0282         const QFileInfoList entries = dir.entryInfoList(QDir::Dirs | QDir::Files | QDir::NoSymLinks | QDir::Readable);
0283 
0284         for (int i = 0; i < entries.size(); ++i) {
0285             QFileInfo fileInfo = entries.at(i);
0286 
0287             QString name = fileInfo.fileName();
0288             if (name != QLatin1String(".") && name != QLatin1String("..")) {
0289                 if (fileInfo.isDir())
0290                     directories += fileInfo.filePath();
0291                 else
0292                     files += fileInfo.filePath();
0293             }
0294         }
0295         directories.removeAt(dirNdx);
0296     }
0297 
0298     return parseFiles(files, encoding, pdlg);
0299 }
0300 
0301 /***************************************************************************/
0302 
0303 /* Structures used for storing *.aff files (part of OpenOffice.org dictionaries)
0304  */
0305 struct AffEntry {
0306     bool cross;
0307     int charsToRemove;
0308     QString add;
0309     QStringList condition;
0310 };
0311 
0312 typedef QList<AffEntry> AffList;
0313 typedef QMap<QChar, AffList> AffMap;
0314 
0315 /** Loads an *.aff file (part of OpenOffice.org dictionaries)
0316  */
0317 void loadAffFile(const QString &filename, AffMap &prefixes, AffMap &suffixes)
0318 {
0319     QFile afile(filename);
0320     if (afile.open(QIODevice::ReadOnly)) {
0321         QTextStream stream(&afile);
0322         while (!stream.atEnd()) {
0323             QString s = stream.readLine();
0324             QStringList fields = s.split(QRegularExpression(QStringLiteral("\\s")));
0325 
0326             bool cross = false;
0327             if (fields.count() == 4) {
0328                 cross = (fields[2] == QLatin1String("Y"));
0329             } else {
0330                 if (fields.count() >= 5) {
0331                     AffEntry e;
0332                     e.cross = cross;
0333                     if (fields[2] == QLatin1String("0"))
0334                         e.charsToRemove = 0;
0335                     else
0336                         e.charsToRemove = fields[2].length();
0337                     e.add = fields[3];
0338 
0339                     if (fields[4] != QLatin1String(".")) {
0340                         QString condition = fields[4];
0341                         for (int idx = 0; idx < condition.length(); ++idx) {
0342                             if (condition[idx] == QLatin1Char('[')) {
0343                                 QString code;
0344                                 for (++idx; (idx < condition.length()) && condition[idx] != QLatin1Char(']'); ++idx)
0345                                     code += condition[idx];
0346                                 e.condition << code;
0347                             } else
0348                                 e.condition << QString(condition[idx]);
0349                         }
0350                     }
0351 
0352                     if (s.startsWith(QLatin1String("PFX"))) {
0353                         AffList list;
0354                         if (prefixes.contains(fields[1][0]))
0355                             list = prefixes[fields[1][0]];
0356                         list << e;
0357                         prefixes[fields[1][0]] = list;
0358                     } else if (s.startsWith(QLatin1String("SFX"))) {
0359                         AffList list;
0360                         if (suffixes.contains(fields[1][0]))
0361                             list = suffixes[fields[1][0]];
0362                         list << e;
0363                         suffixes[fields[1][0]] = list;
0364                     }
0365                 }
0366             }
0367         }
0368     }
0369 }
0370 
0371 /** Checks if the given word matches the given condition. Each entry of the
0372  * QStringList "condition" describes one character of the word. (If the word
0373  * has more characters than condition entries only the last characters are
0374  * compared).
0375  * Each entry contains either all valid characters (if it does _not_ start
0376  * with "^") or all invalid characters (if it starts with "^").
0377  */
0378 inline bool checkCondition(const QString &word, const QStringList &condition)
0379 {
0380     if (condition.count() == 0)
0381         return true;
0382 
0383     if (word.length() < condition.count())
0384         return false;
0385 
0386     QStringList::ConstIterator it;
0387     int idx;
0388     for (it = condition.constBegin(), idx = word.length() - condition.count(); it != condition.constEnd(); ++it, ++idx) {
0389         if ((*it).contains(word[idx]) == ((*it)[0] == QLatin1Char('^')))
0390             return false;
0391     }
0392     return true;
0393 }
0394 
0395 /** Constructs words by adding suffixes to the given word, and copies the
0396  * resulting words from map to checkedMap.
0397  * @param modifiers discribes which suffixes are valid
0398  * @param cross true if the word has a prefix
0399  */
0400 inline void checkWord(const QString &word, const QString &modifiers, bool cross, const WordMap &map, WordMap &checkedMap, const AffMap &suffixes)
0401 {
0402     for (int i = 0; i < modifiers.length(); i++) {
0403         if (suffixes.contains(modifiers[i])) {
0404             AffList sList = suffixes[modifiers[i]];
0405 
0406             AffList::ConstIterator sIt;
0407             for (sIt = sList.constBegin(); sIt != sList.constEnd(); ++sIt) {
0408                 if (((*sIt).cross || !cross) && (checkCondition(word, (*sIt).condition))) {
0409                     QString sWord = word.left(word.length() - (*sIt).charsToRemove) + (*sIt).add;
0410                     if (map.contains(sWord))
0411                         checkedMap[sWord] = map[sWord];
0412                 }
0413             }
0414         }
0415     }
0416 }
0417 
0418 /** Constructs words by adding pre- and suffixes to the given word, and
0419  * copies the resulting words from map to checkedMap.
0420  * @param modifiers discribes which pre- and suffixes are valid
0421  */
0422 void checkWord(const QString &word, const QString &modifiers, const WordMap &map, WordMap &checkedMap, const AffMap &prefixes, const AffMap &suffixes)
0423 {
0424     if (map.contains(word))
0425         checkedMap[word] = map[word];
0426 
0427     checkWord(word, modifiers, true, map, checkedMap, suffixes);
0428 
0429     for (int i = 0; i < modifiers.length(); i++) {
0430         if (prefixes.contains(modifiers[i])) {
0431             AffList pList = prefixes[modifiers[i]];
0432 
0433             AffList::ConstIterator pIt;
0434             for (pIt = pList.constBegin(); pIt != pList.constEnd(); ++pIt) {
0435                 QString pWord = (*pIt).add + word;
0436                 if (map.contains(pWord))
0437                     checkedMap[pWord] = map[pWord];
0438 
0439                 checkWord(pWord, modifiers, false, map, checkedMap, suffixes);
0440             }
0441         }
0442     }
0443 }
0444 
0445 WordMap spellCheck(WordMap map, const QString &dictionary, QProgressDialog *pdlg)
0446 {
0447     if (dictionary.endsWith(QLatin1String(".dic"))) {
0448         AffMap prefixes;
0449         AffMap suffixes;
0450         WordMap checkedMap;
0451         loadAffFile(dictionary.left(dictionary.length() - 4) + QStringLiteral(".aff"), prefixes, suffixes);
0452 
0453         pdlg->reset();
0454         // pdlg->setAllowCancel (false);
0455         // pdlg->showCancelButton (false);
0456         pdlg->setAutoReset(false);
0457         pdlg->setAutoClose(false);
0458         pdlg->setLabelText(i18n("Performing spell check..."));
0459         pdlg->setMaximum(100);
0460         pdlg->setValue(0);
0461         qApp->processEvents(QEventLoop::AllEvents, 20);
0462 
0463         QFile dfile(dictionary);
0464         if (dfile.open(QIODevice::ReadOnly)) {
0465             QTextStream stream(&dfile);
0466             int progress = 0;
0467             int steps = 0;
0468             int percent = 0;
0469 
0470             if (!stream.atEnd()) {
0471                 QString s = stream.readLine(); // Number of words
0472                 steps = s.toInt();
0473             }
0474 
0475             while (!stream.atEnd()) {
0476                 QString s = stream.readLine();
0477                 if (s.contains(QLatin1Char('/'))) {
0478                     QString word = s.left(s.indexOf(QLatin1Char('/'))).toLower();
0479                     QString modifiers = s.right(s.length() - s.indexOf(QLatin1Char('/')));
0480 
0481                     checkWord(word, modifiers, map, checkedMap, prefixes, suffixes);
0482                 } else {
0483                     if (!s.isEmpty() && !s.isNull() && map.contains(s.toLower()))
0484                         checkedMap[s.toLower()] = map[s.toLower()];
0485                 }
0486 
0487                 progress++;
0488                 if (steps != 0 && progress * 100 / steps > percent) {
0489                     percent = progress * 100 / steps;
0490                     pdlg->setValue(percent);
0491                     qApp->processEvents(QEventLoop::AllEvents, 20);
0492                 }
0493             }
0494         }
0495 
0496         return checkedMap;
0497     } else
0498         return map;
0499 }
0500 
0501 }