File indexing completed on 2024-05-19 05:05:47

0001 /***************************************************************************
0002  *   SPDX-License-Identifier: GPL-2.0-or-later
0003  *                                                                         *
0004  *   SPDX-FileCopyrightText: 2004-2019 Thomas Fischer <fischer@unix-ag.uni-kl.de>
0005  *                                                                         *
0006  *   This program is free software; you can redistribute it and/or modify  *
0007  *   it under the terms of the GNU General Public License as published by  *
0008  *   the Free Software Foundation; either version 2 of the License, or     *
0009  *   (at your option) any later version.                                   *
0010  *                                                                         *
0011  *   This program is distributed in the hope that it will be useful,       *
0012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0014  *   GNU General Public License for more details.                          *
0015  *                                                                         *
0016  *   You should have received a copy of the GNU General Public License     *
0017  *   along with this program; if not, see <https://www.gnu.org/licenses/>. *
0018  ***************************************************************************/
0019 
0020 #include "findduplicates.h"
0021 
0022 #include <typeinfo>
0023 
0024 #include <QProgressDialog>
0025 #include <QApplication>
0026 #include <QDate>
0027 #include <QRegularExpression>
0028 
0029 #include <KLocalizedString>
0030 
0031 #include <File>
0032 #include <models/FileModel>
0033 #include <Entry>
0034 
0035 EntryClique::EntryClique()
0036 {
0037     /// nothing
0038 }
0039 
0040 int EntryClique::entryCount() const
0041 {
0042     return checkedEntries.count();
0043 }
0044 
0045 QList<QSharedPointer<Entry> > EntryClique::entryList() const
0046 {
0047     return checkedEntries.keys();
0048 }
0049 
0050 bool EntryClique::isEntryChecked(QSharedPointer<Entry> entry) const
0051 {
0052     return checkedEntries[entry];
0053 }
0054 
0055 void EntryClique::setEntryChecked(QSharedPointer<Entry> entry, bool isChecked)
0056 {
0057     checkedEntries[entry] = isChecked;
0058     recalculateValueMap();
0059 }
0060 
0061 int EntryClique::fieldCount() const
0062 {
0063     return valueMap.count();
0064 }
0065 
0066 QList<QString> EntryClique::fieldList() const
0067 {
0068     return valueMap.keys();
0069 }
0070 
0071 QVector<Value> EntryClique::values(const QString &field) const
0072 {
0073     return valueMap[field];
0074 }
0075 
0076 QVector<Value> &EntryClique::values(const QString &field)
0077 {
0078     return valueMap[field];
0079 }
0080 
0081 Value EntryClique::chosenValue(const QString &field) const
0082 {
0083     Q_ASSERT_X(chosenValueMap[field].count() == 1, "Value EntryClique::chosenValue(const QString &field) const", "Exactly one value expected in chosenValueMap");
0084     return chosenValueMap[field].first();
0085 }
0086 
0087 QVector<Value> EntryClique::chosenValues(const QString &field) const
0088 {
0089     return chosenValueMap[field];
0090 }
0091 
0092 void EntryClique::setChosenValue(const QString &field, const Value &value, ValueOperation valueOperation)
0093 {
0094     switch (valueOperation) {
0095     case ValueOperation::SetValue: {
0096         chosenValueMap[field].clear();
0097         chosenValueMap[field] << value;
0098         break;
0099     }
0100     case ValueOperation::AddValue: {
0101         QString text = PlainTextValue::text(value);
0102         for (const Value &value : const_cast<const QVector<Value> &>(chosenValueMap[field]))
0103             if (PlainTextValue::text(value) == text)
0104                 return;
0105         chosenValueMap[field] << value;
0106         break;
0107     }
0108     case ValueOperation::RemoveValue: {
0109         QString text = PlainTextValue::text(value);
0110         for (QVector<Value>::Iterator it = chosenValueMap[field].begin(); it != chosenValueMap[field].end(); ++it)
0111             if (PlainTextValue::text(*it) == text) {
0112                 chosenValueMap[field].erase(it);
0113                 return;
0114             }
0115         break;
0116     }
0117     }
0118 }
0119 
0120 void EntryClique::addEntry(QSharedPointer<Entry> entry)
0121 {
0122     checkedEntries.insert(entry, false); /// remember to call recalculateValueMap later
0123 }
0124 
0125 void EntryClique::recalculateValueMap()
0126 {
0127     valueMap.clear();
0128     chosenValueMap.clear();
0129 
0130     /// go through each and every entry ...
0131     const QList<QSharedPointer<Entry> > el = entryList();
0132     for (const auto &entry : el)
0133         if (isEntryChecked(entry)) {
0134 
0135             /// cover entry type
0136             Value v;
0137             v.append(QSharedPointer<VerbatimText>(new VerbatimText(entry->type())));
0138             insertKeyValueToValueMap(QStringLiteral("^type"), v, entry->type(), Qt::CaseInsensitive /** entry types shall be compared case insensitive */);
0139 
0140             /// cover entry id
0141             v.clear();
0142             v.append(QSharedPointer<VerbatimText>(new VerbatimText(entry->id())));
0143             insertKeyValueToValueMap(QStringLiteral("^id"), v, entry->id());
0144 
0145             /// go through each and every field of this entry
0146             for (Entry::ConstIterator fieldIt = entry->constBegin(); fieldIt != entry->constEnd(); ++fieldIt) {
0147                 /// store both field name and value for later reference
0148                 const QString fieldName = fieldIt.key().toLower();
0149                 const Value fieldValue = fieldIt.value();
0150 
0151                 if (fieldName == Entry::ftKeywords || fieldName == Entry::ftUrl) {
0152                     for (const auto &vi : fieldValue) {
0153                         const QString text = PlainTextValue::text(*vi);
0154                         Value v;
0155                         v << vi;
0156                         insertKeyValueToValueMap(fieldName, v, text);
0157                     }
0158                 } else {
0159                     const QString fieldValueText = PlainTextValue::text(fieldValue);
0160                     insertKeyValueToValueMap(fieldName, fieldValue, fieldValueText);
0161                 }
0162             }
0163         }
0164 
0165     const auto fl = fieldList();
0166     for (const QString &fieldName : fl)
0167         if (valueMap[fieldName].count() < 2) {
0168             valueMap.remove(fieldName);
0169             chosenValueMap.remove(fieldName);
0170         }
0171 }
0172 
0173 void EntryClique::insertKeyValueToValueMap(const QString &fieldName, const Value &fieldValue, const QString &fieldValueText, const Qt::CaseSensitivity)
0174 {
0175     if (fieldValueText.isEmpty()) return;
0176 
0177     if (valueMap.contains(fieldName)) {
0178         /// in the list of alternatives, search of a value identical
0179         /// to the current (as of fieldIt) value (to avoid duplicates)
0180 
0181         bool alreadyContained = false;
0182         QVector<Value> alternatives = valueMap[fieldName];
0183         for (const Value &v : const_cast<const QVector<Value> &>(alternatives))
0184             if (PlainTextValue::text(v) == fieldValueText) {
0185                 alreadyContained = true;
0186                 break;
0187             }
0188 
0189         if (!alreadyContained) {
0190             alternatives << fieldValue;
0191             valueMap[fieldName] = alternatives;
0192         }
0193     } else {
0194         QVector<Value> alternatives = valueMap[fieldName];
0195         alternatives << fieldValue;
0196         valueMap.insert(fieldName, alternatives);
0197         QVector<Value> chosen;
0198         chosen << fieldValue;
0199         chosenValueMap.insert(fieldName, chosen);
0200     }
0201 }
0202 
0203 class FindDuplicates::FindDuplicatesPrivate
0204 {
0205 private:
0206     const unsigned int maxDistance;
0207     int **d;
0208     static const int dsize;
0209 
0210 public:
0211     int sensitivity;
0212     QWidget *widget;
0213 
0214     FindDuplicatesPrivate(int sens, QWidget *w)
0215             : maxDistance(10000), sensitivity(sens), widget(w == nullptr ? qApp->activeWindow() : w) {
0216         d = new int *[dsize];
0217         for (int i = 0; i < dsize; ++i)
0218             d[i] = new int[dsize];
0219     }
0220 
0221     ~FindDuplicatesPrivate() {
0222         for (int i = 0; i < dsize; ++i) delete[] d[i];
0223         delete [] d;
0224     }
0225 
0226     /**
0227       * Determine the Levenshtein distance between two words.
0228       * See also https://en.wikipedia.org/wiki/Levenshtein_distance
0229       * @param s first word, all chars already in lower case
0230       * @param t second word, all chars already in lower case
0231       * @return distance between both words
0232       */
0233     double levenshteinDistanceWord(const QString &s, const QString &t) {
0234         const int m = qMin(s.length(), dsize - 1), n = qMin(t.length(), dsize - 1);
0235         if (m < 1 && n < 1) return 0.0;
0236         if (m < 1 || n < 1) return 1.0;
0237 
0238         for (int i = 0; i <= m; ++i)
0239             d[i][0] = i;
0240 
0241         for (int i = 0; i <= n; ++i) d[0][i] = i;
0242 
0243         for (int i = 1; i <= m; ++i)
0244             for (int j = 1; j <= n; ++j) {
0245                 d[i][j] = d[i - 1][j] + 1;
0246                 int c = d[i][j - 1] + 1;
0247                 if (c < d[i][j]) d[i][j] = c;
0248                 c = d[i - 1][j - 1] + (s[i - 1] == t[j - 1] ? 0 : 1);
0249                 if (c < d[i][j]) d[i][j] = c;
0250             }
0251 
0252         double result = d[m][n];
0253 
0254         result = result / qMax(m, n);
0255         result *= result;
0256         return result;
0257     }
0258 
0259     /**
0260      * Determine the Levenshtein distance between two sentences (list of words).
0261      * See also https://en.wikipedia.org/wiki/Levenshtein_distance
0262      * @param s first sentence
0263      * @param t second sentence
0264      * @return distance between both sentences
0265      */
0266     double levenshteinDistance(const QStringList &s, const QStringList &t) {
0267         const int m = s.size(), n = t.size();
0268         if (m < 1 && n < 1) return 0.0;
0269         if (m < 1 || n < 1) return 1.0;
0270 
0271         double **d = new double*[m + 1];
0272         for (int i = 0; i <= m; ++i) {
0273             d[i] = new double[n + 1]; d[i][0] = i;
0274         }
0275         for (int i = 0; i <= n; ++i) d[0][i] = i;
0276 
0277         for (int i = 1; i <= m; ++i)
0278             for (int j = 1; j <= n; ++j) {
0279                 d[i][j] = d[i - 1][j] + 1;
0280                 double c = d[i][j - 1] + 1;
0281                 if (c < d[i][j]) d[i][j] = c;
0282                 c = d[i - 1][j - 1] + levenshteinDistanceWord(s[i - 1], t[j - 1]);
0283                 if (c < d[i][j]) d[i][j] = c;
0284             }
0285 
0286         double result = d[m][n];
0287         for (int i = 0; i <= m; ++i) delete[] d[i];
0288         delete [] d;
0289 
0290         result = result / qMax(m, n);
0291 
0292         return result;
0293     }
0294 
0295 
0296     /**
0297      * Determine the Levenshtein distance between two sentences,
0298      * where each sentence is in a string (not split into single words).
0299      * See also https://en.wikipedia.org/wiki/Levenshtein_distance
0300      * @param s first sentence
0301      * @param t second sentence
0302      * @return distance between both sentences
0303      */
0304     double levenshteinDistance(const QString &s, const QString &t) {
0305         static const QRegularExpression nonWordRegExp(QStringLiteral("[^a-z']+"), QRegularExpression::CaseInsensitiveOption);
0306         if (s.isEmpty() || t.isEmpty()) return 1.0;
0307 #if QT_VERSION >= 0x050e00
0308         return levenshteinDistance(s.toLower().split(nonWordRegExp, Qt::SkipEmptyParts), t.toLower().split(nonWordRegExp, Qt::SkipEmptyParts));
0309 #else // QT_VERSION < 0x050e00
0310         return levenshteinDistance(s.toLower().split(nonWordRegExp, QString::SkipEmptyParts), t.toLower().split(nonWordRegExp, QString::SkipEmptyParts));
0311 #endif // QT_VERSION >= 0x050e00
0312     }
0313 
0314     /**
0315      * Distance between two BibTeX entries, scaled by maxDistance.
0316      */
0317     int entryDistance(Entry *entryA, Entry *entryB) {
0318         /// "distance" to be used if no value for a field is given
0319         const double neutralDistance = 0.05;
0320 
0321         /**
0322          * Get both entries' titles. If both are empty, use a "neutral
0323          * distance" otherwise compute levenshtein distance (0.0 .. 1.0).
0324          */
0325         const QString titleA = PlainTextValue::text(entryA->value(Entry::ftTitle));
0326         const QString titleB = PlainTextValue::text(entryB->value(Entry::ftTitle));
0327         double titleDistance = titleA.isEmpty() && titleB.isEmpty() ? neutralDistance : levenshteinDistance(titleA, titleB);
0328 
0329         /**
0330          * Get both entries' author names. If both are empty, use a
0331          * "neutral distance" otherwise compute levenshtein distance
0332          * (0.0 .. 1.0).
0333          */
0334         const QString authorA = PlainTextValue::text(entryA->value(Entry::ftAuthor));
0335         const QString authorB = PlainTextValue::text(entryB->value(Entry::ftAuthor));
0336         double authorDistance = authorA.isEmpty() && authorB.isEmpty() ? neutralDistance : levenshteinDistance(authorA, authorB);
0337 
0338         /**
0339          * Get both entries' years. If both are empty, use a
0340          * "neutral distance" otherwise compute distance as follows:
0341          * take square of difference between both years, but impose
0342          * a maximum of 100. Divide value by 100.0 to get a distance
0343          * value of 0.0 .. 1.0.
0344          */
0345         const QString yearA = PlainTextValue::text(entryA->value(Entry::ftYear));
0346         const QString yearB = PlainTextValue::text(entryB->value(Entry::ftYear));
0347         bool yearAok = false, yearBok = false;
0348         int yearAint = yearA.toInt(&yearAok);
0349         int yearBint = yearB.toInt(&yearBok);
0350         double yearDistance = yearAok && yearBok ? qMin((yearBint - yearAint) * (yearBint - yearAint), 100) / 100.0 : neutralDistance;
0351 
0352         /**
0353          * Compute total distance by taking individual distances for
0354          * author, title, and year. Weight each individual distance as
0355          * follows: title => 60%, author => 30%, year => 10%
0356          * Scale distance by maximum distance and round to int; result
0357          * will be in range 0 .. maxDistance.
0358          */
0359         int distance = static_cast<int>(maxDistance * (titleDistance * 0.6 + authorDistance * 0.3 + yearDistance * 0.1) + 0.5);
0360 
0361         return distance;
0362     }
0363 
0364 };
0365 
0366 const int FindDuplicates::FindDuplicatesPrivate::dsize = 32;
0367 
0368 
0369 FindDuplicates::FindDuplicates(QWidget *parent, int sensitivity)
0370         : QObject(parent), d(new FindDuplicatesPrivate(sensitivity, parent))
0371 {
0372     /// nothing
0373 }
0374 
0375 FindDuplicates::~FindDuplicates()
0376 {
0377     delete d;
0378 }
0379 
0380 bool FindDuplicates::findDuplicateEntries(File *file, QVector<EntryClique *> &entryCliqueList)
0381 {
0382     QApplication::setOverrideCursor(Qt::WaitCursor);
0383     QScopedPointer<QProgressDialog> progressDlg(new QProgressDialog(i18n("Searching ..."), i18n("Cancel"), 0, 100000 /* to be set later to actual value */, d->widget));
0384     progressDlg->setModal(true);
0385     progressDlg->setWindowTitle(i18nc("@title:window", "Finding Duplicates"));
0386     progressDlg->setMinimumWidth(d->widget->fontMetrics().averageCharWidth() * 48);
0387     progressDlg->setAutoReset(false);
0388     entryCliqueList.clear();
0389 
0390     /// assemble list of entries only (ignoring comments, macros, ...)
0391     QVector<QSharedPointer<Entry> > listOfEntries;
0392     listOfEntries.reserve(file->size());
0393     for (const auto &element : const_cast<const File &>(*file)) {
0394         QSharedPointer<Entry> e = element.dynamicCast<Entry>();
0395         if (!e.isNull() && !e->isEmpty())
0396             listOfEntries << e;
0397     }
0398 
0399     if (listOfEntries.isEmpty()) {
0400         /// no entries to compare found
0401         entryCliqueList.clear();
0402         QApplication::restoreOverrideCursor();
0403         return progressDlg->wasCanceled();
0404     }
0405 
0406     int curProgress = 0, maxProgress = listOfEntries.count() * (listOfEntries.count() - 1) / 2;
0407     int progressDelta = 1;
0408 
0409     progressDlg->setMaximum(maxProgress);
0410     progressDlg->show();
0411 
0412     Q_EMIT maximumProgress(maxProgress);
0413 
0414     /// go through all entries ...
0415     for (const auto &entry : const_cast<const QVector<QSharedPointer<Entry> > &>(listOfEntries)) {
0416         QApplication::instance()->processEvents();
0417         if (progressDlg->wasCanceled()) {
0418             entryCliqueList.clear();
0419             break;
0420         }
0421 
0422         progressDlg->setValue(curProgress);
0423         Q_EMIT currentProgress(curProgress);
0424         /// ... and find a "clique" of entries where it will match, i.e. distance is below sensitivity
0425 
0426         /// assume current entry will match in no clique
0427         bool foundClique = false;
0428 
0429         /// go through all existing cliques
0430         for (QVector<EntryClique *>::Iterator cit = entryCliqueList.begin(); cit != entryCliqueList.end(); ++cit) {
0431             /// check distance between current entry and clique's first entry
0432             if (d->entryDistance(entry.data(), (*cit)->entryList().constFirst().data()) < d->sensitivity) {
0433                 /// if distance is below sensitivity, add current entry to clique
0434                 foundClique = true;
0435                 (*cit)->addEntry(entry);
0436                 break;
0437             }
0438 
0439             QApplication::instance()->processEvents();
0440             if (progressDlg->wasCanceled()) {
0441                 entryCliqueList.clear();
0442                 break;
0443             }
0444         }
0445 
0446         if (!progressDlg->wasCanceled() && !foundClique) {
0447             /// no clique matched to current entry, so create and add new clique
0448             /// consisting only of the current entry
0449             EntryClique *newClique = new EntryClique();
0450             newClique->addEntry(entry);
0451             entryCliqueList << newClique;
0452         }
0453 
0454         curProgress += progressDelta;
0455         ++progressDelta;
0456         progressDlg->setValue(curProgress);
0457 
0458         Q_EMIT currentProgress(curProgress);
0459     }
0460 
0461     progressDlg->setValue(progressDlg->maximum());
0462 
0463     /// remove cliques with only one element (nothing to merge here) from the list of cliques
0464     for (QVector<EntryClique *>::Iterator cit = entryCliqueList.begin(); cit != entryCliqueList.end();)
0465         if ((*cit)->entryCount() < 2) {
0466             EntryClique *ec = *cit;
0467             cit = entryCliqueList.erase(cit);
0468             delete ec;
0469         } else {
0470             /// entries have been inserted as checked,
0471             /// therefore recalculate alternatives
0472             (*cit)->recalculateValueMap();
0473 
0474             ++cit;
0475         }
0476 
0477     QApplication::restoreOverrideCursor();
0478     return progressDlg->wasCanceled();
0479 }
0480 
0481 
0482 MergeDuplicates::MergeDuplicates()
0483 {
0484     /// nothing
0485 }
0486 
0487 bool MergeDuplicates::mergeDuplicateEntries(const QVector<EntryClique *> &entryCliques, FileModel *fileModel)
0488 {
0489     bool didMerge = false;
0490 
0491     for (EntryClique *entryClique : entryCliques) {
0492         /// Avoid adding fields 20 lines below
0493         /// which have been remove (not added) 10 lines below
0494         QSet<QString> coveredFields;
0495 
0496         Entry *mergedEntry = new Entry(QString(), QString());
0497         const auto fieldList = entryClique->fieldList();
0498         coveredFields.reserve(fieldList.size());
0499         for (const auto &field : fieldList) {
0500             coveredFields << field;
0501             if (field == QStringLiteral("^id"))
0502                 mergedEntry->setId(PlainTextValue::text(entryClique->chosenValue(field)));
0503             else if (field == QStringLiteral("^type"))
0504                 mergedEntry->setType(PlainTextValue::text(entryClique->chosenValue(field)));
0505             else {
0506                 Value combined;
0507                 const auto chosenValues = entryClique->chosenValues(field);
0508                 for (const Value &v : chosenValues) {
0509                     combined.append(v);
0510                 }
0511                 if (!combined.isEmpty())
0512                     mergedEntry->insert(field, combined);
0513             }
0514         }
0515 
0516         bool actuallyMerged = false;
0517         int preferredInsertionRow = -1;
0518         const auto entryList = entryClique->entryList();
0519         for (const auto &entry : entryList) {
0520             /// if merging entries with identical ids, the merged entry will not yet have an id (is null)
0521             if (mergedEntry->id().isEmpty())
0522                 mergedEntry->setId(entry->id());
0523             /// if merging entries with identical types, the merged entry will not yet have an type (is null)
0524             if (mergedEntry->type().isEmpty())
0525                 mergedEntry->setType(entry->type());
0526 
0527             /// add all other fields not covered by user selection
0528             /// those fields did only occur in one entry (no conflict)
0529             /// may add a lot of bloat to merged entry
0530             if (entryClique->isEntryChecked(entry)) {
0531                 actuallyMerged = true;
0532                 for (Entry::ConstIterator it = entry->constBegin(); it != entry->constEnd(); ++it)
0533                     if (!mergedEntry->contains(it.key()) && !coveredFields.contains(it.key())) {
0534                         mergedEntry->insert(it.key(), it.value());
0535                         coveredFields << it.key();
0536                     }
0537                 const int row = fileModel->row(entry);
0538                 if (preferredInsertionRow < 0) preferredInsertionRow = row;
0539                 fileModel->removeRow(row);
0540             }
0541         }
0542 
0543         if (actuallyMerged) {
0544             if (preferredInsertionRow < 0) preferredInsertionRow = fileModel->rowCount();
0545             fileModel->insertRow(QSharedPointer<Entry>(mergedEntry), preferredInsertionRow);
0546         } else
0547             delete mergedEntry;
0548         didMerge |= actuallyMerged;
0549     }
0550 
0551     return didMerge;
0552 }