File indexing completed on 2024-05-19 05:05:47
0001 /*************************************************************************** 0002 * SPDX-License-Identifier: GPL-2.0-or-later 0003 * * 0004 * SPDX-FileCopyrightText: 2004-2019 Thomas Fischer <fischer@unix-ag.uni-kl.de> 0005 * * 0006 * This program is free software; you can redistribute it and/or modify * 0007 * it under the terms of the GNU General Public License as published by * 0008 * the Free Software Foundation; either version 2 of the License, or * 0009 * (at your option) any later version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, * 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0014 * GNU General Public License for more details. * 0015 * * 0016 * You should have received a copy of the GNU General Public License * 0017 * along with this program; if not, see <https://www.gnu.org/licenses/>. * 0018 ***************************************************************************/ 0019 0020 #include "findduplicates.h" 0021 0022 #include <typeinfo> 0023 0024 #include <QProgressDialog> 0025 #include <QApplication> 0026 #include <QDate> 0027 #include <QRegularExpression> 0028 0029 #include <KLocalizedString> 0030 0031 #include <File> 0032 #include <models/FileModel> 0033 #include <Entry> 0034 0035 EntryClique::EntryClique() 0036 { 0037 /// nothing 0038 } 0039 0040 int EntryClique::entryCount() const 0041 { 0042 return checkedEntries.count(); 0043 } 0044 0045 QList<QSharedPointer<Entry> > EntryClique::entryList() const 0046 { 0047 return checkedEntries.keys(); 0048 } 0049 0050 bool EntryClique::isEntryChecked(QSharedPointer<Entry> entry) const 0051 { 0052 return checkedEntries[entry]; 0053 } 0054 0055 void EntryClique::setEntryChecked(QSharedPointer<Entry> entry, bool isChecked) 0056 { 0057 checkedEntries[entry] = isChecked; 0058 recalculateValueMap(); 0059 } 0060 0061 int EntryClique::fieldCount() const 0062 { 0063 return valueMap.count(); 0064 } 0065 0066 QList<QString> EntryClique::fieldList() const 0067 { 0068 return valueMap.keys(); 0069 } 0070 0071 QVector<Value> EntryClique::values(const QString &field) const 0072 { 0073 return valueMap[field]; 0074 } 0075 0076 QVector<Value> &EntryClique::values(const QString &field) 0077 { 0078 return valueMap[field]; 0079 } 0080 0081 Value EntryClique::chosenValue(const QString &field) const 0082 { 0083 Q_ASSERT_X(chosenValueMap[field].count() == 1, "Value EntryClique::chosenValue(const QString &field) const", "Exactly one value expected in chosenValueMap"); 0084 return chosenValueMap[field].first(); 0085 } 0086 0087 QVector<Value> EntryClique::chosenValues(const QString &field) const 0088 { 0089 return chosenValueMap[field]; 0090 } 0091 0092 void EntryClique::setChosenValue(const QString &field, const Value &value, ValueOperation valueOperation) 0093 { 0094 switch (valueOperation) { 0095 case ValueOperation::SetValue: { 0096 chosenValueMap[field].clear(); 0097 chosenValueMap[field] << value; 0098 break; 0099 } 0100 case ValueOperation::AddValue: { 0101 QString text = PlainTextValue::text(value); 0102 for (const Value &value : const_cast<const QVector<Value> &>(chosenValueMap[field])) 0103 if (PlainTextValue::text(value) == text) 0104 return; 0105 chosenValueMap[field] << value; 0106 break; 0107 } 0108 case ValueOperation::RemoveValue: { 0109 QString text = PlainTextValue::text(value); 0110 for (QVector<Value>::Iterator it = chosenValueMap[field].begin(); it != chosenValueMap[field].end(); ++it) 0111 if (PlainTextValue::text(*it) == text) { 0112 chosenValueMap[field].erase(it); 0113 return; 0114 } 0115 break; 0116 } 0117 } 0118 } 0119 0120 void EntryClique::addEntry(QSharedPointer<Entry> entry) 0121 { 0122 checkedEntries.insert(entry, false); /// remember to call recalculateValueMap later 0123 } 0124 0125 void EntryClique::recalculateValueMap() 0126 { 0127 valueMap.clear(); 0128 chosenValueMap.clear(); 0129 0130 /// go through each and every entry ... 0131 const QList<QSharedPointer<Entry> > el = entryList(); 0132 for (const auto &entry : el) 0133 if (isEntryChecked(entry)) { 0134 0135 /// cover entry type 0136 Value v; 0137 v.append(QSharedPointer<VerbatimText>(new VerbatimText(entry->type()))); 0138 insertKeyValueToValueMap(QStringLiteral("^type"), v, entry->type(), Qt::CaseInsensitive /** entry types shall be compared case insensitive */); 0139 0140 /// cover entry id 0141 v.clear(); 0142 v.append(QSharedPointer<VerbatimText>(new VerbatimText(entry->id()))); 0143 insertKeyValueToValueMap(QStringLiteral("^id"), v, entry->id()); 0144 0145 /// go through each and every field of this entry 0146 for (Entry::ConstIterator fieldIt = entry->constBegin(); fieldIt != entry->constEnd(); ++fieldIt) { 0147 /// store both field name and value for later reference 0148 const QString fieldName = fieldIt.key().toLower(); 0149 const Value fieldValue = fieldIt.value(); 0150 0151 if (fieldName == Entry::ftKeywords || fieldName == Entry::ftUrl) { 0152 for (const auto &vi : fieldValue) { 0153 const QString text = PlainTextValue::text(*vi); 0154 Value v; 0155 v << vi; 0156 insertKeyValueToValueMap(fieldName, v, text); 0157 } 0158 } else { 0159 const QString fieldValueText = PlainTextValue::text(fieldValue); 0160 insertKeyValueToValueMap(fieldName, fieldValue, fieldValueText); 0161 } 0162 } 0163 } 0164 0165 const auto fl = fieldList(); 0166 for (const QString &fieldName : fl) 0167 if (valueMap[fieldName].count() < 2) { 0168 valueMap.remove(fieldName); 0169 chosenValueMap.remove(fieldName); 0170 } 0171 } 0172 0173 void EntryClique::insertKeyValueToValueMap(const QString &fieldName, const Value &fieldValue, const QString &fieldValueText, const Qt::CaseSensitivity) 0174 { 0175 if (fieldValueText.isEmpty()) return; 0176 0177 if (valueMap.contains(fieldName)) { 0178 /// in the list of alternatives, search of a value identical 0179 /// to the current (as of fieldIt) value (to avoid duplicates) 0180 0181 bool alreadyContained = false; 0182 QVector<Value> alternatives = valueMap[fieldName]; 0183 for (const Value &v : const_cast<const QVector<Value> &>(alternatives)) 0184 if (PlainTextValue::text(v) == fieldValueText) { 0185 alreadyContained = true; 0186 break; 0187 } 0188 0189 if (!alreadyContained) { 0190 alternatives << fieldValue; 0191 valueMap[fieldName] = alternatives; 0192 } 0193 } else { 0194 QVector<Value> alternatives = valueMap[fieldName]; 0195 alternatives << fieldValue; 0196 valueMap.insert(fieldName, alternatives); 0197 QVector<Value> chosen; 0198 chosen << fieldValue; 0199 chosenValueMap.insert(fieldName, chosen); 0200 } 0201 } 0202 0203 class FindDuplicates::FindDuplicatesPrivate 0204 { 0205 private: 0206 const unsigned int maxDistance; 0207 int **d; 0208 static const int dsize; 0209 0210 public: 0211 int sensitivity; 0212 QWidget *widget; 0213 0214 FindDuplicatesPrivate(int sens, QWidget *w) 0215 : maxDistance(10000), sensitivity(sens), widget(w == nullptr ? qApp->activeWindow() : w) { 0216 d = new int *[dsize]; 0217 for (int i = 0; i < dsize; ++i) 0218 d[i] = new int[dsize]; 0219 } 0220 0221 ~FindDuplicatesPrivate() { 0222 for (int i = 0; i < dsize; ++i) delete[] d[i]; 0223 delete [] d; 0224 } 0225 0226 /** 0227 * Determine the Levenshtein distance between two words. 0228 * See also https://en.wikipedia.org/wiki/Levenshtein_distance 0229 * @param s first word, all chars already in lower case 0230 * @param t second word, all chars already in lower case 0231 * @return distance between both words 0232 */ 0233 double levenshteinDistanceWord(const QString &s, const QString &t) { 0234 const int m = qMin(s.length(), dsize - 1), n = qMin(t.length(), dsize - 1); 0235 if (m < 1 && n < 1) return 0.0; 0236 if (m < 1 || n < 1) return 1.0; 0237 0238 for (int i = 0; i <= m; ++i) 0239 d[i][0] = i; 0240 0241 for (int i = 0; i <= n; ++i) d[0][i] = i; 0242 0243 for (int i = 1; i <= m; ++i) 0244 for (int j = 1; j <= n; ++j) { 0245 d[i][j] = d[i - 1][j] + 1; 0246 int c = d[i][j - 1] + 1; 0247 if (c < d[i][j]) d[i][j] = c; 0248 c = d[i - 1][j - 1] + (s[i - 1] == t[j - 1] ? 0 : 1); 0249 if (c < d[i][j]) d[i][j] = c; 0250 } 0251 0252 double result = d[m][n]; 0253 0254 result = result / qMax(m, n); 0255 result *= result; 0256 return result; 0257 } 0258 0259 /** 0260 * Determine the Levenshtein distance between two sentences (list of words). 0261 * See also https://en.wikipedia.org/wiki/Levenshtein_distance 0262 * @param s first sentence 0263 * @param t second sentence 0264 * @return distance between both sentences 0265 */ 0266 double levenshteinDistance(const QStringList &s, const QStringList &t) { 0267 const int m = s.size(), n = t.size(); 0268 if (m < 1 && n < 1) return 0.0; 0269 if (m < 1 || n < 1) return 1.0; 0270 0271 double **d = new double*[m + 1]; 0272 for (int i = 0; i <= m; ++i) { 0273 d[i] = new double[n + 1]; d[i][0] = i; 0274 } 0275 for (int i = 0; i <= n; ++i) d[0][i] = i; 0276 0277 for (int i = 1; i <= m; ++i) 0278 for (int j = 1; j <= n; ++j) { 0279 d[i][j] = d[i - 1][j] + 1; 0280 double c = d[i][j - 1] + 1; 0281 if (c < d[i][j]) d[i][j] = c; 0282 c = d[i - 1][j - 1] + levenshteinDistanceWord(s[i - 1], t[j - 1]); 0283 if (c < d[i][j]) d[i][j] = c; 0284 } 0285 0286 double result = d[m][n]; 0287 for (int i = 0; i <= m; ++i) delete[] d[i]; 0288 delete [] d; 0289 0290 result = result / qMax(m, n); 0291 0292 return result; 0293 } 0294 0295 0296 /** 0297 * Determine the Levenshtein distance between two sentences, 0298 * where each sentence is in a string (not split into single words). 0299 * See also https://en.wikipedia.org/wiki/Levenshtein_distance 0300 * @param s first sentence 0301 * @param t second sentence 0302 * @return distance between both sentences 0303 */ 0304 double levenshteinDistance(const QString &s, const QString &t) { 0305 static const QRegularExpression nonWordRegExp(QStringLiteral("[^a-z']+"), QRegularExpression::CaseInsensitiveOption); 0306 if (s.isEmpty() || t.isEmpty()) return 1.0; 0307 #if QT_VERSION >= 0x050e00 0308 return levenshteinDistance(s.toLower().split(nonWordRegExp, Qt::SkipEmptyParts), t.toLower().split(nonWordRegExp, Qt::SkipEmptyParts)); 0309 #else // QT_VERSION < 0x050e00 0310 return levenshteinDistance(s.toLower().split(nonWordRegExp, QString::SkipEmptyParts), t.toLower().split(nonWordRegExp, QString::SkipEmptyParts)); 0311 #endif // QT_VERSION >= 0x050e00 0312 } 0313 0314 /** 0315 * Distance between two BibTeX entries, scaled by maxDistance. 0316 */ 0317 int entryDistance(Entry *entryA, Entry *entryB) { 0318 /// "distance" to be used if no value for a field is given 0319 const double neutralDistance = 0.05; 0320 0321 /** 0322 * Get both entries' titles. If both are empty, use a "neutral 0323 * distance" otherwise compute levenshtein distance (0.0 .. 1.0). 0324 */ 0325 const QString titleA = PlainTextValue::text(entryA->value(Entry::ftTitle)); 0326 const QString titleB = PlainTextValue::text(entryB->value(Entry::ftTitle)); 0327 double titleDistance = titleA.isEmpty() && titleB.isEmpty() ? neutralDistance : levenshteinDistance(titleA, titleB); 0328 0329 /** 0330 * Get both entries' author names. If both are empty, use a 0331 * "neutral distance" otherwise compute levenshtein distance 0332 * (0.0 .. 1.0). 0333 */ 0334 const QString authorA = PlainTextValue::text(entryA->value(Entry::ftAuthor)); 0335 const QString authorB = PlainTextValue::text(entryB->value(Entry::ftAuthor)); 0336 double authorDistance = authorA.isEmpty() && authorB.isEmpty() ? neutralDistance : levenshteinDistance(authorA, authorB); 0337 0338 /** 0339 * Get both entries' years. If both are empty, use a 0340 * "neutral distance" otherwise compute distance as follows: 0341 * take square of difference between both years, but impose 0342 * a maximum of 100. Divide value by 100.0 to get a distance 0343 * value of 0.0 .. 1.0. 0344 */ 0345 const QString yearA = PlainTextValue::text(entryA->value(Entry::ftYear)); 0346 const QString yearB = PlainTextValue::text(entryB->value(Entry::ftYear)); 0347 bool yearAok = false, yearBok = false; 0348 int yearAint = yearA.toInt(&yearAok); 0349 int yearBint = yearB.toInt(&yearBok); 0350 double yearDistance = yearAok && yearBok ? qMin((yearBint - yearAint) * (yearBint - yearAint), 100) / 100.0 : neutralDistance; 0351 0352 /** 0353 * Compute total distance by taking individual distances for 0354 * author, title, and year. Weight each individual distance as 0355 * follows: title => 60%, author => 30%, year => 10% 0356 * Scale distance by maximum distance and round to int; result 0357 * will be in range 0 .. maxDistance. 0358 */ 0359 int distance = static_cast<int>(maxDistance * (titleDistance * 0.6 + authorDistance * 0.3 + yearDistance * 0.1) + 0.5); 0360 0361 return distance; 0362 } 0363 0364 }; 0365 0366 const int FindDuplicates::FindDuplicatesPrivate::dsize = 32; 0367 0368 0369 FindDuplicates::FindDuplicates(QWidget *parent, int sensitivity) 0370 : QObject(parent), d(new FindDuplicatesPrivate(sensitivity, parent)) 0371 { 0372 /// nothing 0373 } 0374 0375 FindDuplicates::~FindDuplicates() 0376 { 0377 delete d; 0378 } 0379 0380 bool FindDuplicates::findDuplicateEntries(File *file, QVector<EntryClique *> &entryCliqueList) 0381 { 0382 QApplication::setOverrideCursor(Qt::WaitCursor); 0383 QScopedPointer<QProgressDialog> progressDlg(new QProgressDialog(i18n("Searching ..."), i18n("Cancel"), 0, 100000 /* to be set later to actual value */, d->widget)); 0384 progressDlg->setModal(true); 0385 progressDlg->setWindowTitle(i18nc("@title:window", "Finding Duplicates")); 0386 progressDlg->setMinimumWidth(d->widget->fontMetrics().averageCharWidth() * 48); 0387 progressDlg->setAutoReset(false); 0388 entryCliqueList.clear(); 0389 0390 /// assemble list of entries only (ignoring comments, macros, ...) 0391 QVector<QSharedPointer<Entry> > listOfEntries; 0392 listOfEntries.reserve(file->size()); 0393 for (const auto &element : const_cast<const File &>(*file)) { 0394 QSharedPointer<Entry> e = element.dynamicCast<Entry>(); 0395 if (!e.isNull() && !e->isEmpty()) 0396 listOfEntries << e; 0397 } 0398 0399 if (listOfEntries.isEmpty()) { 0400 /// no entries to compare found 0401 entryCliqueList.clear(); 0402 QApplication::restoreOverrideCursor(); 0403 return progressDlg->wasCanceled(); 0404 } 0405 0406 int curProgress = 0, maxProgress = listOfEntries.count() * (listOfEntries.count() - 1) / 2; 0407 int progressDelta = 1; 0408 0409 progressDlg->setMaximum(maxProgress); 0410 progressDlg->show(); 0411 0412 Q_EMIT maximumProgress(maxProgress); 0413 0414 /// go through all entries ... 0415 for (const auto &entry : const_cast<const QVector<QSharedPointer<Entry> > &>(listOfEntries)) { 0416 QApplication::instance()->processEvents(); 0417 if (progressDlg->wasCanceled()) { 0418 entryCliqueList.clear(); 0419 break; 0420 } 0421 0422 progressDlg->setValue(curProgress); 0423 Q_EMIT currentProgress(curProgress); 0424 /// ... and find a "clique" of entries where it will match, i.e. distance is below sensitivity 0425 0426 /// assume current entry will match in no clique 0427 bool foundClique = false; 0428 0429 /// go through all existing cliques 0430 for (QVector<EntryClique *>::Iterator cit = entryCliqueList.begin(); cit != entryCliqueList.end(); ++cit) { 0431 /// check distance between current entry and clique's first entry 0432 if (d->entryDistance(entry.data(), (*cit)->entryList().constFirst().data()) < d->sensitivity) { 0433 /// if distance is below sensitivity, add current entry to clique 0434 foundClique = true; 0435 (*cit)->addEntry(entry); 0436 break; 0437 } 0438 0439 QApplication::instance()->processEvents(); 0440 if (progressDlg->wasCanceled()) { 0441 entryCliqueList.clear(); 0442 break; 0443 } 0444 } 0445 0446 if (!progressDlg->wasCanceled() && !foundClique) { 0447 /// no clique matched to current entry, so create and add new clique 0448 /// consisting only of the current entry 0449 EntryClique *newClique = new EntryClique(); 0450 newClique->addEntry(entry); 0451 entryCliqueList << newClique; 0452 } 0453 0454 curProgress += progressDelta; 0455 ++progressDelta; 0456 progressDlg->setValue(curProgress); 0457 0458 Q_EMIT currentProgress(curProgress); 0459 } 0460 0461 progressDlg->setValue(progressDlg->maximum()); 0462 0463 /// remove cliques with only one element (nothing to merge here) from the list of cliques 0464 for (QVector<EntryClique *>::Iterator cit = entryCliqueList.begin(); cit != entryCliqueList.end();) 0465 if ((*cit)->entryCount() < 2) { 0466 EntryClique *ec = *cit; 0467 cit = entryCliqueList.erase(cit); 0468 delete ec; 0469 } else { 0470 /// entries have been inserted as checked, 0471 /// therefore recalculate alternatives 0472 (*cit)->recalculateValueMap(); 0473 0474 ++cit; 0475 } 0476 0477 QApplication::restoreOverrideCursor(); 0478 return progressDlg->wasCanceled(); 0479 } 0480 0481 0482 MergeDuplicates::MergeDuplicates() 0483 { 0484 /// nothing 0485 } 0486 0487 bool MergeDuplicates::mergeDuplicateEntries(const QVector<EntryClique *> &entryCliques, FileModel *fileModel) 0488 { 0489 bool didMerge = false; 0490 0491 for (EntryClique *entryClique : entryCliques) { 0492 /// Avoid adding fields 20 lines below 0493 /// which have been remove (not added) 10 lines below 0494 QSet<QString> coveredFields; 0495 0496 Entry *mergedEntry = new Entry(QString(), QString()); 0497 const auto fieldList = entryClique->fieldList(); 0498 coveredFields.reserve(fieldList.size()); 0499 for (const auto &field : fieldList) { 0500 coveredFields << field; 0501 if (field == QStringLiteral("^id")) 0502 mergedEntry->setId(PlainTextValue::text(entryClique->chosenValue(field))); 0503 else if (field == QStringLiteral("^type")) 0504 mergedEntry->setType(PlainTextValue::text(entryClique->chosenValue(field))); 0505 else { 0506 Value combined; 0507 const auto chosenValues = entryClique->chosenValues(field); 0508 for (const Value &v : chosenValues) { 0509 combined.append(v); 0510 } 0511 if (!combined.isEmpty()) 0512 mergedEntry->insert(field, combined); 0513 } 0514 } 0515 0516 bool actuallyMerged = false; 0517 int preferredInsertionRow = -1; 0518 const auto entryList = entryClique->entryList(); 0519 for (const auto &entry : entryList) { 0520 /// if merging entries with identical ids, the merged entry will not yet have an id (is null) 0521 if (mergedEntry->id().isEmpty()) 0522 mergedEntry->setId(entry->id()); 0523 /// if merging entries with identical types, the merged entry will not yet have an type (is null) 0524 if (mergedEntry->type().isEmpty()) 0525 mergedEntry->setType(entry->type()); 0526 0527 /// add all other fields not covered by user selection 0528 /// those fields did only occur in one entry (no conflict) 0529 /// may add a lot of bloat to merged entry 0530 if (entryClique->isEntryChecked(entry)) { 0531 actuallyMerged = true; 0532 for (Entry::ConstIterator it = entry->constBegin(); it != entry->constEnd(); ++it) 0533 if (!mergedEntry->contains(it.key()) && !coveredFields.contains(it.key())) { 0534 mergedEntry->insert(it.key(), it.value()); 0535 coveredFields << it.key(); 0536 } 0537 const int row = fileModel->row(entry); 0538 if (preferredInsertionRow < 0) preferredInsertionRow = row; 0539 fileModel->removeRow(row); 0540 } 0541 } 0542 0543 if (actuallyMerged) { 0544 if (preferredInsertionRow < 0) preferredInsertionRow = fileModel->rowCount(); 0545 fileModel->insertRow(QSharedPointer<Entry>(mergedEntry), preferredInsertionRow); 0546 } else 0547 delete mergedEntry; 0548 didMerge |= actuallyMerged; 0549 } 0550 0551 return didMerge; 0552 }