File indexing completed on 2024-04-28 05:08:18
0001 /*************************************************************************** 0002 Copyright (C) 2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "entrycomparison.h" 0026 #include "entry.h" 0027 #include "field.h" 0028 #include "fieldformat.h" 0029 #include "collection.h" 0030 #include "utils/isbnvalidator.h" 0031 #include "utils/lccnvalidator.h" 0032 0033 using Tellico::EntryComparison; 0034 0035 QUrl EntryComparison::s_documentUrl; 0036 0037 void EntryComparison::setDocumentUrl(const QUrl& url_) { 0038 s_documentUrl = url_; 0039 } 0040 0041 int EntryComparison::score(const Tellico::Data::EntryPtr& e1, const Tellico::Data::EntryPtr& e2, 0042 const QString& f, const Tellico::Data::Collection* c) { 0043 return score(e1, e2, c->fieldByName(f)); 0044 } 0045 0046 int EntryComparison::score(const Tellico::Data::EntryPtr& e1, const Tellico::Data::EntryPtr& e2, Tellico::Data::FieldPtr f) { 0047 if(!e1 || !e2 || !f) { 0048 return MATCH_VALUE_NONE; 0049 } 0050 QString s1 = e1->field(f); 0051 if(s1.isEmpty()) { 0052 return MATCH_VALUE_NONE; 0053 } 0054 QString s2 = e2->field(f); 0055 if(s2.isEmpty()) { 0056 return MATCH_VALUE_NONE; 0057 } 0058 // complicated string matching, here are the cases I want to match 0059 // "bend it like beckham" == "bend it like beckham (widescreen edition)" 0060 // "the return of the king" == "return of the king" 0061 if(s1 == s2) { 0062 return MATCH_VALUE_STRONG; 0063 } 0064 // special case for isbn 0065 if(f->name() == QStringLiteral("isbn")) { 0066 return ISBNValidator::isbn10(s1) == ISBNValidator::isbn10(s2) ? MATCH_VALUE_STRONG : MATCH_VALUE_NONE; 0067 } 0068 if(f->name() == QStringLiteral("lccn")) { 0069 return LCCNValidator::formalize(s1) == LCCNValidator::formalize(s2) ? MATCH_VALUE_STRONG : MATCH_VALUE_NONE; 0070 } 0071 if(f->name() == QStringLiteral("url") && e1->collection() && e1->collection()->type() == Data::Collection::File) { 0072 // versions before 1.2.7 could have saved the url without the protocol 0073 QUrl u1(s1); 0074 QUrl u2(s2); 0075 return (u1 == u2 || 0076 (f->property(QStringLiteral("relative")) == QStringLiteral("true") && 0077 s_documentUrl.resolved(u1) == s_documentUrl.resolved(u2))) ? MATCH_VALUE_STRONG : MATCH_VALUE_NONE; 0078 } 0079 if(f->name() == QStringLiteral("imdb")) { 0080 // imdb might be a different host since we query akas.imdb.com and normally it is www.imdb.com 0081 QUrl us1 = QUrl::fromUserInput(s1); 0082 QUrl us2 = QUrl::fromUserInput(s2); 0083 us1.setHost(QString()); 0084 us2.setHost(QString()); 0085 return (us1 == us2) ? MATCH_VALUE_STRONG : MATCH_VALUE_BAD; 0086 } 0087 if(f->formatType() == FieldFormat::FormatName) { 0088 const QString s1n = e1->formattedField(f, FieldFormat::ForceFormat); 0089 const QString s2n = e2->formattedField(f, FieldFormat::ForceFormat); 0090 if(s1n == s2n) { 0091 // let this one fall through if no match, without returning 0 0092 return MATCH_VALUE_STRONG; 0093 } 0094 } 0095 // now do case-insensitive comparison 0096 if(s1.compare(s2, Qt::CaseInsensitive) == 0) { 0097 return MATCH_VALUE_STRONG; 0098 } 0099 0100 if(f->formatType() == FieldFormat::FormatTitle) { 0101 const QString s1t = e1->formattedField(f, FieldFormat::ForceFormat); 0102 const QString s2t = e2->formattedField(f, FieldFormat::ForceFormat); 0103 if(s1t.compare(s2t, Qt::CaseInsensitive) == 0) { 0104 // let this one fall through if no match, without returning 0 0105 return MATCH_VALUE_WEAK; 0106 } 0107 } 0108 if(f->hasFlag(Data::Field::AllowMultiple)) { 0109 QStringList sl1 = FieldFormat::splitValue(e1->field(f)); 0110 QStringList sl2 = FieldFormat::splitValue(e2->field(f)); 0111 int matches = 0; 0112 for(QStringList::ConstIterator it = sl1.constBegin(); it != sl1.constEnd(); ++it) { 0113 matches += MATCH_VALUE_STRONG*sl2.count(*it); 0114 } 0115 if(matches == 0 && f->formatType() == FieldFormat::FormatName) { 0116 sl1 = FieldFormat::splitValue(e1->formattedField(f, FieldFormat::ForceFormat)); 0117 sl2 = FieldFormat::splitValue(e2->formattedField(f, FieldFormat::ForceFormat)); 0118 for(QStringList::ConstIterator it = sl1.constBegin(); it != sl1.constEnd(); ++it) { 0119 matches += MATCH_VALUE_STRONG*sl2.count(*it); 0120 } 0121 } 0122 return matches / sl1.count(); 0123 } 0124 if(f->name() == QStringLiteral("arxiv")) { 0125 // normalize and unVersion arxiv ID 0126 static const QRegularExpression rx1(QStringLiteral("^arxiv:")); 0127 static const QRegularExpression rx2(QStringLiteral("v\\d+$")); 0128 s1.remove(rx1); 0129 s1.remove(rx2); 0130 s2.remove(rx1); 0131 s2.remove(rx2); 0132 return (s1 == s2) ? MATCH_VALUE_STRONG : MATCH_VALUE_BAD; 0133 } 0134 0135 // last resort try removing punctuation 0136 static const QRegularExpression notAlphaNum(QStringLiteral("[^\\s\\w]")); 0137 QString s1a = s1; 0138 s1a.remove(notAlphaNum); 0139 QString s2a = s2; 0140 s2a.remove(notAlphaNum); 0141 if(!s1a.isEmpty() && s1a.compare(s2a, Qt::CaseInsensitive) == 0) { 0142 return MATCH_VALUE_STRONG; 0143 } 0144 return MATCH_VALUE_BAD; 0145 }