File indexing completed on 2024-04-28 05:08:18

0001 /***************************************************************************
0002     Copyright (C) 2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "entrycomparison.h"
0026 #include "entry.h"
0027 #include "field.h"
0028 #include "fieldformat.h"
0029 #include "collection.h"
0030 #include "utils/isbnvalidator.h"
0031 #include "utils/lccnvalidator.h"
0032 
0033 using Tellico::EntryComparison;
0034 
0035 QUrl EntryComparison::s_documentUrl;
0036 
0037 void EntryComparison::setDocumentUrl(const QUrl& url_) {
0038   s_documentUrl = url_;
0039 }
0040 
0041 int EntryComparison::score(const Tellico::Data::EntryPtr& e1, const Tellico::Data::EntryPtr& e2,
0042                            const QString& f, const Tellico::Data::Collection* c) {
0043   return score(e1, e2, c->fieldByName(f));
0044 }
0045 
0046 int EntryComparison::score(const Tellico::Data::EntryPtr& e1, const Tellico::Data::EntryPtr& e2, Tellico::Data::FieldPtr f) {
0047   if(!e1 || !e2 || !f) {
0048     return MATCH_VALUE_NONE;
0049   }
0050   QString s1 = e1->field(f);
0051   if(s1.isEmpty()) {
0052     return MATCH_VALUE_NONE;
0053   }
0054   QString s2 = e2->field(f);
0055   if(s2.isEmpty()) {
0056     return MATCH_VALUE_NONE;
0057   }
0058   // complicated string matching, here are the cases I want to match
0059   // "bend it like beckham" == "bend it like beckham (widescreen edition)"
0060   // "the return of the king" == "return of the king"
0061   if(s1 == s2) {
0062     return MATCH_VALUE_STRONG;
0063   }
0064   // special case for isbn
0065   if(f->name() == QStringLiteral("isbn")) {
0066     return ISBNValidator::isbn10(s1) == ISBNValidator::isbn10(s2) ? MATCH_VALUE_STRONG : MATCH_VALUE_NONE;
0067   }
0068   if(f->name() == QStringLiteral("lccn")) {
0069     return LCCNValidator::formalize(s1) == LCCNValidator::formalize(s2) ? MATCH_VALUE_STRONG : MATCH_VALUE_NONE;
0070   }
0071   if(f->name() == QStringLiteral("url") && e1->collection() && e1->collection()->type() == Data::Collection::File) {
0072     // versions before 1.2.7 could have saved the url without the protocol
0073     QUrl u1(s1);
0074     QUrl u2(s2);
0075     return (u1 == u2 ||
0076             (f->property(QStringLiteral("relative")) == QStringLiteral("true") &&
0077              s_documentUrl.resolved(u1) == s_documentUrl.resolved(u2))) ? MATCH_VALUE_STRONG : MATCH_VALUE_NONE;
0078   }
0079   if(f->name() == QStringLiteral("imdb")) {
0080     // imdb might be a different host since we query akas.imdb.com and normally it is www.imdb.com
0081     QUrl us1 = QUrl::fromUserInput(s1);
0082     QUrl us2 = QUrl::fromUserInput(s2);
0083     us1.setHost(QString());
0084     us2.setHost(QString());
0085     return (us1 == us2) ? MATCH_VALUE_STRONG : MATCH_VALUE_BAD;
0086   }
0087   if(f->formatType() == FieldFormat::FormatName) {
0088     const QString s1n = e1->formattedField(f, FieldFormat::ForceFormat);
0089     const QString s2n = e2->formattedField(f, FieldFormat::ForceFormat);
0090     if(s1n == s2n) {
0091       // let this one fall through if no match, without returning 0
0092       return MATCH_VALUE_STRONG;
0093     }
0094   }
0095   // now do case-insensitive comparison
0096   if(s1.compare(s2, Qt::CaseInsensitive) == 0) {
0097     return MATCH_VALUE_STRONG;
0098   }
0099 
0100   if(f->formatType() == FieldFormat::FormatTitle) {
0101     const QString s1t = e1->formattedField(f, FieldFormat::ForceFormat);
0102     const QString s2t = e2->formattedField(f, FieldFormat::ForceFormat);
0103     if(s1t.compare(s2t, Qt::CaseInsensitive) == 0) {
0104       // let this one fall through if no match, without returning 0
0105       return MATCH_VALUE_WEAK;
0106     }
0107   }
0108   if(f->hasFlag(Data::Field::AllowMultiple)) {
0109     QStringList sl1 = FieldFormat::splitValue(e1->field(f));
0110     QStringList sl2 = FieldFormat::splitValue(e2->field(f));
0111     int matches = 0;
0112     for(QStringList::ConstIterator it = sl1.constBegin(); it != sl1.constEnd(); ++it) {
0113       matches += MATCH_VALUE_STRONG*sl2.count(*it);
0114     }
0115     if(matches == 0 && f->formatType() == FieldFormat::FormatName) {
0116       sl1 = FieldFormat::splitValue(e1->formattedField(f, FieldFormat::ForceFormat));
0117       sl2 = FieldFormat::splitValue(e2->formattedField(f, FieldFormat::ForceFormat));
0118       for(QStringList::ConstIterator it = sl1.constBegin(); it != sl1.constEnd(); ++it) {
0119         matches += MATCH_VALUE_STRONG*sl2.count(*it);
0120       }
0121     }
0122     return matches / sl1.count();
0123   }
0124   if(f->name() == QStringLiteral("arxiv")) {
0125     // normalize and unVersion arxiv ID
0126     static const QRegularExpression rx1(QStringLiteral("^arxiv:"));
0127     static const QRegularExpression rx2(QStringLiteral("v\\d+$"));
0128     s1.remove(rx1);
0129     s1.remove(rx2);
0130     s2.remove(rx1);
0131     s2.remove(rx2);
0132     return (s1 == s2) ? MATCH_VALUE_STRONG : MATCH_VALUE_BAD;
0133   }
0134 
0135   // last resort try removing punctuation
0136   static const QRegularExpression notAlphaNum(QStringLiteral("[^\\s\\w]"));
0137   QString s1a = s1;
0138   s1a.remove(notAlphaNum);
0139   QString s2a = s2;
0140   s2a.remove(notAlphaNum);
0141   if(!s1a.isEmpty() && s1a.compare(s2a, Qt::CaseInsensitive) == 0) {
0142     return MATCH_VALUE_STRONG;
0143   }
0144   return MATCH_VALUE_BAD;
0145 }