File indexing completed on 2024-05-19 16:18:48

0001 /***************************************************************************
0002     Copyright (C) 2004-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "imdbfetcher.h"
0026 #include "../utils/guiproxy.h"
0027 #include "../collections/videocollection.h"
0028 #include "../entry.h"
0029 #include "../field.h"
0030 #include "../fieldformat.h"
0031 #include "../images/imagefactory.h"
0032 #include "../utils/string_utils.h"
0033 #include "../tellico_debug.h"
0034 
0035 #include <KLocalizedString>
0036 #include <KConfigGroup>
0037 #include <KIO/Job>
0038 #include <KJobUiDelegate>
0039 #include <KAcceleratorManager>
0040 #include <KJobWidgets/KJobWidgets>
0041 
0042 #include <QSpinBox>
0043 #include <QRegExp>
0044 #include <QFile>
0045 #include <QMap>
0046 #include <QLabel>
0047 #include <QCheckBox>
0048 #include <QGroupBox>
0049 #include <QGridLayout>
0050 #include <QUrlQuery>
0051 #include <QJsonDocument>
0052 #include <QJsonParseError>
0053 #include <QJsonObject>
0054 #include <QRegularExpression>
0055 
// File-local tuning constants for the IMDb fetcher.
namespace {
  // number of results requested per page; the result limit grows by this amount
  constexpr uint IMDB_MAX_RESULTS = 20;
  // default for the "Max Cast" configuration entry
  constexpr uint IMDB_DEFAULT_CAST_SIZE = 10;
  // limit number of directors, writers, etc, esp for TV series
  constexpr int IMDB_MAX_PERSON_COUNT = 5;
  // simply takes too long otherwise
  constexpr int IMDB_MAX_SEASON_COUNT = 5;
}
0062 
0063 using namespace Tellico;
0064 using Tellico::Fetch::IMDBFetcher;
0065 
0066 QRegExp* IMDBFetcher::s_tagRx = nullptr;
0067 QRegExp* IMDBFetcher::s_anchorRx = nullptr;
0068 QRegExp* IMDBFetcher::s_anchorTitleRx = nullptr;
0069 QRegExp* IMDBFetcher::s_anchorNameRx = nullptr;
0070 QRegExp* IMDBFetcher::s_titleRx = nullptr;
0071 const QRegularExpression* IMDBFetcher::s_titleIdRx = nullptr;
0072 int IMDBFetcher::s_instanceCount = 0;
0073 
0074 // static
0075 void IMDBFetcher::initRegExps() {
0076   s_tagRx = new QRegExp(QStringLiteral("<.*>"));
0077   s_tagRx->setMinimal(true);
0078 
0079   s_anchorRx = new QRegExp(QStringLiteral("<a\\s+[^>]*href\\s*=\\s*\"([^\"]+)\"[^<]*>([^<]+)</a>"), Qt::CaseInsensitive);
0080   s_anchorRx->setMinimal(true);
0081 
0082   s_anchorTitleRx = new QRegExp(QStringLiteral("<a\\s+[^>]*href\\s*=\\s*\"([^\"]*/title/[^\"]*)\"[^<]*>([^<]*)</a>"), Qt::CaseInsensitive);
0083   s_anchorTitleRx->setMinimal(true);
0084 
0085   s_anchorNameRx = new QRegExp(QStringLiteral("<a\\s+[^>]*href\\s*=\\s*\"([^\"]*/name/[^\"]*)\"[^<]*>(.+)</a>"), Qt::CaseInsensitive);
0086   s_anchorNameRx->setMinimal(true);
0087 
0088   s_titleRx = new QRegExp(QStringLiteral("<title>(.*)</title>"), Qt::CaseInsensitive);
0089   s_titleRx->setMinimal(true);
0090 
0091   s_titleIdRx = new QRegularExpression(QStringLiteral("title/(tt\\d+)"));
0092 }
0093 
0094 void IMDBFetcher::deleteRegExps() {
0095   delete s_tagRx;
0096   s_tagRx = nullptr;
0097 
0098   delete s_anchorRx;
0099   s_anchorRx = nullptr;
0100 
0101   delete s_anchorTitleRx;
0102   s_anchorTitleRx = nullptr;
0103 
0104   delete s_anchorNameRx;
0105   s_anchorNameRx = nullptr;
0106 
0107   delete s_titleRx;
0108   s_titleRx = nullptr;
0109 
0110   delete s_titleIdRx;
0111   s_titleIdRx = nullptr;
0112 }
0113 
0114 // static
0115 const IMDBFetcher::LangData& IMDBFetcher::langData(int lang_) {
0116   Q_ASSERT(lang_ >= 0);
0117   Q_ASSERT(lang_ <  6);
0118   static LangData dataVector[6] = {
0119     {
0120       i18n("Internet Movie Database"),
0121       QStringLiteral("findSectionHeader"),
0122       QStringLiteral("Exact Matches"),
0123       QStringLiteral("Partial Matches"),
0124       QStringLiteral("Approx Matches"),
0125       QStringLiteral("findSectionHeader"),
0126       QStringLiteral("Other Results"),
0127       QStringLiteral("aka"),
0128       QStringLiteral("Directed by"),
0129       QStringLiteral("Written by"),
0130       QStringLiteral("Produced by"),
0131       QStringLiteral("runtime.*(\\d+)\\s+min"),
0132       QStringLiteral("aspect ratio"),
0133       QStringLiteral("also known as"),
0134       QStringLiteral("Production Co"),
0135       QStringLiteral("cast"),
0136       QStringLiteral("cast overview"),
0137       QStringLiteral("credited cast"),
0138       QStringLiteral("episodes"),
0139       QStringLiteral("Genre"),
0140       QStringLiteral("Sound"),
0141       QStringLiteral("Color"),
0142       QStringLiteral("Language"),
0143       QStringLiteral("Certification"),
0144       QStringLiteral("Country"),
0145       QStringLiteral("plot\\s+(outline|summary)(?!/)"),
0146       QStringLiteral("Music by")
0147     }, {
0148       i18n("Internet Movie Database (French)"),
0149       QStringLiteral("findSectionHeader"),
0150       QStringLiteral("Résultats Exacts"),
0151       QStringLiteral("Résultats Partiels"),
0152       QStringLiteral("Résultats Approximatif"),
0153       QStringLiteral("findSectionHeader"),
0154       QStringLiteral("Résultats Autres"),
0155       QStringLiteral("autre titre"),
0156       QStringLiteral("Réalisateur"),
0157       QStringLiteral("Scénarist"),
0158       QString(),
0159       QStringLiteral("Durée.*(\\d+)\\s+heur.*\\s+(\\d+)\\s+min"),
0160       QStringLiteral("Proportions de l’image"),
0161       QStringLiteral("Alias"),
0162       QStringLiteral("Sociétés de production"),
0163       QStringLiteral("Ensemble"),
0164       QStringLiteral("cast overview"), // couldn't get phrase
0165       QStringLiteral("credited cast"), // couldn't get phrase
0166       QStringLiteral("episodes"),
0167       QStringLiteral("Genre"),
0168       QStringLiteral("Mixage audio"),
0169       QStringLiteral("Couleur"),
0170       QStringLiteral("Langue"),
0171       QStringLiteral("Classification"),
0172       QStringLiteral("Pays d’origine"),
0173       QStringLiteral("Intrigue\\s*"),
0174       QString() // reference page doesn't seem to have localized composer
0175     }, {
0176       i18n("Internet Movie Database (Spanish)"),
0177       QStringLiteral("findSectionHeader"),
0178       QStringLiteral("Resultados Exactos"),
0179       QStringLiteral("Resultados Parciales"),
0180       QStringLiteral("Resultados Aproximados"),
0181       QStringLiteral("findSectionHeader"),
0182       QStringLiteral("Resultados Otros"),
0183       QStringLiteral("otro título"),
0184       QStringLiteral("Director"),
0185       QStringLiteral("Escritores"),
0186       QString(),
0187       QStringLiteral("Duración.*(\\d+)\\s+min"),
0188       QStringLiteral("Relación de Aspecto"),
0189       QStringLiteral("Conocido como"),
0190       QStringLiteral("Compañías Productores"),
0191       QStringLiteral("Reparto"),
0192       QStringLiteral("cast overview"), // couldn't get phrase
0193       QStringLiteral("credited cast"), // couldn't get phrase
0194       QStringLiteral("episodes"),
0195       QStringLiteral("Género"),
0196       QStringLiteral("Sonido"),
0197       QStringLiteral("Color"),
0198       QStringLiteral("Idioma"),
0199       QStringLiteral("Clasificación"),
0200       QStringLiteral("País"),
0201       QStringLiteral("Trama\\s*"),
0202       QString() // reference page doesn't seem to have localized composer
0203     }, {
0204       i18n("Internet Movie Database (German)"),
0205       QStringLiteral("findSectionHeader"),
0206       QStringLiteral("genaue Übereinstimmung"),
0207       QStringLiteral("teilweise Übereinstimmung"),
0208       QStringLiteral("näherungsweise Übereinstimmung"),
0209       QStringLiteral("findSectionHeader"),
0210       QStringLiteral("andere Übereinstimmung"),
0211       QStringLiteral("andere titel"),
0212       QStringLiteral("Regisseur"),
0213       QStringLiteral("Drehbuchautoren"),
0214       QString(),
0215       QStringLiteral("Länge.*(\\d+)\\s+min"),
0216       QStringLiteral("Seitenverhältnis"),
0217       QStringLiteral("Auch bekannt als"),
0218       QStringLiteral("Produktionsfirmen"),
0219       QStringLiteral("Besetzung"),
0220       QStringLiteral("cast overview"), // couldn't get phrase
0221       QStringLiteral("credited cast"), // couldn't get phrase
0222       QStringLiteral("episodes"),
0223       QStringLiteral("Genre"),
0224       QStringLiteral("Tonverfahren"),
0225       QStringLiteral("Farbe"),
0226       QStringLiteral("Sprache"),
0227       QStringLiteral("Altersfreigabe"),
0228       QStringLiteral("Land"),
0229       QStringLiteral("Handlung\\s*"),
0230       QString() // reference page doesn't seem to have localized composer
0231     }, {
0232       i18n("Internet Movie Database (Italian)"),
0233       QStringLiteral("findSectionHeader"),
0234       QStringLiteral("risultati esatti"),
0235       QStringLiteral("risultati parziali"),
0236       QStringLiteral("risultati approssimati"),
0237       QStringLiteral("findSectionHeader"),
0238       QStringLiteral("Resultados Otros"),
0239       QStringLiteral("otro título"),
0240       QStringLiteral("Regista"),
0241       QStringLiteral("Sceneggiatori"),
0242       QString(),
0243       QStringLiteral("Durata.*(\\d+)\\s+min"),
0244       QStringLiteral("Aspect Ratio"),
0245       QStringLiteral("Alias"),
0246       QStringLiteral("Società di produzione"),
0247       QStringLiteral("Cast"),
0248       QStringLiteral("cast overview"), // couldn't get phrase
0249       QStringLiteral("credited cast"), // couldn't get phrase
0250       QStringLiteral("episodes"),
0251       QStringLiteral("Genere"),
0252       QStringLiteral("Sonoro"),
0253       QStringLiteral("Colore"),
0254       QStringLiteral("Lingua"),
0255       QStringLiteral("Divieti"),
0256       QStringLiteral("Nazionalità"),
0257       QStringLiteral("Trama\\s*"),
0258       QString() // reference page doesn't seem to have localized composer
0259     }, {
0260       i18n("Internet Movie Database (Portuguese)"),
0261       QStringLiteral("findSectionHeader"),
0262       QStringLiteral("Exato"),
0263       QStringLiteral("Combinação Parcial"),
0264       QStringLiteral("Combinação Aproximada"),
0265       QStringLiteral("findSectionHeader"),
0266       QStringLiteral("Combinação Otros"),
0267       QStringLiteral("otro título"),
0268       QStringLiteral("Diretor"),
0269       QStringLiteral("Escritores"),
0270       QString(),
0271       QStringLiteral("Duração.*(\\d+)\\s+min"),
0272       QStringLiteral("Resolução"),
0273       QStringLiteral("Também Conhecido Como"),
0274       QStringLiteral("Companhias de Produção"),
0275       QStringLiteral("Elenco"),
0276       QStringLiteral("cast overview"), // couldn't get phrase
0277       QStringLiteral("credited cast"), // couldn't get phrase
0278       QStringLiteral("episodes"),
0279       QStringLiteral("Gênero"),
0280       QStringLiteral("Mixagem de Som"),
0281       QStringLiteral("Cor"),
0282       QStringLiteral("Lingua"),
0283       QStringLiteral("Certificação"),
0284       QStringLiteral("País"),
0285       QStringLiteral("Argumento\\s*"),
0286       QString() // reference page doesn't seem to have localized composer
0287     }
0288   };
0289 
0290   return dataVector[qBound(0, lang_, static_cast<int>(sizeof(dataVector)/sizeof(LangData)))];
0291 }
0292 
0293 IMDBFetcher::IMDBFetcher(QObject* parent_) : Fetcher(parent_),
0294     m_job(nullptr), m_started(false), m_fetchImages(true),
0295     m_numCast(IMDB_DEFAULT_CAST_SIZE), m_redirected(false), m_limit(IMDB_MAX_RESULTS), m_lang(EN),
0296     m_currentTitleBlock(Unknown), m_countOffset(0) {
0297   if(!s_instanceCount++) {
0298     initRegExps();
0299   }
0300   m_host = QStringLiteral("www.imdb.com");
0301 }
0302 
0303 IMDBFetcher::~IMDBFetcher() {
0304   if(!--s_instanceCount) {
0305     deleteRegExps();
0306   }
0307 }
0308 
0309 QString IMDBFetcher::source() const {
0310   return m_name.isEmpty() ? defaultName() : m_name;
0311 }
0312 
0313 bool IMDBFetcher::canFetch(int type) const {
0314   return type == Data::Collection::Video;
0315 }
0316 
0317 // imdb can search title only
0318 bool IMDBFetcher::canSearch(Fetch::FetchKey k) const {
0319   return k == Title;
0320 }
0321 
0322 void IMDBFetcher::readConfigHook(const KConfigGroup& config_) {
0323   const int lang = config_.readEntry("Lang", int(EN));
0324   m_lang = static_cast<Lang>(lang);
0325   if(m_name.isEmpty()) {
0326     m_name = langData(m_lang).siteTitle;
0327   }
0328 
0329   m_numCast = config_.readEntry("Max Cast", IMDB_DEFAULT_CAST_SIZE);
0330   m_fetchImages = config_.readEntry("Fetch Images", true);
0331 }
0332 
0333 // multiple values not supported
0334 void IMDBFetcher::search() {
0335   m_started = true;
0336   m_redirected = false;
0337 
0338   m_matches.clear();
0339   m_popularTitles.clear();
0340   m_exactTitles.clear();
0341   m_partialTitles.clear();
0342   m_currentTitleBlock = Unknown;
0343   m_countOffset = 0;
0344 
0345   m_url = QUrl();
0346   m_url.setScheme(QStringLiteral("https"));
0347   m_url.setHost(m_host);
0348   m_url.setPath(QStringLiteral("/find/"));
0349 
0350   // as far as I can tell, the url encoding should always be iso-8859-1?
0351   QUrlQuery q;
0352   q.addQueryItem(QStringLiteral("q"), request().value());
0353 
0354   switch(request().key()) {
0355     case Title:
0356       q.addQueryItem(QStringLiteral("s"), QStringLiteral("tt"));
0357       m_url.setQuery(q);
0358       break;
0359 
0360     case Raw:
0361       m_url = QUrl(request().value());
0362       break;
0363 
0364     default:
0365       myWarning() << "not supported:" << request().key();
0366       stop();
0367       return;
0368   }
0369 //  myDebug() << m_url;
0370 
0371   m_job = KIO::storedGet(m_url, KIO::NoReload, KIO::HideProgressInfo);
0372   configureJob(m_job);
0373   connect(m_job.data(), &KJob::result,
0374           this, &IMDBFetcher::slotComplete);
0375   connect(m_job.data(), &KIO::TransferJob::redirection,
0376           this, &IMDBFetcher::slotRedirection);
0377 }
0378 
0379 void IMDBFetcher::continueSearch() {
0380   m_started = true;
0381   m_limit += IMDB_MAX_RESULTS;
0382 
0383   if(m_currentTitleBlock == Popular) {
0384     parseTitleBlock(m_popularTitles);
0385     // if the offset is 0, then we need to be looking at the next block
0386     m_currentTitleBlock = m_countOffset == 0 ? Exact : Popular;
0387   }
0388 
0389   // current title block might have changed
0390   if(m_currentTitleBlock == Exact) {
0391     parseTitleBlock(m_exactTitles);
0392     m_currentTitleBlock = m_countOffset == 0 ? Partial : Exact;
0393   }
0394 
0395   if(m_currentTitleBlock == Partial) {
0396     parseTitleBlock(m_partialTitles);
0397     m_currentTitleBlock = m_countOffset == 0 ? Approx : Partial;
0398   }
0399 
0400   if(m_currentTitleBlock == Approx) {
0401     parseTitleBlock(m_approxTitles);
0402     m_currentTitleBlock = m_countOffset == 0 ? Unknown : Approx;
0403   }
0404 
0405   m_hasMoreResults = false;
0406   stop();
0407 }
0408 
0409 void IMDBFetcher::stop() {
0410   if(!m_started) {
0411     return;
0412   }
0413   if(m_job) {
0414     m_job->kill();
0415     m_job = nullptr;
0416   }
0417 
0418   m_started = false;
0419   m_redirected = false;
0420 
0421   emit signalDone(this);
0422 }
0423 
0424 void IMDBFetcher::slotRedirection(KIO::Job*, const QUrl& toURL_) {
0425   static const QRegularExpression ttEndRx(QStringLiteral("/tt\\d+/$"));
0426   m_url = toURL_;
0427   if(m_url.path().contains(ttEndRx)) {
0428     m_url.setPath(m_url.path() + QStringLiteral("reference"));
0429   }
0430   m_redirected = true;
0431 }
0432 
0433 void IMDBFetcher::slotComplete(KJob*) {
0434   if(m_job->error()) {
0435     myDebug() << m_job->errorString();
0436     m_job->uiDelegate()->showErrorMessage();
0437     stop();
0438     return;
0439   }
0440 
0441   m_text = Tellico::fromHtmlData(m_job->data(), "UTF-8");
0442   if(m_text.isEmpty()) {
0443     myLog() << "No data returned";
0444     stop();
0445     return;
0446   }
0447   // see bug 319662. If fetcher is cancelled, job is killed
0448   // if the pointer is retained, it gets double-deleted
0449   m_job = nullptr;
0450 
0451 #if 0
0452   myWarning() << "Remove debug from imdbfetcher.cpp for /tmp/testimdbresults.html";
0453   QFile f(QString::fromLatin1("/tmp/testimdbresults.html"));
0454   if(f.open(QIODevice::WriteOnly)) {
0455     QTextStream t(&f);
0456     t.setCodec("UTF-8");
0457     t << m_text;
0458   }
0459   f.close();
0460 #endif
0461 
0462   // a single result was found if we got redirected
0463   switch(request().key()) {
0464     case Title:
0465       if(m_redirected) {
0466         parseSingleTitleResult();
0467       } else {
0468         parseMultipleTitleResults();
0469       }
0470       break;
0471 
0472     case Raw:
0473       parseSingleTitleResult();
0474       break;
0475 
0476     default:
0477       myWarning() << "skipping results";
0478       break;
0479   }
0480 }
0481 
0482 void IMDBFetcher::parseSingleTitleResult() {
0483   s_titleRx->indexIn(Tellico::decodeHTML(m_text));
0484   // split title at parenthesis
0485   const QString cap1 = s_titleRx->cap(1);
0486   int pPos = cap1.indexOf(QLatin1Char('('));
0487   // FIXME: maybe remove parentheses here?
0488   FetchResult* r = new FetchResult(this,
0489                                    pPos == -1 ? cap1 : cap1.left(pPos),
0490                                    pPos == -1 ? QString() : cap1.mid(pPos));
0491   // IMDB returns different HTML for single title results and has a query in the url
0492   // clear the query so we download the "canonical" page for the title
0493   QUrl url(m_url);
0494   url.setQuery(QString());
0495   m_matches.insert(r->uid, url);
0496   m_allMatches.insert(r->uid, url);
0497   emit signalResultFound(r);
0498 
0499   m_hasMoreResults = false;
0500   stop();
0501 }
0502 
0503 void IMDBFetcher::parseMultipleTitleResults() {
0504   QString output = Tellico::decodeHTML(m_text);
0505 
0506   const LangData& data = langData(m_lang);
0507   // IMDb can return three title lists, popular, exact, and partial
0508   // the popular titles are in the first table
0509   int pos_popular = output.indexOf(data.title_popular, 0,                    Qt::CaseInsensitive);
0510   int pos_exact   = output.indexOf(data.match_exact,   qMax(pos_popular, 0), Qt::CaseInsensitive);
0511   int pos_partial = output.indexOf(data.match_partial, qMax(pos_exact,   0), Qt::CaseInsensitive);
0512   int pos_approx  = output.indexOf(data.match_approx,  qMax(pos_partial, 0), Qt::CaseInsensitive);
0513 
0514   int end_popular = pos_exact; // keep track of where to end
0515   if(end_popular == -1) {
0516     end_popular = pos_partial == -1 ? (pos_approx == -1 ? output.length() : pos_approx) : pos_partial;
0517   }
0518   int end_exact = pos_partial; // keep track of where to end
0519   if(end_exact == -1) {
0520     end_exact = pos_approx == -1 ? output.length() : pos_approx;
0521   }
0522   int end_partial = pos_approx; // keep track of where to end
0523   if(end_partial == -1) {
0524     end_partial = output.length();
0525   }
0526 
0527   // if found popular matches
0528   if(pos_popular > -1) {
0529     m_popularTitles = output.mid(pos_popular, end_popular-pos_popular);
0530   }
0531   // if found exact matches
0532   if(pos_exact > -1) {
0533     m_exactTitles = output.mid(pos_exact, end_exact-pos_exact);
0534   }
0535   if(pos_partial > -1) {
0536     m_partialTitles = output.mid(pos_partial, end_partial-pos_partial);
0537   }
0538   if(pos_approx > -1) {
0539     m_approxTitles = output.mid(pos_approx);
0540   }
0541 
0542   parseTitleBlock(m_popularTitles);
0543   // if the offset is 0, then we need to be looking at the next block
0544   m_currentTitleBlock = m_countOffset == 0 ? Exact : Popular;
0545 
0546   if(m_matches.size() < m_limit) {
0547     parseTitleBlock(m_exactTitles);
0548     m_currentTitleBlock = m_countOffset == 0 ? Partial : Exact;
0549   }
0550 
0551   if(m_matches.size() < m_limit) {
0552     parseTitleBlock(m_partialTitles);
0553     m_currentTitleBlock = m_countOffset == 0 ? Approx : Partial;
0554   }
0555 
0556   if(m_matches.size() < m_limit) {
0557     parseTitleBlock(m_approxTitles);
0558     m_currentTitleBlock = m_countOffset == 0 ? Unknown : Approx;
0559   }
0560 
0561   // last resort
0562   if(m_matches.size() < m_limit) {
0563     const int pos_header = output.indexOf(QStringLiteral("ipc-page-content-container"));
0564     const int end_header = output.indexOf(QStringLiteral("cornerstone"), qMax(0, pos_header));
0565     if(pos_header > -1) {
0566       parseTitleBlock(output.mid(pos_header, end_header == -1 ? output.length() : end_header));
0567     }
0568   }
0569 
0570   if(m_matches.size() == 0) {
0571     myLog() << "no matches found.";
0572   }
0573 
0574   stop();
0575 }
0576 
0577 void IMDBFetcher::parseTitleBlock(const QString& str_) {
0578   if(str_.isEmpty()) {
0579     m_countOffset = 0;
0580     return;
0581   }
0582 
0583   static const QRegularExpression akaRx(QStringLiteral("%1 (.*?)(</li>|</td>|<br)").arg(langData(m_lang).aka),
0584                                         QRegularExpression::CaseInsensitiveOption);
0585   m_hasMoreResults = false;
0586 
0587   int count = 0;
0588   int start = s_anchorTitleRx->indexIn(str_);
0589   while(m_started && start > -1) {
0590     // split title at parenthesis
0591     const QString cap1 = s_anchorTitleRx->cap(1); // the anchor url
0592     const QString cap2 = s_anchorTitleRx->cap(2).trimmed(); // the anchor text
0593     start += s_anchorTitleRx->matchedLength();
0594     int pPos = cap2.indexOf(QLatin1Char('(')); // if it has parentheses, use that for description
0595     QString desc;
0596     if(pPos > -1) {
0597       int pPos2 = cap2.indexOf(QLatin1Char(')'), pPos+1);
0598       if(pPos2 > -1) {
0599         desc = cap2.mid(pPos+1, pPos2-pPos-1);
0600       }
0601     } else {
0602       // parenthesis might be outside anchor tag
0603       int end = s_anchorTitleRx->indexIn(str_, start);
0604       const int end2 = str_.indexOf(QStringLiteral("<img"), start);
0605       const int end3 = str_.indexOf(QStringLiteral("</ul"), start);
0606       if(end2 > -1) end = qMin(end, end2);
0607       if(end3 > -1) end = qMin(end, end3);
0608       if(end == -1) {
0609         end = str_.length();
0610       }
0611       const QString text = str_.mid(start, end-start);
0612       pPos = text.indexOf(QLatin1Char('('));
0613       if(pPos > -1) {
0614         const int pNewLine = text.indexOf(QStringLiteral("<br"));
0615         if(pNewLine == -1 || pPos < pNewLine) {
0616           const int pPos2 = text.indexOf(QLatin1Char(')'), pPos);
0617           desc = text.mid(pPos+1, pPos2-pPos-1);
0618         }
0619         // IMDB occasionally has (I) in results. If so, continue parsing string
0620         if(desc == QStringLiteral("I") || desc == QStringLiteral("II")) {
0621           pPos = text.indexOf(QLatin1Char('('), pPos+1);
0622           if(pPos > -1 && (pNewLine == -1 || pPos < pNewLine)) {
0623             const int pPos2 = text.indexOf(QLatin1Char(')'), pPos);
0624             desc = text.mid(pPos+1, pPos2-pPos-1);
0625           }
0626         }
0627         pPos = -1;
0628       } else {
0629         static const QRegularExpression digitsRx(QStringLiteral(">([-–\\d]+)\\s*<"));
0630         QRegularExpressionMatch digitsMatch = digitsRx.match(text);
0631         if(digitsMatch.hasMatch()) {
0632           desc = digitsMatch.captured(1);
0633         }
0634       }
0635     }
0636     auto akaMatch = akaRx.match(str_, start+1, QRegularExpression::NormalMatch);
0637     if(akaMatch.hasMatch()) {
0638       // limit to 50 chars
0639       desc += QLatin1Char(' ') + akaMatch.captured(1).trimmed().remove(*s_tagRx);
0640       if(desc.length() > 50) {
0641         desc = desc.left(50) + QStringLiteral("...");
0642       }
0643     }
0644 
0645     start = s_anchorTitleRx->indexIn(str_, start);
0646 
0647     if(count < m_countOffset) {
0648       ++count;
0649       continue;
0650     }
0651 
0652     // if we got this far, then there is a valid result
0653     if(m_matches.size() >= m_limit) {
0654       m_hasMoreResults = true;
0655       break;
0656     }
0657 
0658     FetchResult* r = new FetchResult(this, pPos == -1 ? cap2 : cap2.left(pPos), desc);
0659     QUrl u = QUrl(m_url).resolved(QUrl(cap1));
0660     u.setQuery(QString());
0661     m_matches.insert(r->uid, u);
0662     m_allMatches.insert(r->uid, u);
0663     emit signalResultFound(r);
0664     ++count;
0665   }
0666   if(!m_hasMoreResults && m_currentTitleBlock != Partial) {
0667     m_hasMoreResults = true;
0668   }
0669   m_countOffset = m_matches.size() < m_limit ? 0 : count;
0670 }
0671 
0672 Tellico::Data::EntryPtr IMDBFetcher::fetchEntryHook(uint uid_) {
0673   // if we already grabbed this one, then just pull it out of the dict
0674   Data::EntryPtr entry = m_entries[uid_];
0675   if(entry) {
0676     return entry;
0677   }
0678 
0679   if(!m_matches.contains(uid_) && !m_allMatches.contains(uid_)) {
0680     myLog() << "no url found";
0681     return Data::EntryPtr();
0682   }
0683   QUrl url = m_matches.contains(uid_) ? m_matches[uid_]
0684                                       : m_allMatches[uid_];
0685   static const QRegularExpression ttEndRx(QStringLiteral("/tt\\d+/$"));
0686   if(m_lang == EN && url.path().contains(ttEndRx))  {
0687     url.setPath(url.path() + QStringLiteral("reference"));
0688   }
0689 
0690   QUrl origURL = m_url; // keep to switch back
0691   QString results;
0692   // if the url matches the current one, no need to redownload it
0693   if(url == m_url) {
0694     results = Tellico::decodeHTML(m_text);
0695   } else {
0696     // now it's synchronous
0697     // be quiet about failure
0698     QPointer<KIO::StoredTransferJob> getJob = KIO::storedGet(url, KIO::NoReload, KIO::HideProgressInfo);
0699     configureJob(getJob);
0700     if(!getJob->exec()) {
0701       myWarning() << "...unable to read" << url;
0702       return Data::EntryPtr();
0703     }
0704     results = Tellico::fromHtmlData(getJob->data(), "UTF-8");
0705     m_url = url; // needed for processing
0706 #if 0
0707     myWarning() << "Remove debug from imdbfetcher.cpp for /tmp/testimdbresult.html";
0708     myDebug() << m_url;
0709     QFile f(QStringLiteral("/tmp/testimdbresult.html"));
0710     if(f.open(QIODevice::WriteOnly)) {
0711       QTextStream t(&f);
0712       t << results;
0713     }
0714     f.close();
0715 #endif
0716     results = Tellico::decodeHTML(results);
0717   }
0718   if(results.isEmpty()) {
0719     myLog() << "no text results";
0720     m_url = origURL;
0721     return Data::EntryPtr();
0722   }
0723 
0724   entry = parseEntry(results);
0725   m_url = origURL;
0726   if(!entry) {
0727     myDebug() << "error in processing entry";
0728     return Data::EntryPtr();
0729   }
0730   m_entries.insert(uid_, entry); // keep for later
0731   return entry;
0732 }
0733 
0734 Tellico::Data::EntryPtr IMDBFetcher::parseEntry(const QString& str_) {
0735   Data::CollPtr coll(new Data::VideoCollection(true));
0736   Data::EntryPtr entry(new Data::Entry(coll));
0737 
0738   doJson(str_, entry);
0739 
0740   doTitle(str_, entry);
0741   doRunningTime(str_, entry);
0742   doAspectRatio(str_, entry);
0743   doAlsoKnownAs(str_, entry);
0744   doPlot(str_, entry, m_url);
0745   if(m_lang == EN) {
0746     doLists(str_, entry);
0747   } else {
0748     doLists2(str_, entry);
0749   }
0750   doStudio(str_, entry);
0751   doPerson(str_, entry, langData(m_lang).director, QStringLiteral("director"));
0752   doPerson(str_, entry, langData(m_lang).writer, QStringLiteral("writer"));
0753   doPerson(str_, entry, langData(m_lang).composer, QStringLiteral("composer"));
0754   doRating(str_, entry);
0755   doCast(str_, entry, m_url);
0756   if(m_fetchImages) {
0757     // needs base URL
0758     doCover(str_, entry, m_url);
0759   }
0760   if(optionalFields().contains(QStringLiteral("episode"))) {
0761     doEpisodes(str_, entry, m_url);
0762   }
0763 
0764   const QString imdb = QStringLiteral("imdb");
0765   if(!coll->hasField(imdb) && optionalFields().contains(imdb)) {
0766     coll->addField(Data::Field::createDefaultField(Data::Field::ImdbField));
0767   }
0768   if(coll->hasField(imdb) && coll->fieldByName(imdb)->type() == Data::Field::URL) {
0769     m_url.setQuery(QString());
0770     // we want to strip the "/reference" from the url
0771     QString url = m_url.url();
0772     if(url.endsWith(QStringLiteral("/reference"))) {
0773       url = m_url.adjusted(QUrl::RemoveFilename).url();
0774     }
0775     entry->setField(imdb, url);
0776   }
0777   return entry;
0778 }
0779 
0780 void IMDBFetcher::doJson(const QString& str_, Tellico::Data::EntryPtr entry_) {
0781   static const QRegularExpression jsonRx(QStringLiteral("<script[^>]+?type=\"application/ld\\+json\".*?>(.+?)</script>"));
0782   QRegularExpressionMatch jsonMatch = jsonRx.match(str_);
0783   if(!jsonMatch.hasMatch()) {
0784     return;
0785   }
0786 
0787   QJsonParseError parseError;
0788   QJsonDocument doc = QJsonDocument::fromJson(jsonMatch.captured(1).toUtf8(), &parseError);
0789   if(doc.isNull()) {
0790     myDebug() << "Bad json data:" << parseError.errorString();
0791     return;
0792   }
0793 
0794   QVariantMap objectMap = doc.object().toVariantMap();
0795   entry_->setField(QStringLiteral("title"), mapValue(objectMap, "name"));
0796   entry_->setField(QStringLiteral("director"), mapValue(objectMap, "director", "name"));
0797   entry_->setField(QStringLiteral("plot"), mapValue(objectMap, "description"));
0798   entry_->setField(QStringLiteral("genre"), mapValue(objectMap, "genre"));
0799 
0800   QStringList writers;
0801   foreach(QVariant v, objectMap.value(QStringLiteral("creator")).toList()) {
0802     auto vmap = v.toMap();
0803     if(vmap.value(QLatin1String("@type")) == QLatin1String("Person")) {
0804       writers += vmap.value(QLatin1String("name")).toString();
0805     }
0806   }
0807   entry_->setField(QStringLiteral("writer"), writers.join(FieldFormat::delimiterString()));
0808 
0809   QString cert = mapValue(objectMap, "contentRating");
0810   // set default certification, assuming US for now
0811   if(cert == QStringLiteral("Unrated")) {
0812     cert = QLatin1Char('U');
0813   }
0814   cert += QStringLiteral(" (USA)");
0815   const QStringList& certsAllowed = entry_->collection()->fieldByName(QStringLiteral("certification"))->allowed();
0816   if(certsAllowed.contains(cert)) {
0817     entry_->setField(QStringLiteral("certification"), cert);
0818   }
0819 
0820   const QString imageUrl = mapValue(objectMap,"image");
0821   if(!imageUrl.isEmpty()) {
0822     QString id = ImageFactory::addImage(QUrl::fromUserInput(imageUrl), true);
0823     if(!id.isEmpty()) {
0824       entry_->setField(QStringLiteral("cover"), id);
0825     }
0826   }
0827 
0828   if(optionalFields().contains(QStringLiteral("imdb-rating"))) {
0829     if(!entry_->collection()->hasField(QStringLiteral("imdb-rating"))) {
0830       Data::FieldPtr f(new Data::Field(QStringLiteral("imdb-rating"), i18n("IMDb Rating"), Data::Field::Rating));
0831       f->setCategory(i18n("General"));
0832       f->setProperty(QStringLiteral("maximum"), QStringLiteral("10"));
0833       entry_->collection()->addField(f);
0834     }
0835 
0836     const QString ratingString = mapValue(objectMap, "aggregateRating", "ratingValue");
0837     bool ok = true;
0838     float value = ratingString.toFloat(&ok);
0839     if(!ok) {
0840       value = QLocale().toFloat(ratingString, &ok);
0841     }
0842     if(ok) {
0843       entry_->setField(QStringLiteral("imdb-rating"), QString::number(value));
0844     }
0845   }
0846 }
0847 
0848 void IMDBFetcher::doTitle(const QString& str_, Tellico::Data::EntryPtr entry_) {
0849   if(s_titleRx->indexIn(str_) > -1) {
0850     const QString cap1 = s_titleRx->cap(1);
0851     // titles always have parentheses
0852     int pPos = cap1.indexOf(QLatin1Char('('));
0853     QString title = cap1.left(pPos).trimmed();
0854     // remove first and last quotes is there
0855     if(title.startsWith(QLatin1Char('"')) && title.endsWith(QLatin1Char('"'))) {
0856       title = title.mid(1, title.length()-2);
0857     }
0858     entry_->setField(QStringLiteral("title"), title);
0859 
0860     // now for movies with original non-english titles, the <title> is english
0861     // but the page header is the original title. Grab the orig title
0862     static const QRegularExpression h3TitleRx(QStringLiteral("<h3[^>]+itemprop=\"name\"\\s*>(.*?)<"),
0863                                               QRegularExpression::DotMatchesEverythingOption);
0864     auto h3Match = h3TitleRx.match(str_);
0865     if(h3Match.hasMatch()) {
0866       QString possibleOrigTitle;
0867       const QString h3Title = h3Match.captured(1).trimmed();
0868       if(h3Title == title) {
0869         // some tv series have a original title label
0870         static const QRegularExpression origTitleRx(QLatin1String("/h3>(.*?)<span class=\"titlereference-original-title-label"),
0871                                                     QRegularExpression::DotMatchesEverythingOption);
0872         auto origTitleMatch = origTitleRx.match(str_);
0873         if(origTitleMatch.hasMatch()) {
0874           possibleOrigTitle = origTitleMatch.captured(1).trimmed();
0875         }
0876       } else {
0877         // mis-matching titles. If the user has requested original title,
0878         // put it in origtitle field and keep english as title
0879         // otherwise replace
0880         if(optionalFields().contains(QStringLiteral("origtitle"))) {
0881           possibleOrigTitle = h3Title;
0882         } else {
0883           entry_->setField(QStringLiteral("title"), h3Title);
0884         }
0885       }
0886       if(!possibleOrigTitle.isEmpty() && optionalFields().contains(QStringLiteral("origtitle"))) {
0887         Data::FieldPtr f(new Data::Field(QStringLiteral("origtitle"), i18n("Original Title")));
0888         f->setFormatType(FieldFormat::FormatTitle);
0889         entry_->collection()->addField(f);
0890         entry_->setField(QStringLiteral("origtitle"), possibleOrigTitle);
0891       }
0892     }
0893 
0894     // remove parentheses and extract year, tv shows can have (TV Series 2002-2003) for example
0895     int pPos2 = pPos+1;
0896     // find the closing parenthesis
0897     while(pPos2 < cap1.length() && cap1[pPos2] != QLatin1Char(')')) {
0898       ++pPos2;
0899     }
0900     const auto inParentheses = cap1.midRef(pPos+1, pPos2-pPos-1);
0901     if(!inParentheses.isEmpty()) {
0902       static const QRegularExpression yearRx(QLatin1String("\\d{4}")); // ignore ending year for tv series
0903       auto match = yearRx.match(inParentheses);
0904       if(match.hasMatch()) {
0905         entry_->setField(QStringLiteral("year"), match.captured());
0906       }
0907     }
0908   }
0909 }
0910 
0911 void IMDBFetcher::doRunningTime(const QString& str_, Tellico::Data::EntryPtr entry_) {
0912   // running time
0913   QRegExp runtimeRx(langData(m_lang).runtime, Qt::CaseInsensitive);
0914   runtimeRx.setMinimal(true);
0915 
0916   QString text = str_;
0917   text.remove(*s_tagRx);
0918   if(runtimeRx.indexIn(text) > -1) {
0919     if(m_lang == EN) {
0920       entry_->setField(QStringLiteral("running-time"), runtimeRx.cap(1));
0921     }
0922     else {
0923       const int hours = runtimeRx.cap(1).toInt();
0924       const int minutes = runtimeRx.cap(2).toInt();
0925       entry_->setField(QStringLiteral("running-time"), QString::number(hours*60+minutes));
0926     }
0927   }
0928 }
0929 
0930 void IMDBFetcher::doAspectRatio(const QString& str_, Tellico::Data::EntryPtr entry_) {
0931   QRegExp rx(QStringLiteral("%1.*([\\d\\.\\,]+\\s*:\\s*[\\d\\.\\,]+)").arg(langData(m_lang).aspect_ratio), Qt::CaseInsensitive);
0932   rx.setMinimal(true);
0933 
0934   if(rx.indexIn(str_) > -1) {
0935     entry_->setField(QStringLiteral("aspect-ratio"), rx.cap(1).trimmed());
0936   }
0937 }
0938 
0939 void IMDBFetcher::doAlsoKnownAs(const QString& str_, Tellico::Data::EntryPtr entry_) {
0940   if(!optionalFields().contains(QStringLiteral("alttitle"))) {
0941     return;
0942   }
0943 
0944   // match until next b tag
0945 //  QRegExp akaRx(QStringLiteral("also known as(.*)<b(?:\\s.*)?>"));
0946   QRegExp akaRx(QStringLiteral("%1(.*)(<a|<span)[>\\s/]").arg(langData(m_lang).also_known_as), Qt::CaseInsensitive);
0947   akaRx.setMinimal(true);
0948 
0949   if(akaRx.indexIn(str_) > -1 && !akaRx.cap(1).isEmpty()) {
0950     Data::FieldPtr f = entry_->collection()->fieldByName(QStringLiteral("alttitle"));
0951     if(!f) {
0952       f = new Data::Field(QStringLiteral("alttitle"), i18n("Alternative Titles"), Data::Field::Table);
0953       f->setFormatType(FieldFormat::FormatTitle);
0954       entry_->collection()->addField(f);
0955     }
0956 
0957     // split by </li>
0958     QStringList list = akaRx.cap(1).split(QStringLiteral("</li>"));
0959     // lang could be included with [fr]
0960 //    const QRegExp parRx(QStringLiteral("\\(.+\\)"));
0961     const QRegExp brackRx(QStringLiteral("\\[\\w+\\]"));
0962     const QRegExp countryRx(QStringLiteral("\\s*\\(.+\\)\\s*$"));
0963     QStringList values;
0964     for(QStringList::Iterator it = list.begin(); it != list.end(); ++it) {
0965       // sometimes the regexp doesn't work and grabs too much text
0966       // limit to reasonable length
0967       QString s = (*it).left(1000);
0968       // sometimes, the word "more" gets linked to the releaseinfo page, check that
0969       if(s.contains(QStringLiteral("releaseinfo"))) {
0970         continue;
0971       }
0972       s.remove(*s_tagRx);
0973       s.remove(brackRx);
0974       // remove country
0975       s.remove(countryRx);
0976       s.remove(QLatin1Char('"'));
0977       s = s.trimmed();
0978       // the first value ends up being or starting with the colon after "Also known as"
0979       // I'm too lazy to figure out a better regexp
0980       if(s.startsWith(QLatin1Char(':'))) {
0981         s = s.mid(1);
0982         s = s.trimmed();
0983       }
0984       if(!s.isEmpty()) {
0985         values += s;
0986       }
0987     }
0988     if(!values.isEmpty()) {
0989       entry_->setField(QStringLiteral("alttitle"), values.join(FieldFormat::rowDelimiterString()));
0990     }
0991 //  } else {
0992 //    myLog() << "'Also Known As' not found";
0993   }
0994 }
0995 
0996 void IMDBFetcher::doPlot(const QString& str_, Tellico::Data::EntryPtr entry_, const QUrl& baseURL_) {
0997   if(!entry_->field(QStringLiteral("plot")).isEmpty()) return;
0998   // before using localized plot string, look for DOM component
0999   const QRegularExpression sectionRx(QStringLiteral("<section class=\"titlereference-section-overview\">(.+?)</div"),
1000                                      QRegularExpression::DotMatchesEverythingOption);
1001   auto sectionMatch = sectionRx.match(str_);
1002   if(sectionMatch.hasMatch()) {
1003     QString thisPlot = sectionMatch.captured(1);
1004     // TV Series include the episode link first, before the plot, so don't be fooled
1005     if(!thisPlot.contains(QLatin1String("<a href"))) {
1006       thisPlot.remove(*s_tagRx); // remove HTML tags
1007       entry_->setField(QStringLiteral("plot"), thisPlot.simplified());
1008       return;
1009     }
1010   }
1011 
1012   // plot summaries provided by users are on a separate page
1013   // should those be preferred?
1014   bool useUserSummary = false;
1015 
1016   // match until next <p> tag
1017   QString plotRxStr = langData(m_lang).plot + QStringLiteral("(.*)</(p|div|li)");
1018   QRegExp plotRx(plotRxStr, Qt::CaseInsensitive);
1019   plotRx.setMinimal(true);
1020   const QRegularExpression plotUrlRx(QStringLiteral("<a\\s+?[^>]*href\\s*=\\s*\"[^\"]*?/title/[^\"]*?/plotsummary\""),
1021                                      QRegularExpression::CaseInsensitiveOption);
1022   if(plotRx.indexIn(str_) > -1) {
1023     QString thisPlot = plotRx.cap(2);
1024     // if ends with "Written by", remove it. It has an em tag
1025     thisPlot.remove(QRegExp(QStringLiteral("<em class=\"nobr\".*</em>")));
1026     thisPlot.remove(*s_tagRx); // remove HTML tags
1027     thisPlot = thisPlot.simplified();
1028     // if thisPlot ends with (more) or contains
1029     // a url that ends with plotsummary, then we'll grab it, otherwise not
1030     if(thisPlot.isEmpty() ||
1031        plotRx.cap(0).endsWith(QStringLiteral("(more)</")) ||
1032        plotRx.cap(0).contains(plotUrlRx)) {
1033       useUserSummary = true;
1034     } else {
1035       entry_->setField(QStringLiteral("plot"), thisPlot);
1036     }
1037   } else {
1038     useUserSummary = true;
1039   }
1040 
1041   if(useUserSummary) {
1042     auto idMatch = s_titleIdRx->match(baseURL_.path());
1043     Q_ASSERT(idMatch.hasMatch());
1044     QUrl plotURL = baseURL_;
1045     plotURL.setPath(QStringLiteral("/title/") + idMatch.captured(1) + QStringLiteral("/plotsummary"));
1046     QPointer<KIO::StoredTransferJob> getJob = KIO::storedGet(plotURL, KIO::NoReload, KIO::HideProgressInfo);
1047     configureJob(getJob);
1048     if(!getJob->exec()) {
1049       myWarning() << "...unable to read" << plotURL;
1050     }
1051     QString plotPage = Tellico::fromHtmlData(getJob->data(), "UTF-8");
1052 
1053     if(!plotPage.isEmpty()) {
1054       const QRegularExpression plotRx1(QStringLiteral("id=\"plot-summaries-content\">(.+)</p"),
1055                                        QRegularExpression::DotMatchesEverythingOption);
1056       QString userPlot;
1057       auto plotMatch = plotRx1.match(plotPage);
1058       if(plotMatch.hasMatch()) {
1059         userPlot = plotMatch.captured(1);
1060       } else {
1061         const QRegularExpression plotRx2(QStringLiteral("<div\\s+id\\s*=\\s*\"swiki.2.1\">(.+?)</d"),
1062                                          QRegularExpression::DotMatchesEverythingOption);
1063         plotMatch = plotRx2.match(plotPage);
1064         if(plotMatch.hasMatch()) {
1065           userPlot = plotMatch.captured(1);
1066          }
1067       }
1068       userPlot.remove(*s_tagRx); // remove HTML tags
1069       // remove last little "written by", if there
1070       userPlot.remove(QRegExp(QStringLiteral("\\s*written by.*$"), Qt::CaseInsensitive));
1071       if(!userPlot.isEmpty()) {
1072         entry_->setField(QStringLiteral("plot"), Tellico::decodeHTML(userPlot.simplified()));
1073       }
1074     }
1075   }
1076 //  myDebug() << "Plot:" << entry_->field(QStringLiteral("plot"));
1077 }
1078 
1079 void IMDBFetcher::doStudio(const QString& str_, Tellico::Data::EntryPtr entry_) {
1080   // match until next opening tag
1081 //  QRegExp productionRx(langData(m_lang).studio, Qt::CaseInsensitive);
1082   QRegExp productionRx(langData(m_lang).studio);
1083   productionRx.setMinimal(true);
1084 
1085   const int pos1 = str_.indexOf(productionRx);
1086   if(pos1 == -1) {
1087 //    myLog() << "No studio found";
1088     return;
1089   }
1090 
1091   int pos2 = str_.indexOf(QStringLiteral("blackcatheader"), pos1, Qt::CaseInsensitive);
1092   if(pos2 == -1) {
1093     pos2 = str_.length();
1094   }
1095   // stop matching when getting to Distributors
1096   int pos3 = str_.indexOf(QStringLiteral("Distributors"), pos1);
1097   if(pos3 > -1 && pos3 < pos2) {
1098     pos2 = pos3;
1099   }
1100 
1101   const QString text = str_.mid(pos1, pos2-pos1);
1102   const QString company = QStringLiteral("/company/");
1103   QStringList studios;
1104   for(int pos = s_anchorRx->indexIn(text); pos > -1; pos = s_anchorRx->indexIn(text, pos+s_anchorRx->matchedLength())) {
1105     const QString cap1 = s_anchorRx->cap(1);
1106     if(cap1.contains(company)) {
1107       studios += s_anchorRx->cap(2).trimmed();
1108     }
1109   }
1110 
1111   entry_->setField(QStringLiteral("studio"), studios.join(FieldFormat::delimiterString()));
1112 }
1113 
1114 void IMDBFetcher::doPerson(const QString& str_, Tellico::Data::EntryPtr entry_,
1115                            const QString& imdbHeader_, const QString& fieldName_) {
1116   // only read if the field value is currently empty
1117   if(!entry_->field(fieldName_).isEmpty()) return;
1118   QRegExp br2Rx(QStringLiteral("<br[\\s/]*>\\s*<br[\\s/]*>"), Qt::CaseInsensitive);
1119   br2Rx.setMinimal(true);
1120   QRegExp divRx(QStringLiteral("<div\\s[^>]*class\\s*=\\s*\"(?:ipl-header__content|info|txt-block)\"[^>]*>(.*)</table"), Qt::CaseInsensitive);
1121   divRx.setMinimal(true);
1122 
1123   const QString name = QStringLiteral("/name/");
1124   QStringList people;
1125   for(int pos = str_.indexOf(divRx); pos > -1; pos = str_.indexOf(divRx, pos+divRx.matchedLength())) {
1126     const QString infoBlock = divRx.cap(1);
1127     if(infoBlock.contains(imdbHeader_, Qt::CaseInsensitive)) {
1128       int pos2 = s_anchorRx->indexIn(infoBlock);
1129       while(pos2 > -1) {
1130         if(s_anchorRx->cap(1).contains(name)) {
1131           people += s_anchorRx->cap(2).trimmed();
1132         }
1133         pos2 = s_anchorRx->indexIn(infoBlock, pos2+s_anchorRx->matchedLength());
1134       }
1135       break;
1136     }
1137   }
1138   if(!people.isEmpty()) {
1139     people.removeDuplicates();
1140     entry_->setField(fieldName_, people.join(FieldFormat::delimiterString()));
1141   }
1142 }
1143 
1144 void IMDBFetcher::doCast(const QString& str_, Tellico::Data::EntryPtr entry_, const QUrl& baseURL_) {
1145   // the extended cast list is on a separate page
1146   // that's usually a lot of people
1147   // but since it can be in billing order, the main actors might not
1148   // be in the short list
1149   auto idMatch = s_titleIdRx->match(baseURL_.path());
1150   Q_ASSERT(idMatch.hasMatch());
1151   QUrl castURL = baseURL_;
1152   castURL.setPath(QStringLiteral("/title/") + idMatch.captured(1) + QStringLiteral("/fullcredits"));
1153 
1154   // be quiet about failure and be sure to translate entities
1155   QPointer<KIO::StoredTransferJob> getJob = KIO::storedGet(castURL, KIO::NoReload, KIO::HideProgressInfo);
1156   configureJob(getJob);
1157   if(!getJob->exec()) {
1158     myWarning() << "...unable to read" << castURL;
1159   }
1160   const QString castPage = Tellico::decodeHTML(Tellico::fromHtmlData(getJob->data(), "UTF-8"));
1161 #if 0
1162   myWarning() << "Remove debug from imdbfetcher.cpp (/tmp/testimdbcast.html)";
1163   QFile f(QString::fromLatin1("/tmp/testimdbcast.html"));
1164   if(f.open(QIODevice::WriteOnly)) {
1165     QTextStream t(&f);
1166     t << castPage;
1167   }
1168   f.close();
1169 #endif
1170 
1171   const LangData& data = langData(m_lang);
1172 
1173   int pos = -1;
1174   // the text to search, depends on which page is being read
1175   QString castText = castPage;
1176   if(castText.isEmpty()) {
1177     // fall back to short list
1178     castText = str_;
1179     pos = castText.indexOf(data.cast1, 0, Qt::CaseInsensitive);
1180     if(pos == -1) {
1181       pos = castText.indexOf(data.cast2, 0, Qt::CaseInsensitive);
1182     }
1183   } else {
1184     // first look for anchor
1185     QRegExp castAnchorRx(QStringLiteral("<a\\s+name\\s*=\\s*\"cast\""), Qt::CaseInsensitive);
1186     pos = castAnchorRx.indexIn(castText);
1187     if(pos < 0) {
1188       QRegExp tableClassRx(QStringLiteral("<table\\s+class\\s*=\\s*\"cast_list\""), Qt::CaseInsensitive);
1189       pos = tableClassRx.indexIn(castText);
1190       if(pos < 0) {
1191         // fragile, the word "cast" appears in the title, but need to find
1192         // the one right above the actual cast table
1193         // for TV shows, there's a link on the sidebar for "episodes case"
1194         // so need to not match that one
1195         const QString castEnd = data.cast + QStringLiteral("</");
1196         pos = castText.indexOf(castEnd, 0, Qt::CaseInsensitive);
1197         if(pos > 9) {
1198           // back up 9 places
1199           if(castText.midRef(pos-9, 9).startsWith(data.episodes)) {
1200             // find next cast list
1201             pos = castText.indexOf(castEnd, pos+6, Qt::CaseInsensitive);
1202           }
1203         }
1204       }
1205     }
1206   }
1207   if(pos == -1) { // no cast list found
1208     myLog() << "no cast list found";
1209     return;
1210   }
1211   // loop until closing table tag
1212   int endPos = castText.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive);
1213   castText = castText.mid(pos, endPos-pos+1);
1214 
1215   QStringList actorList, characterList;
1216   QRegularExpression tdActorRx(QStringLiteral("<td>.*?<a href=\"/name.+?\".*?>(.+?)</a"),
1217                                QRegularExpression::DotMatchesEverythingOption);
1218   QRegularExpression tdCharRx(QStringLiteral("<td class=\"character\">(.+?)</"),
1219                               QRegularExpression::DotMatchesEverythingOption);
1220 
1221   QRegularExpressionMatchIterator i = tdActorRx.globalMatch(castText);
1222   while(i.hasNext()) {
1223     QRegularExpressionMatch match = i.next();
1224     actorList += match.captured(1).simplified();
1225   }
1226   i = tdCharRx.globalMatch(castText);
1227   while(i.hasNext()) {
1228     QRegularExpressionMatch match = i.next();
1229     characterList += match.captured(1).remove(*s_tagRx).simplified();
1230   }
1231 
1232   // sanity check
1233   while(characterList.length() > actorList.length()) {
1234     myDebug() << "Too many characters";
1235     characterList.removeLast();
1236   }
1237   while(characterList.length() < actorList.length()) {
1238     characterList += QString();
1239   }
1240 
1241   QStringList cast;
1242   cast.reserve(actorList.size());
1243   for(int i = 0; i < actorList.size(); ++i) {
1244     cast += actorList.at(i)
1245           + FieldFormat::columnDelimiterString()
1246           + characterList.at(i);
1247     if(cast.count() >= m_numCast) {
1248       break;
1249     }
1250   }
1251 
1252   if(cast.isEmpty()) {
1253     QRegExp tdRx(QStringLiteral("<td[^>]*>(.*)</td>"), Qt::CaseInsensitive);
1254     tdRx.setMinimal(true);
1255 
1256     QRegExp tdActorRx(QStringLiteral("<td\\s+[^>]*itemprop=\"actor\"[^>]*>(.*)</td>"), Qt::CaseInsensitive);
1257     tdActorRx.setMinimal(true);
1258 
1259     QRegExp tdCharRx(QStringLiteral("<td\\s+[^>]*class=\"character\"[^>]*>(.*)</td>"), Qt::CaseInsensitive);
1260     tdCharRx.setMinimal(true);
1261 
1262     pos = tdActorRx.indexIn(castText);
1263     while(pos > -1 && cast.count() < m_numCast) {
1264       QString actorText = tdActorRx.cap(1).remove(*s_tagRx).simplified();
1265       const int pos2 = tdCharRx.indexIn(castText, pos+1);
1266       if(pos2 > -1) {
1267         cast += actorText
1268               + FieldFormat::columnDelimiterString()
1269               + tdCharRx.cap(1).remove(*s_tagRx).simplified();
1270       }
1271       pos = tdActorRx.indexIn(castText, qMax(pos+1, pos2));
1272     }
1273   }
1274 
1275   if(!cast.isEmpty()) {
1276     entry_->setField(QStringLiteral("cast"), cast.join(FieldFormat::rowDelimiterString()));
1277   }
1278 
1279   // also do other items from fullcredits page, like producer
1280   pos = castPage.indexOf(QLatin1String("id=\"producer\""), 0, Qt::CaseInsensitive);
1281   if(pos > -1) {
1282     int endPos = castPage.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive);
1283     if(endPos == -1) {
1284       endPos = castPage.length();
1285     }
1286     const QString prodText = castPage.mid(pos, endPos-pos+1);
1287     QRegExp tdCharRx(QStringLiteral("<td\\s+[^>]*class=\"credit\"[^>]*>(.*)</td>"));
1288     tdCharRx.setMinimal(true);
1289 
1290     QStringList producers;
1291     pos = s_anchorNameRx->indexIn(prodText);
1292     while(pos > -1 && producers.count() < IMDB_MAX_PERSON_COUNT) {
1293       const int pos2 = tdCharRx.indexIn(prodText, pos+1);
1294       const QString credit = tdCharRx.cap(1).trimmed();
1295       if(pos2 > -1 && (credit.startsWith(QStringLiteral("producer")) ||
1296                        credit.startsWith(QStringLiteral("co-producer")) ||
1297                        credit.startsWith(QStringLiteral("associate producer")))) {
1298         producers += s_anchorNameRx->cap(2).trimmed();
1299       }
1300       pos = s_anchorNameRx->indexIn(prodText, pos+1);
1301     }
1302     if(!producers.isEmpty()) {
1303       entry_->setField(QStringLiteral("producer"), producers.join(FieldFormat::delimiterString()));
1304     }
1305   }
1306 
1307   const QString director = QStringLiteral("director");
1308   // only try to read director if its already empty, which means it wasn't found on main page
1309   if(entry_->field(director).isEmpty()) {
1310     QStringList directors;
1311     pos = castPage.indexOf(QLatin1String("id=\"director\""), 0, Qt::CaseInsensitive);
1312     if(pos > -1 && directors.count() < IMDB_MAX_PERSON_COUNT) {
1313       int endPos = castPage.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive);
1314       if(endPos == -1) {
1315         endPos = castPage.length();
1316       }
1317       const QString midText = castPage.mid(pos, endPos-pos+1);
1318       pos = s_anchorNameRx->indexIn(midText);
1319       while(pos > -1) {
1320         directors += s_anchorNameRx->cap(2).trimmed();
1321         pos = s_anchorNameRx->indexIn(midText, pos+1);
1322       }
1323     }
1324     if(!directors.isEmpty()) {
1325       entry_->setField(director, directors.join(FieldFormat::delimiterString()));
1326     }
1327   }
1328 
1329   const QString writer = QStringLiteral("writer");
1330   // only try to read director if its already empty, which means it wasn't found on main page
1331   if(entry_->field(writer).isEmpty()) {
1332     QStringList writers;
1333     pos = castPage.indexOf(QLatin1String("id=\"writer\""), 0, Qt::CaseInsensitive);
1334     if(pos > -1 && writers.count() < IMDB_MAX_PERSON_COUNT) {
1335       int endPos = castPage.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive);
1336       if(endPos == -1) {
1337         endPos = castPage.length();
1338       }
1339       const QString midText = castPage.mid(pos, endPos-pos+1);
1340       pos = s_anchorNameRx->indexIn(midText);
1341       while(pos > -1) {
1342         writers += s_anchorNameRx->cap(2).trimmed();
1343         pos = s_anchorNameRx->indexIn(midText, pos+1);
1344       }
1345     }
1346     writers.removeDuplicates(); // some editor/writer duplicates
1347     if(!writers.isEmpty()) {
1348       entry_->setField(writer, writers.join(FieldFormat::delimiterString()));
1349     }
1350   }
1351 
1352   const QString composer = QStringLiteral("composer");
1353   // only try to read director if its already empty, which means it wasn't found on main page
1354   if(entry_->field(composer).isEmpty()) {
1355     QStringList composers;
1356     pos = castPage.indexOf(QLatin1String("id=\"composer\""), 0, Qt::CaseInsensitive);
1357     if(pos > -1 && composers.count() < IMDB_MAX_PERSON_COUNT) {
1358       int endPos = castPage.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive);
1359       if(endPos == -1) {
1360         endPos = castPage.length();
1361       }
1362       const QString midText = castPage.mid(pos, endPos-pos+1);
1363       pos = s_anchorNameRx->indexIn(midText);
1364       while(pos > -1) {
1365         composers += s_anchorNameRx->cap(2).trimmed();
1366         pos = s_anchorNameRx->indexIn(midText, pos+1);
1367       }
1368     }
1369     if(!composers.isEmpty()) {
1370       entry_->setField(composer, composers.join(FieldFormat::delimiterString()));
1371     }
1372   }
1373 }
1374 
1375 void IMDBFetcher::doRating(const QString& str_, Tellico::Data::EntryPtr entry_) {
1376   if(!optionalFields().contains(QStringLiteral("imdb-rating"))) {
1377     return;
1378   }
1379 
1380   QRegExp divRx(QStringLiteral("<div class=\"ipl-rating-star[\\s\"]+>(.*)</div"), Qt::CaseInsensitive);
1381   divRx.setMinimal(true);
1382 
1383   if(divRx.indexIn(str_) > -1) {
1384     if(!entry_->collection()->hasField(QStringLiteral("imdb-rating"))) {
1385       Data::FieldPtr f(new Data::Field(QStringLiteral("imdb-rating"), i18n("IMDb Rating"), Data::Field::Rating));
1386       f->setCategory(i18n("General"));
1387       f->setProperty(QStringLiteral("maximum"), QStringLiteral("10"));
1388       entry_->collection()->addField(f);
1389     }
1390 
1391     QString text = divRx.cap(0);
1392     text.remove(*s_tagRx);
1393 
1394     QRegExp ratingRx(QStringLiteral("\\s(\\d+.?\\d*)\\s"));
1395     if(ratingRx.indexIn(text) > -1) {
1396       bool ok;
1397       float value = ratingRx.cap(1).toFloat(&ok);
1398       if(!ok) {
1399         value = QLocale().toFloat(ratingRx.cap(1), &ok);
1400       }
1401       if(ok) {
1402         entry_->setField(QStringLiteral("imdb-rating"), QString::number(value));
1403       }
1404     }
1405   }
1406 }
1407 
1408 void IMDBFetcher::doCover(const QString& str_, Tellico::Data::EntryPtr entry_, const QUrl& baseURL_) {
1409   QRegExp imgRx(QStringLiteral("<img\\s+[^>]*src\\s*=\\s*\"([^\"]*)\"[^>]*>"), Qt::CaseInsensitive);
1410   imgRx.setMinimal(true);
1411 
1412   QRegExp posterRx(QStringLiteral("<a\\s+[^>]*name\\s*=\\s*\"poster\"[^>]*>(.*)</a>"), Qt::CaseInsensitive);
1413   posterRx.setMinimal(true);
1414 
1415   const QString cover = QStringLiteral("cover");
1416 
1417   int pos = posterRx.indexIn(str_);
1418   while(pos > -1) {
1419     if(posterRx.cap(1).contains(imgRx)) {
1420       QUrl u = QUrl(baseURL_).resolved(QUrl(imgRx.cap(1)));
1421       QString id = ImageFactory::addImage(u, true);
1422       if(!id.isEmpty()) {
1423         entry_->setField(cover, id);
1424         return;
1425       }
1426     }
1427     pos = posterRx.indexIn(str_, pos+posterRx.matchedLength());
1428   }
1429 
1430   // <link rel='image_src'
1431   const QRegularExpression linkRx(QStringLiteral("<link (.+?)>"));
1432   const QRegularExpression hrefRx(QStringLiteral("href=['\"](.+?)['\"]"));
1433 
1434   const QString src = QStringLiteral("image_src");
1435   auto i = linkRx.globalMatch(str_);
1436   while(i.hasNext()) {
1437     auto match = i.next();
1438     const auto tag = match.capturedRef(1);
1439     if(tag.contains(src, Qt::CaseInsensitive)) {
1440       auto hrefMatch = hrefRx.match(tag);
1441       if(hrefMatch.hasMatch()) {
1442         QUrl u = QUrl(baseURL_).resolved(QUrl(hrefMatch.captured(1)));
1443         // imdb uses amazon media image, where the img src "encodes" requests for image sizing and cropping
1444         // strip everything after the "@." and add UY64 to limit the max image dimension to 640
1445         int n = u.url().indexOf(QStringLiteral("@."));
1446         if(n > -1) {
1447           const QString newLink = u.url().left(n) + QStringLiteral("@.UY640.jpg");
1448           const QString id = ImageFactory::addImage(QUrl(newLink), true);
1449           if(!id.isEmpty()) {
1450             entry_->setField(cover, id);
1451             return;
1452           }
1453         }
1454         const QString id = ImageFactory::addImage(u, true);
1455         if(!id.isEmpty()) {
1456           entry_->setField(cover, id);
1457           return;
1458         }
1459       }
1460     }
1461   }
1462 
1463   // <img alt="poster"
1464   posterRx.setPattern(QStringLiteral("<img\\s+[^>]*alt\\s*=\\s*\"poster\"[^>]+src\\s*=\\s*\"([^\"]+)\""));
1465   pos = posterRx.indexIn(str_);
1466   if(pos > -1) {
1467     QUrl u = QUrl(baseURL_).resolved(QUrl(posterRx.cap(1)));
1468     QString id = ImageFactory::addImage(u, true);
1469     if(!id.isEmpty()) {
1470       entry_->setField(cover, id);
1471       return;
1472     }
1473   }
1474 
1475   // didn't find the cover, IMDb also used to put "cover" inside the url
1476   // cover is the img with the "cover" alt text
1477   pos = imgRx.indexIn(str_);
1478   while(pos > -1) {
1479     const QString url = imgRx.cap(0).toLower();
1480     if(url.contains(cover)) {
1481       QUrl u = QUrl(baseURL_).resolved(QUrl(imgRx.cap(1)));
1482       QString id = ImageFactory::addImage(u, true);
1483       if(!id.isEmpty()) {
1484         entry_->setField(cover, id);
1485         return;
1486       }
1487     }
1488     pos = imgRx.indexIn(str_, pos+imgRx.matchedLength());
1489   }
1490 }
1491 
1492 void IMDBFetcher::doLists2(const QString& str_, Tellico::Data::EntryPtr entry_) {
1493   QRegExp divInfoRx(QStringLiteral("<li role=\"presentation\".*>(.*)</div"), Qt::CaseInsensitive);
1494   divInfoRx.setMinimal(true);
1495 
1496   const LangData& data = langData(m_lang);
1497 
1498   QStringList genres, countries, langs, certs, tracks;
1499   for(int pos = divInfoRx.indexIn(str_); pos > -1; pos = divInfoRx.indexIn(str_, pos+divInfoRx.matchedLength())) {
1500     QString divMatch = divInfoRx.cap(1);
1501     int pos2 = 0;
1502     if((pos2=s_anchorRx->indexIn(divMatch)) == -1) continue;
1503     const QString text = divMatch.remove(*s_tagRx);
1504     QString value = s_anchorRx->cap(2);
1505 
1506     if(text.startsWith(data.genre)) {
1507       foreach(const QString& token, value.split(QLatin1Char('|'))) {
1508         genres << token.trimmed();
1509       }
1510     } else if(text.startsWith(data.language)) {
1511       foreach(const QString& token, value.split(QRegExp(QLatin1String("[,|]")))) {
1512         langs << token.trimmed();
1513       }
1514     } else if(text.startsWith(data.sound)) {
1515       foreach(const QString& token, value.split(QLatin1Char('|'))) {
1516         tracks << token.trimmed();
1517       }
1518     } else if(text.startsWith(data.country)) {
1519       countries << value;
1520     } else if(text.startsWith(data.certification)) {
1521       foreach(const QString& token, value.split(QLatin1Char('|'))) {
1522         certs << token.trimmed();
1523       }
1524     } else if(text.startsWith(data.color)) {
1525       // cut off any parentheses
1526       value = value.section(QLatin1Char('('), 0, 0).trimmed();
1527       // change "black and white" to "black & white"
1528       value.replace(QStringLiteral("and"), QStringLiteral("&"));
1529       if(value == data.color) {
1530         entry_->setField(QStringLiteral("color"), i18n("Color"));
1531       } else {
1532         entry_->setField(QStringLiteral("color"), value);
1533       }
1534     }
1535   }
1536 
1537   if(!genres.isEmpty()) {
1538     entry_->setField(QStringLiteral("genre"), genres.join(FieldFormat::delimiterString()));
1539   }
1540   if(!countries.isEmpty()) {
1541     entry_->setField(QStringLiteral("nationality"), countries.join(FieldFormat::delimiterString()));
1542   }
1543   if(!langs.isEmpty()) {
1544     entry_->setField(QStringLiteral("language"), langs.join(FieldFormat::delimiterString()));
1545   }
1546   if(!tracks.isEmpty()) {
1547     entry_->setField(QStringLiteral("audio-track"), tracks.join(FieldFormat::delimiterString()));
1548   }
1549   if(!certs.isEmpty()) {
1550     // first try to set default certification
1551     const QStringList& certsAllowed = entry_->collection()->fieldByName(QStringLiteral("certification"))->allowed();
1552     foreach(const QString& cert, certs) {
1553       QString country = cert.section(QLatin1Char(':'), 0, 0);
1554       QString lcert = cert.section(QLatin1Char(':'), 1, 1);
1555       if(lcert == QStringLiteral("Unrated")) {
1556         lcert = QLatin1Char('U');
1557       }
1558       lcert += QStringLiteral(" (") + country + QLatin1Char(')');
1559       if(certsAllowed.contains(lcert)) {
1560         entry_->setField(QStringLiteral("certification"), lcert);
1561         break;
1562       }
1563     }
1564 
1565     // now add new field for all certifications
1566     const QString allc = QStringLiteral("allcertification");
1567     if(optionalFields().contains(allc)) {
1568       Data::FieldPtr f = entry_->collection()->fieldByName(allc);
1569       if(!f) {
1570         f = new Data::Field(allc, i18n("Certifications"), Data::Field::Table);
1571         f->setFlags(Data::Field::AllowGrouped);
1572         entry_->collection()->addField(f);
1573       }
1574       entry_->setField(QStringLiteral("allcertification"), certs.join(FieldFormat::rowDelimiterString()));
1575     }
1576   }
1577 }
1578 
1579 // look at every anchor tag in the string
1580 void IMDBFetcher::doLists(const QString& str_, Tellico::Data::EntryPtr entry_) {
1581   const QString genre = QStringLiteral("/Genres/");
1582   const QString genre2 = QStringLiteral("/genre/");
1583   const QString country = QStringLiteral("/country/");
1584   const QString lang = QStringLiteral("/language/");
1585   const QString colorInfo = QStringLiteral("colors=");
1586   const QString cert = QStringLiteral("certificates=");
1587   const QString soundMix = QStringLiteral("sound_mixes=");
1588   const QString year = QStringLiteral("/Years/");
1589 
1590   // if we reach faqs or user comments, we can stop
1591   const QString faqs = QStringLiteral("/faq");
1592   const QString users = QStringLiteral("/user/");
1593   // IMdb also has links with the word "sections" in them, remove that
1594   // for genres and nationalities
1595 
1596   int startPos = str_.indexOf(QStringLiteral("<div id=\"pagecontent\">"));
1597   if(startPos == -1) {
1598     startPos = 0;
1599   }
1600 
1601   QStringList genres, countries, langs, certs, tracks;
1602   for(int pos = s_anchorRx->indexIn(str_, startPos); pos > -1; pos = s_anchorRx->indexIn(str_, pos+s_anchorRx->matchedLength())) {
1603     const QString cap1 = s_anchorRx->cap(1);
1604     if(cap1.contains(genre) || cap1.contains(genre2)) {
1605       const QString g = s_anchorRx->cap(2);
1606       if(!g.contains(QStringLiteral(" section"), Qt::CaseInsensitive) &&
1607          !g.contains(QStringLiteral(" genre"), Qt::CaseInsensitive)) {
1608         // ignore "Most Popular by Genre"
1609         genres += g.trimmed();
1610       }
1611     } else if(cap1.contains(country)) {
1612       if(!s_anchorRx->cap(2).contains(QStringLiteral(" section"), Qt::CaseInsensitive)) {
1613         countries += s_anchorRx->cap(2).trimmed();
1614       }
1615     } else if(cap1.contains(lang) && !cap1.contains(QStringLiteral("contribute"))) {
1616       langs += s_anchorRx->cap(2).trimmed();
1617     } else if(cap1.contains(colorInfo)) {
1618       QString value = s_anchorRx->cap(2);
1619       // cut off any parentheses
1620       value = value.section(QLatin1Char('('), 0, 0).trimmed();
1621       // change "black and white" to "black & white"
1622       value.replace(QStringLiteral("and"), QStringLiteral("&"));
1623       entry_->setField(QStringLiteral("color"), value.trimmed());
1624     } else if(cap1.contains(cert)) {
1625       certs += s_anchorRx->cap(2).trimmed();
1626     } else if(cap1.contains(soundMix)) {
1627       tracks += s_anchorRx->cap(2).trimmed();
1628       // if year field wasn't set before, do it now
1629     } else if(entry_->field(QStringLiteral("year")).isEmpty() && cap1.contains(year)) {
1630       entry_->setField(QStringLiteral("year"), s_anchorRx->cap(2).trimmed());
1631     } else if((cap1.contains(faqs) || cap1.contains(users)) && !genres.isEmpty()) {
1632       break;
1633     }
1634   }
1635 
1636   // since we have multiple genre search strings
1637   genres.removeDuplicates();
1638 
1639   entry_->setField(QStringLiteral("genre"), genres.join(FieldFormat::delimiterString()));
1640   entry_->setField(QStringLiteral("nationality"), countries.join(FieldFormat::delimiterString()));
1641   entry_->setField(QStringLiteral("language"), langs.join(FieldFormat::delimiterString()));
1642   entry_->setField(QStringLiteral("audio-track"), tracks.join(FieldFormat::delimiterString()));
1643   if(!certs.isEmpty()) {
1644     // first try to set default certification
1645     const QStringList& certsAllowed = entry_->collection()->fieldByName(QStringLiteral("certification"))->allowed();
1646     foreach(const QString& cert, certs) {
1647       QString country = cert.section(QLatin1Char(':'), 0, 0);
1648       if(country == QStringLiteral("United States")) {
1649         country = QStringLiteral("USA");
1650       }
1651       QString lcert = cert.section(QLatin1Char(':'), 1, 1);
1652       if(lcert == QStringLiteral("Unrated")) {
1653         lcert = QLatin1Char('U');
1654       }
1655       lcert += QStringLiteral(" (") + country + QLatin1Char(')');
1656       if(certsAllowed.contains(lcert)) {
1657         entry_->setField(QStringLiteral("certification"), lcert);
1658         break;
1659       }
1660     }
1661 
1662     // now add new field for all certifications
1663     const QString allc = QStringLiteral("allcertification");
1664     if(optionalFields().contains(allc)) {
1665       Data::FieldPtr f = entry_->collection()->fieldByName(allc);
1666       if(!f) {
1667         f = new Data::Field(allc, i18n("Certifications"), Data::Field::Table);
1668         f->setFlags(Data::Field::AllowGrouped);
1669         entry_->collection()->addField(f);
1670       }
1671       entry_->setField(QStringLiteral("allcertification"), certs.join(FieldFormat::rowDelimiterString()));
1672     }
1673   }
1674 }
1675 
1676 void IMDBFetcher::doEpisodes(const QString& str_, Tellico::Data::EntryPtr entry_, const QUrl& baseURL_) {
1677   if(!str_.contains(QStringLiteral("video.tv_show"))) {
1678     // depend on meta data to indicate TV series
1679     // should include <meta property='og:type' content="video.tv_show" /> in the reference view
1680     return;
1681   }
1682   const QString episode = QStringLiteral("episode");
1683   if(!entry_->collection()->hasField(episode)) {
1684     entry_->collection()->addField(Data::Field::createDefaultField(Data::Field::EpisodeField));
1685   }
1686 
1687   int currentSeason = 1;
1688   int totalSeasons = -1;
1689   QStringList episodes;
1690 
1691   // the episode list is on a separate page
1692   auto idMatch = s_titleIdRx->match(baseURL_.path());
1693   Q_ASSERT(idMatch.hasMatch());
1694 
1695   const QRegularExpression episodeRx(QStringLiteral("itemtype=\"http://schema.org/TVEpisode\""));
1696   const QRegularExpression anchorEpisodeRx(QStringLiteral("<a href=\"/title/.+?_ep(\\d+)\"\\s+title=\"(.+?)\""),
1697                                            QRegularExpression::DotMatchesEverythingOption);
1698   QUrl episodeUrl = baseURL_;
1699   episodeUrl.setPath(QStringLiteral("/title/") + idMatch.captured(1) + QStringLiteral("/episodes/_ajax"));
1700   QUrlQuery q;
1701   // loop over the total number of seasons
1702   do {
1703     q.clear();
1704     q.addQueryItem(QLatin1String("season"), QString::number(currentSeason));
1705     episodeUrl.setQuery(q);
1706 
1707     QPointer<KIO::StoredTransferJob> getJob = KIO::storedGet(episodeUrl, KIO::NoReload, KIO::HideProgressInfo);
1708     configureJob(getJob);
1709     if(!getJob->exec()) {
1710       myWarning() << "...unable to read" << episodeUrl;
1711     }
1712     const QString episodeText = Tellico::fromHtmlData(getJob->data(), "UTF-8");
1713 #if 0
1714     myWarning() << "Remove debug from imdbfetcher.cpp (/tmp/testimdbepisodes.html)";
1715     QFile f(QString::fromLatin1("/tmp/testimdbepisodes.html"));
1716     if(f.open(QIODevice::WriteOnly)) {
1717       QTextStream t(&f);
1718       t << castPage;
1719     }
1720     f.close();
1721 #endif
1722 
1723     if(totalSeasons == -1) {
1724       // assume never more than 99 seasons, alternative is 4-digit years
1725       static const QRegularExpression optionRx(QStringLiteral("<option\\s+value=\"(\\d\\d?)\""));
1726       auto iOption = optionRx.globalMatch(episodeText);
1727       while(iOption.hasNext()) {
1728         auto optionMatch = iOption.next();
1729         const int value = optionMatch.captured(1).toInt();
1730         if(value > totalSeasons) totalSeasons = value;
1731       }
1732       totalSeasons = qMin(totalSeasons, IMDB_MAX_SEASON_COUNT);
1733      // ok if totalSeasons remains == -1
1734 //      myDebug() << "Total seasons:" << totalSeasons;
1735     }
1736 
1737     auto i = episodeRx.globalMatch(episodeText);
1738     while(i.hasNext()) {
1739       auto match = i.next();
1740       auto anchorMatch = anchorEpisodeRx.match(episodeText, match.capturedEnd());
1741       if(anchorMatch.hasMatch()) {
1742 //        myDebug() << "found episode" << anchorMatch.captured(1) << anchorMatch.captured(2);
1743         episodes << anchorMatch.captured(2) + FieldFormat::columnDelimiterString() +
1744                     QString::number(currentSeason) + FieldFormat::columnDelimiterString() +
1745                     anchorMatch.captured(1);
1746       }
1747     }
1748     ++currentSeason;
1749   } while (totalSeasons > 0 && currentSeason < totalSeasons);
1750 
1751   entry_->setField(episode, episodes.join(FieldFormat::rowDelimiterString()));
1752 }
1753 
1754 Tellico::Fetch::FetchRequest IMDBFetcher::updateRequest(Data::EntryPtr entry_) {
1755   QUrl link = QUrl::fromUserInput(entry_->field(QStringLiteral("imdb")));
1756 
1757   if(!link.isEmpty() && link.isValid()) {
1758     if(link.host() != m_host) {
1759 //      myLog() << "switching hosts to " << m_host;
1760       link.setHost(m_host);
1761     }
1762     return FetchRequest(Fetch::Raw, link.url());
1763   }
1764 
1765   // optimistically try searching for title and rely on Collection::sameEntry() to figure things out
1766   const QString t = entry_->field(QStringLiteral("title"));
1767   if(!t.isEmpty()) {
1768     return FetchRequest(Fetch::Title, t);
1769   }
1770   return FetchRequest();
1771 }
1772 
1773 void IMDBFetcher::configureJob(QPointer<KIO::StoredTransferJob> job_) {
1774   KJobWidgets::setWindow(job_, GUI::Proxy::widget());
1775   switch(m_lang) {
1776     case EN:
1777       job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("en-US")); break;
1778     case FR:
1779       job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("fr-FR")); break;
1780     case ES:
1781       job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("es-ES")); break;
1782     case DE:
1783       job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("de-DE")); break;
1784     case IT:
1785       job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("it-IT")); break;
1786     case PT:
1787       job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("pt-PT")); break;
1788   }
1789 }
1790 
1791 QString IMDBFetcher::defaultName() {
1792   return i18n("Internet Movie Database");
1793 }
1794 
// Returns whatever favIcon() yields for the IMDb home page URL.
QString IMDBFetcher::defaultIcon() {
  return favIcon("https://www.imdb.com");
}
1798 
1799 //static
1800 Tellico::StringHash IMDBFetcher::allOptionalFields() {
1801   StringHash hash;
1802   hash[QStringLiteral("imdb")]             = i18n("IMDb Link");
1803   hash[QStringLiteral("imdb-rating")]      = i18n("IMDb Rating");
1804   hash[QStringLiteral("alttitle")]         = i18n("Alternative Titles");
1805   hash[QStringLiteral("allcertification")] = i18n("Certifications");
1806   hash[QStringLiteral("origtitle")]        = i18n("Original Title");
1807   hash[QStringLiteral("episode")]          = i18n("Episodes");
1808   return hash;
1809 }
1810 
1811 Tellico::Fetch::ConfigWidget* IMDBFetcher::configWidget(QWidget* parent_) const {
1812   return new IMDBFetcher::ConfigWidget(parent_, this);
1813 }
1814 
1815 IMDBFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const IMDBFetcher* fetcher_/*=0*/)
1816     : Fetch::ConfigWidget(parent_) {
1817   QGridLayout* l = new QGridLayout(optionsWidget());
1818   l->setSpacing(4);
1819   l->setColumnStretch(1, 10);
1820 
1821   int row = -1;
1822 
1823   QLabel* label = new QLabel(i18n("&Maximum cast: "), optionsWidget());
1824   l->addWidget(label, ++row, 0);
1825   m_numCast = new QSpinBox(optionsWidget());
1826   m_numCast->setMaximum(99);
1827   m_numCast->setMinimum(0);
1828   m_numCast->setValue(IMDB_DEFAULT_CAST_SIZE);
1829 #if (QT_VERSION < QT_VERSION_CHECK(5, 14, 0))
1830   void (QSpinBox::* textChanged)(const QString&) = &QSpinBox::valueChanged;
1831 #else
1832   void (QSpinBox::* textChanged)(const QString&) = &QSpinBox::textChanged;
1833 #endif
1834   connect(m_numCast, textChanged, this, &ConfigWidget::slotSetModified);
1835   l->addWidget(m_numCast, row, 1);
1836   QString w = i18n("The list of cast members may include many people. Set the maximum number returned from the search.");
1837   label->setWhatsThis(w);
1838   m_numCast->setWhatsThis(w);
1839   label->setBuddy(m_numCast);
1840 
1841   m_fetchImageCheck = new QCheckBox(i18n("Download cover &image"), optionsWidget());
1842   connect(m_fetchImageCheck, &QAbstractButton::clicked, this, &ConfigWidget::slotSetModified);
1843   ++row;
1844   l->addWidget(m_fetchImageCheck, row, 0, 1, 2);
1845   w = i18n("The cover image may be downloaded as well. However, too many large images in the "
1846            "collection may degrade performance.");
1847   m_fetchImageCheck->setWhatsThis(w);
1848 
1849   l->setRowStretch(++row, 10);
1850 
1851   // now add additional fields widget
1852   addFieldsWidget(IMDBFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList());
1853   KAcceleratorManager::manage(optionsWidget());
1854 
1855   if(fetcher_) {
1856     m_numCast->setValue(fetcher_->m_numCast);
1857     m_fetchImageCheck->setChecked(fetcher_->m_fetchImages);
1858   } else { //defaults
1859     m_fetchImageCheck->setChecked(true);
1860   }
1861 }
1862 
1863 void IMDBFetcher::ConfigWidget::saveConfigHook(KConfigGroup& config_) {
1864   config_.writeEntry("Host", QString()); // clear old host entry
1865   config_.writeEntry("Max Cast", m_numCast->value());
1866   config_.writeEntry("Fetch Images", m_fetchImageCheck->isChecked());
1867 }
1868 
1869 QString IMDBFetcher::ConfigWidget::preferredName() const {
1870   return IMDBFetcher::langData(EN).siteTitle;
1871 }
1872 
1873 void IMDBFetcher::ConfigWidget::slotSiteChanged() {
1874   emit signalName(preferredName());
1875 }