File indexing completed on 2024-05-19 16:18:48
0001 /*************************************************************************** 0002 Copyright (C) 2004-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "imdbfetcher.h" 0026 #include "../utils/guiproxy.h" 0027 #include "../collections/videocollection.h" 0028 #include "../entry.h" 0029 #include "../field.h" 0030 #include "../fieldformat.h" 0031 #include "../images/imagefactory.h" 0032 #include "../utils/string_utils.h" 0033 #include "../tellico_debug.h" 0034 0035 #include <KLocalizedString> 0036 #include <KConfigGroup> 0037 #include <KIO/Job> 0038 #include <KJobUiDelegate> 0039 #include <KAcceleratorManager> 0040 #include <KJobWidgets/KJobWidgets> 0041 0042 #include <QSpinBox> 0043 #include <QRegExp> 0044 #include <QFile> 0045 #include <QMap> 0046 #include <QLabel> 0047 #include <QCheckBox> 0048 #include <QGroupBox> 0049 #include <QGridLayout> 0050 #include <QUrlQuery> 0051 #include <QJsonDocument> 0052 #include <QJsonParseError> 0053 #include <QJsonObject> 0054 #include <QRegularExpression> 0055 0056 namespace { 0057 static const uint IMDB_MAX_RESULTS = 20; 0058 static const uint IMDB_DEFAULT_CAST_SIZE = 10; 0059 static const int IMDB_MAX_PERSON_COUNT = 5; // limit number of directors, writers, etc, esp for TV series 0060 static const int IMDB_MAX_SEASON_COUNT = 5; // simply takes too long otherwise 0061 } 0062 0063 using namespace Tellico; 0064 using Tellico::Fetch::IMDBFetcher; 0065 0066 QRegExp* IMDBFetcher::s_tagRx = nullptr; 0067 QRegExp* IMDBFetcher::s_anchorRx = nullptr; 0068 QRegExp* IMDBFetcher::s_anchorTitleRx = nullptr; 0069 QRegExp* IMDBFetcher::s_anchorNameRx = nullptr; 0070 QRegExp* IMDBFetcher::s_titleRx = nullptr; 0071 const QRegularExpression* IMDBFetcher::s_titleIdRx = nullptr; 0072 int IMDBFetcher::s_instanceCount = 0; 0073 0074 // static 0075 void IMDBFetcher::initRegExps() { 0076 s_tagRx = new QRegExp(QStringLiteral("<.*>")); 0077 s_tagRx->setMinimal(true); 0078 0079 s_anchorRx = new QRegExp(QStringLiteral("<a\\s+[^>]*href\\s*=\\s*\"([^\"]+)\"[^<]*>([^<]+)</a>"), Qt::CaseInsensitive); 0080 s_anchorRx->setMinimal(true); 0081 0082 s_anchorTitleRx = new QRegExp(QStringLiteral("<a\\s+[^>]*href\\s*=\\s*\"([^\"]*/title/[^\"]*)\"[^<]*>([^<]*)</a>"), Qt::CaseInsensitive); 0083 s_anchorTitleRx->setMinimal(true); 0084 0085 s_anchorNameRx = new QRegExp(QStringLiteral("<a\\s+[^>]*href\\s*=\\s*\"([^\"]*/name/[^\"]*)\"[^<]*>(.+)</a>"), Qt::CaseInsensitive); 0086 s_anchorNameRx->setMinimal(true); 0087 0088 s_titleRx = new QRegExp(QStringLiteral("<title>(.*)</title>"), Qt::CaseInsensitive); 0089 s_titleRx->setMinimal(true); 0090 0091 s_titleIdRx = new QRegularExpression(QStringLiteral("title/(tt\\d+)")); 0092 } 0093 0094 void IMDBFetcher::deleteRegExps() { 0095 delete s_tagRx; 0096 s_tagRx = nullptr; 0097 0098 delete s_anchorRx; 0099 s_anchorRx = nullptr; 0100 0101 delete s_anchorTitleRx; 0102 s_anchorTitleRx = nullptr; 0103 0104 delete s_anchorNameRx; 0105 s_anchorNameRx = nullptr; 0106 0107 delete s_titleRx; 0108 s_titleRx = nullptr; 0109 0110 delete s_titleIdRx; 0111 s_titleIdRx = nullptr; 0112 } 0113 0114 // static 0115 const IMDBFetcher::LangData& IMDBFetcher::langData(int lang_) { 0116 Q_ASSERT(lang_ >= 0); 0117 Q_ASSERT(lang_ < 6); 0118 static LangData dataVector[6] = { 0119 { 0120 i18n("Internet Movie Database"), 0121 QStringLiteral("findSectionHeader"), 0122 QStringLiteral("Exact Matches"), 0123 QStringLiteral("Partial Matches"), 0124 QStringLiteral("Approx Matches"), 0125 QStringLiteral("findSectionHeader"), 0126 QStringLiteral("Other Results"), 0127 QStringLiteral("aka"), 0128 QStringLiteral("Directed by"), 0129 QStringLiteral("Written by"), 0130 QStringLiteral("Produced by"), 0131 QStringLiteral("runtime.*(\\d+)\\s+min"), 0132 QStringLiteral("aspect ratio"), 0133 QStringLiteral("also known as"), 0134 QStringLiteral("Production Co"), 0135 QStringLiteral("cast"), 0136 QStringLiteral("cast overview"), 0137 QStringLiteral("credited cast"), 0138 QStringLiteral("episodes"), 0139 QStringLiteral("Genre"), 0140 QStringLiteral("Sound"), 0141 QStringLiteral("Color"), 0142 QStringLiteral("Language"), 0143 QStringLiteral("Certification"), 0144 QStringLiteral("Country"), 0145 QStringLiteral("plot\\s+(outline|summary)(?!/)"), 0146 QStringLiteral("Music by") 0147 }, { 0148 i18n("Internet Movie Database (French)"), 0149 QStringLiteral("findSectionHeader"), 0150 QStringLiteral("Résultats Exacts"), 0151 QStringLiteral("Résultats Partiels"), 0152 QStringLiteral("Résultats Approximatif"), 0153 QStringLiteral("findSectionHeader"), 0154 QStringLiteral("Résultats Autres"), 0155 QStringLiteral("autre titre"), 0156 QStringLiteral("Réalisateur"), 0157 QStringLiteral("Scénarist"), 0158 QString(), 0159 QStringLiteral("Durée.*(\\d+)\\s+heur.*\\s+(\\d+)\\s+min"), 0160 QStringLiteral("Proportions de l’image"), 0161 QStringLiteral("Alias"), 0162 QStringLiteral("Sociétés de production"), 0163 QStringLiteral("Ensemble"), 0164 QStringLiteral("cast overview"), // couldn't get phrase 0165 QStringLiteral("credited cast"), // couldn't get phrase 0166 QStringLiteral("episodes"), 0167 QStringLiteral("Genre"), 0168 QStringLiteral("Mixage audio"), 0169 QStringLiteral("Couleur"), 0170 QStringLiteral("Langue"), 0171 QStringLiteral("Classification"), 0172 QStringLiteral("Pays d’origine"), 0173 QStringLiteral("Intrigue\\s*"), 0174 QString() // reference page doesn't seem to have localized composer 0175 }, { 0176 i18n("Internet Movie Database (Spanish)"), 0177 QStringLiteral("findSectionHeader"), 0178 QStringLiteral("Resultados Exactos"), 0179 QStringLiteral("Resultados Parciales"), 0180 QStringLiteral("Resultados Aproximados"), 0181 QStringLiteral("findSectionHeader"), 0182 QStringLiteral("Resultados Otros"), 0183 QStringLiteral("otro título"), 0184 QStringLiteral("Director"), 0185 QStringLiteral("Escritores"), 0186 QString(), 0187 QStringLiteral("Duración.*(\\d+)\\s+min"), 0188 QStringLiteral("Relación de Aspecto"), 0189 QStringLiteral("Conocido como"), 0190 QStringLiteral("Compañías Productores"), 0191 QStringLiteral("Reparto"), 0192 QStringLiteral("cast overview"), // couldn't get phrase 0193 QStringLiteral("credited cast"), // couldn't get phrase 0194 QStringLiteral("episodes"), 0195 QStringLiteral("Género"), 0196 QStringLiteral("Sonido"), 0197 QStringLiteral("Color"), 0198 QStringLiteral("Idioma"), 0199 QStringLiteral("Clasificación"), 0200 QStringLiteral("País"), 0201 QStringLiteral("Trama\\s*"), 0202 QString() // reference page doesn't seem to have localized composer 0203 }, { 0204 i18n("Internet Movie Database (German)"), 0205 QStringLiteral("findSectionHeader"), 0206 QStringLiteral("genaue Übereinstimmung"), 0207 QStringLiteral("teilweise Übereinstimmung"), 0208 QStringLiteral("näherungsweise Übereinstimmung"), 0209 QStringLiteral("findSectionHeader"), 0210 QStringLiteral("andere Übereinstimmung"), 0211 QStringLiteral("andere titel"), 0212 QStringLiteral("Regisseur"), 0213 QStringLiteral("Drehbuchautoren"), 0214 QString(), 0215 QStringLiteral("Länge.*(\\d+)\\s+min"), 0216 QStringLiteral("Seitenverhältnis"), 0217 QStringLiteral("Auch bekannt als"), 0218 QStringLiteral("Produktionsfirmen"), 0219 QStringLiteral("Besetzung"), 0220 QStringLiteral("cast overview"), // couldn't get phrase 0221 QStringLiteral("credited cast"), // couldn't get phrase 0222 QStringLiteral("episodes"), 0223 QStringLiteral("Genre"), 0224 QStringLiteral("Tonverfahren"), 0225 QStringLiteral("Farbe"), 0226 QStringLiteral("Sprache"), 0227 QStringLiteral("Altersfreigabe"), 0228 QStringLiteral("Land"), 0229 QStringLiteral("Handlung\\s*"), 0230 QString() // reference page doesn't seem to have localized composer 0231 }, { 0232 i18n("Internet Movie Database (Italian)"), 0233 QStringLiteral("findSectionHeader"), 0234 QStringLiteral("risultati esatti"), 0235 QStringLiteral("risultati parziali"), 0236 QStringLiteral("risultati approssimati"), 0237 QStringLiteral("findSectionHeader"), 0238 QStringLiteral("Resultados Otros"), 0239 QStringLiteral("otro título"), 0240 QStringLiteral("Regista"), 0241 QStringLiteral("Sceneggiatori"), 0242 QString(), 0243 QStringLiteral("Durata.*(\\d+)\\s+min"), 0244 QStringLiteral("Aspect Ratio"), 0245 QStringLiteral("Alias"), 0246 QStringLiteral("Società di produzione"), 0247 QStringLiteral("Cast"), 0248 QStringLiteral("cast overview"), // couldn't get phrase 0249 QStringLiteral("credited cast"), // couldn't get phrase 0250 QStringLiteral("episodes"), 0251 QStringLiteral("Genere"), 0252 QStringLiteral("Sonoro"), 0253 QStringLiteral("Colore"), 0254 QStringLiteral("Lingua"), 0255 QStringLiteral("Divieti"), 0256 QStringLiteral("Nazionalità"), 0257 QStringLiteral("Trama\\s*"), 0258 QString() // reference page doesn't seem to have localized composer 0259 }, { 0260 i18n("Internet Movie Database (Portuguese)"), 0261 QStringLiteral("findSectionHeader"), 0262 QStringLiteral("Exato"), 0263 QStringLiteral("Combinação Parcial"), 0264 QStringLiteral("Combinação Aproximada"), 0265 QStringLiteral("findSectionHeader"), 0266 QStringLiteral("Combinação Otros"), 0267 QStringLiteral("otro título"), 0268 QStringLiteral("Diretor"), 0269 QStringLiteral("Escritores"), 0270 QString(), 0271 QStringLiteral("Duração.*(\\d+)\\s+min"), 0272 QStringLiteral("Resolução"), 0273 QStringLiteral("Também Conhecido Como"), 0274 QStringLiteral("Companhias de Produção"), 0275 QStringLiteral("Elenco"), 0276 QStringLiteral("cast overview"), // couldn't get phrase 0277 QStringLiteral("credited cast"), // couldn't get phrase 0278 QStringLiteral("episodes"), 0279 QStringLiteral("Gênero"), 0280 QStringLiteral("Mixagem de Som"), 0281 QStringLiteral("Cor"), 0282 QStringLiteral("Lingua"), 0283 QStringLiteral("Certificação"), 0284 QStringLiteral("País"), 0285 QStringLiteral("Argumento\\s*"), 0286 QString() // reference page doesn't seem to have localized composer 0287 } 0288 }; 0289 0290 return dataVector[qBound(0, lang_, static_cast<int>(sizeof(dataVector)/sizeof(LangData)))]; 0291 } 0292 0293 IMDBFetcher::IMDBFetcher(QObject* parent_) : Fetcher(parent_), 0294 m_job(nullptr), m_started(false), m_fetchImages(true), 0295 m_numCast(IMDB_DEFAULT_CAST_SIZE), m_redirected(false), m_limit(IMDB_MAX_RESULTS), m_lang(EN), 0296 m_currentTitleBlock(Unknown), m_countOffset(0) { 0297 if(!s_instanceCount++) { 0298 initRegExps(); 0299 } 0300 m_host = QStringLiteral("www.imdb.com"); 0301 } 0302 0303 IMDBFetcher::~IMDBFetcher() { 0304 if(!--s_instanceCount) { 0305 deleteRegExps(); 0306 } 0307 } 0308 0309 QString IMDBFetcher::source() const { 0310 return m_name.isEmpty() ? defaultName() : m_name; 0311 } 0312 0313 bool IMDBFetcher::canFetch(int type) const { 0314 return type == Data::Collection::Video; 0315 } 0316 0317 // imdb can search title only 0318 bool IMDBFetcher::canSearch(Fetch::FetchKey k) const { 0319 return k == Title; 0320 } 0321 0322 void IMDBFetcher::readConfigHook(const KConfigGroup& config_) { 0323 const int lang = config_.readEntry("Lang", int(EN)); 0324 m_lang = static_cast<Lang>(lang); 0325 if(m_name.isEmpty()) { 0326 m_name = langData(m_lang).siteTitle; 0327 } 0328 0329 m_numCast = config_.readEntry("Max Cast", IMDB_DEFAULT_CAST_SIZE); 0330 m_fetchImages = config_.readEntry("Fetch Images", true); 0331 } 0332 0333 // multiple values not supported 0334 void IMDBFetcher::search() { 0335 m_started = true; 0336 m_redirected = false; 0337 0338 m_matches.clear(); 0339 m_popularTitles.clear(); 0340 m_exactTitles.clear(); 0341 m_partialTitles.clear(); 0342 m_currentTitleBlock = Unknown; 0343 m_countOffset = 0; 0344 0345 m_url = QUrl(); 0346 m_url.setScheme(QStringLiteral("https")); 0347 m_url.setHost(m_host); 0348 m_url.setPath(QStringLiteral("/find/")); 0349 0350 // as far as I can tell, the url encoding should always be iso-8859-1? 0351 QUrlQuery q; 0352 q.addQueryItem(QStringLiteral("q"), request().value()); 0353 0354 switch(request().key()) { 0355 case Title: 0356 q.addQueryItem(QStringLiteral("s"), QStringLiteral("tt")); 0357 m_url.setQuery(q); 0358 break; 0359 0360 case Raw: 0361 m_url = QUrl(request().value()); 0362 break; 0363 0364 default: 0365 myWarning() << "not supported:" << request().key(); 0366 stop(); 0367 return; 0368 } 0369 // myDebug() << m_url; 0370 0371 m_job = KIO::storedGet(m_url, KIO::NoReload, KIO::HideProgressInfo); 0372 configureJob(m_job); 0373 connect(m_job.data(), &KJob::result, 0374 this, &IMDBFetcher::slotComplete); 0375 connect(m_job.data(), &KIO::TransferJob::redirection, 0376 this, &IMDBFetcher::slotRedirection); 0377 } 0378 0379 void IMDBFetcher::continueSearch() { 0380 m_started = true; 0381 m_limit += IMDB_MAX_RESULTS; 0382 0383 if(m_currentTitleBlock == Popular) { 0384 parseTitleBlock(m_popularTitles); 0385 // if the offset is 0, then we need to be looking at the next block 0386 m_currentTitleBlock = m_countOffset == 0 ? Exact : Popular; 0387 } 0388 0389 // current title block might have changed 0390 if(m_currentTitleBlock == Exact) { 0391 parseTitleBlock(m_exactTitles); 0392 m_currentTitleBlock = m_countOffset == 0 ? Partial : Exact; 0393 } 0394 0395 if(m_currentTitleBlock == Partial) { 0396 parseTitleBlock(m_partialTitles); 0397 m_currentTitleBlock = m_countOffset == 0 ? Approx : Partial; 0398 } 0399 0400 if(m_currentTitleBlock == Approx) { 0401 parseTitleBlock(m_approxTitles); 0402 m_currentTitleBlock = m_countOffset == 0 ? Unknown : Approx; 0403 } 0404 0405 m_hasMoreResults = false; 0406 stop(); 0407 } 0408 0409 void IMDBFetcher::stop() { 0410 if(!m_started) { 0411 return; 0412 } 0413 if(m_job) { 0414 m_job->kill(); 0415 m_job = nullptr; 0416 } 0417 0418 m_started = false; 0419 m_redirected = false; 0420 0421 emit signalDone(this); 0422 } 0423 0424 void IMDBFetcher::slotRedirection(KIO::Job*, const QUrl& toURL_) { 0425 static const QRegularExpression ttEndRx(QStringLiteral("/tt\\d+/$")); 0426 m_url = toURL_; 0427 if(m_url.path().contains(ttEndRx)) { 0428 m_url.setPath(m_url.path() + QStringLiteral("reference")); 0429 } 0430 m_redirected = true; 0431 } 0432 0433 void IMDBFetcher::slotComplete(KJob*) { 0434 if(m_job->error()) { 0435 myDebug() << m_job->errorString(); 0436 m_job->uiDelegate()->showErrorMessage(); 0437 stop(); 0438 return; 0439 } 0440 0441 m_text = Tellico::fromHtmlData(m_job->data(), "UTF-8"); 0442 if(m_text.isEmpty()) { 0443 myLog() << "No data returned"; 0444 stop(); 0445 return; 0446 } 0447 // see bug 319662. If fetcher is cancelled, job is killed 0448 // if the pointer is retained, it gets double-deleted 0449 m_job = nullptr; 0450 0451 #if 0 0452 myWarning() << "Remove debug from imdbfetcher.cpp for /tmp/testimdbresults.html"; 0453 QFile f(QString::fromLatin1("/tmp/testimdbresults.html")); 0454 if(f.open(QIODevice::WriteOnly)) { 0455 QTextStream t(&f); 0456 t.setCodec("UTF-8"); 0457 t << m_text; 0458 } 0459 f.close(); 0460 #endif 0461 0462 // a single result was found if we got redirected 0463 switch(request().key()) { 0464 case Title: 0465 if(m_redirected) { 0466 parseSingleTitleResult(); 0467 } else { 0468 parseMultipleTitleResults(); 0469 } 0470 break; 0471 0472 case Raw: 0473 parseSingleTitleResult(); 0474 break; 0475 0476 default: 0477 myWarning() << "skipping results"; 0478 break; 0479 } 0480 } 0481 0482 void IMDBFetcher::parseSingleTitleResult() { 0483 s_titleRx->indexIn(Tellico::decodeHTML(m_text)); 0484 // split title at parenthesis 0485 const QString cap1 = s_titleRx->cap(1); 0486 int pPos = cap1.indexOf(QLatin1Char('(')); 0487 // FIXME: maybe remove parentheses here? 0488 FetchResult* r = new FetchResult(this, 0489 pPos == -1 ? cap1 : cap1.left(pPos), 0490 pPos == -1 ? QString() : cap1.mid(pPos)); 0491 // IMDB returns different HTML for single title results and has a query in the url 0492 // clear the query so we download the "canonical" page for the title 0493 QUrl url(m_url); 0494 url.setQuery(QString()); 0495 m_matches.insert(r->uid, url); 0496 m_allMatches.insert(r->uid, url); 0497 emit signalResultFound(r); 0498 0499 m_hasMoreResults = false; 0500 stop(); 0501 } 0502 0503 void IMDBFetcher::parseMultipleTitleResults() { 0504 QString output = Tellico::decodeHTML(m_text); 0505 0506 const LangData& data = langData(m_lang); 0507 // IMDb can return three title lists, popular, exact, and partial 0508 // the popular titles are in the first table 0509 int pos_popular = output.indexOf(data.title_popular, 0, Qt::CaseInsensitive); 0510 int pos_exact = output.indexOf(data.match_exact, qMax(pos_popular, 0), Qt::CaseInsensitive); 0511 int pos_partial = output.indexOf(data.match_partial, qMax(pos_exact, 0), Qt::CaseInsensitive); 0512 int pos_approx = output.indexOf(data.match_approx, qMax(pos_partial, 0), Qt::CaseInsensitive); 0513 0514 int end_popular = pos_exact; // keep track of where to end 0515 if(end_popular == -1) { 0516 end_popular = pos_partial == -1 ? (pos_approx == -1 ? output.length() : pos_approx) : pos_partial; 0517 } 0518 int end_exact = pos_partial; // keep track of where to end 0519 if(end_exact == -1) { 0520 end_exact = pos_approx == -1 ? output.length() : pos_approx; 0521 } 0522 int end_partial = pos_approx; // keep track of where to end 0523 if(end_partial == -1) { 0524 end_partial = output.length(); 0525 } 0526 0527 // if found popular matches 0528 if(pos_popular > -1) { 0529 m_popularTitles = output.mid(pos_popular, end_popular-pos_popular); 0530 } 0531 // if found exact matches 0532 if(pos_exact > -1) { 0533 m_exactTitles = output.mid(pos_exact, end_exact-pos_exact); 0534 } 0535 if(pos_partial > -1) { 0536 m_partialTitles = output.mid(pos_partial, end_partial-pos_partial); 0537 } 0538 if(pos_approx > -1) { 0539 m_approxTitles = output.mid(pos_approx); 0540 } 0541 0542 parseTitleBlock(m_popularTitles); 0543 // if the offset is 0, then we need to be looking at the next block 0544 m_currentTitleBlock = m_countOffset == 0 ? Exact : Popular; 0545 0546 if(m_matches.size() < m_limit) { 0547 parseTitleBlock(m_exactTitles); 0548 m_currentTitleBlock = m_countOffset == 0 ? Partial : Exact; 0549 } 0550 0551 if(m_matches.size() < m_limit) { 0552 parseTitleBlock(m_partialTitles); 0553 m_currentTitleBlock = m_countOffset == 0 ? Approx : Partial; 0554 } 0555 0556 if(m_matches.size() < m_limit) { 0557 parseTitleBlock(m_approxTitles); 0558 m_currentTitleBlock = m_countOffset == 0 ? Unknown : Approx; 0559 } 0560 0561 // last resort 0562 if(m_matches.size() < m_limit) { 0563 const int pos_header = output.indexOf(QStringLiteral("ipc-page-content-container")); 0564 const int end_header = output.indexOf(QStringLiteral("cornerstone"), qMax(0, pos_header)); 0565 if(pos_header > -1) { 0566 parseTitleBlock(output.mid(pos_header, end_header == -1 ? output.length() : end_header)); 0567 } 0568 } 0569 0570 if(m_matches.size() == 0) { 0571 myLog() << "no matches found."; 0572 } 0573 0574 stop(); 0575 } 0576 0577 void IMDBFetcher::parseTitleBlock(const QString& str_) { 0578 if(str_.isEmpty()) { 0579 m_countOffset = 0; 0580 return; 0581 } 0582 0583 static const QRegularExpression akaRx(QStringLiteral("%1 (.*?)(</li>|</td>|<br)").arg(langData(m_lang).aka), 0584 QRegularExpression::CaseInsensitiveOption); 0585 m_hasMoreResults = false; 0586 0587 int count = 0; 0588 int start = s_anchorTitleRx->indexIn(str_); 0589 while(m_started && start > -1) { 0590 // split title at parenthesis 0591 const QString cap1 = s_anchorTitleRx->cap(1); // the anchor url 0592 const QString cap2 = s_anchorTitleRx->cap(2).trimmed(); // the anchor text 0593 start += s_anchorTitleRx->matchedLength(); 0594 int pPos = cap2.indexOf(QLatin1Char('(')); // if it has parentheses, use that for description 0595 QString desc; 0596 if(pPos > -1) { 0597 int pPos2 = cap2.indexOf(QLatin1Char(')'), pPos+1); 0598 if(pPos2 > -1) { 0599 desc = cap2.mid(pPos+1, pPos2-pPos-1); 0600 } 0601 } else { 0602 // parenthesis might be outside anchor tag 0603 int end = s_anchorTitleRx->indexIn(str_, start); 0604 const int end2 = str_.indexOf(QStringLiteral("<img"), start); 0605 const int end3 = str_.indexOf(QStringLiteral("</ul"), start); 0606 if(end2 > -1) end = qMin(end, end2); 0607 if(end3 > -1) end = qMin(end, end3); 0608 if(end == -1) { 0609 end = str_.length(); 0610 } 0611 const QString text = str_.mid(start, end-start); 0612 pPos = text.indexOf(QLatin1Char('(')); 0613 if(pPos > -1) { 0614 const int pNewLine = text.indexOf(QStringLiteral("<br")); 0615 if(pNewLine == -1 || pPos < pNewLine) { 0616 const int pPos2 = text.indexOf(QLatin1Char(')'), pPos); 0617 desc = text.mid(pPos+1, pPos2-pPos-1); 0618 } 0619 // IMDB occasionally has (I) in results. If so, continue parsing string 0620 if(desc == QStringLiteral("I") || desc == QStringLiteral("II")) { 0621 pPos = text.indexOf(QLatin1Char('('), pPos+1); 0622 if(pPos > -1 && (pNewLine == -1 || pPos < pNewLine)) { 0623 const int pPos2 = text.indexOf(QLatin1Char(')'), pPos); 0624 desc = text.mid(pPos+1, pPos2-pPos-1); 0625 } 0626 } 0627 pPos = -1; 0628 } else { 0629 static const QRegularExpression digitsRx(QStringLiteral(">([-–\\d]+)\\s*<")); 0630 QRegularExpressionMatch digitsMatch = digitsRx.match(text); 0631 if(digitsMatch.hasMatch()) { 0632 desc = digitsMatch.captured(1); 0633 } 0634 } 0635 } 0636 auto akaMatch = akaRx.match(str_, start+1, QRegularExpression::NormalMatch); 0637 if(akaMatch.hasMatch()) { 0638 // limit to 50 chars 0639 desc += QLatin1Char(' ') + akaMatch.captured(1).trimmed().remove(*s_tagRx); 0640 if(desc.length() > 50) { 0641 desc = desc.left(50) + QStringLiteral("..."); 0642 } 0643 } 0644 0645 start = s_anchorTitleRx->indexIn(str_, start); 0646 0647 if(count < m_countOffset) { 0648 ++count; 0649 continue; 0650 } 0651 0652 // if we got this far, then there is a valid result 0653 if(m_matches.size() >= m_limit) { 0654 m_hasMoreResults = true; 0655 break; 0656 } 0657 0658 FetchResult* r = new FetchResult(this, pPos == -1 ? cap2 : cap2.left(pPos), desc); 0659 QUrl u = QUrl(m_url).resolved(QUrl(cap1)); 0660 u.setQuery(QString()); 0661 m_matches.insert(r->uid, u); 0662 m_allMatches.insert(r->uid, u); 0663 emit signalResultFound(r); 0664 ++count; 0665 } 0666 if(!m_hasMoreResults && m_currentTitleBlock != Partial) { 0667 m_hasMoreResults = true; 0668 } 0669 m_countOffset = m_matches.size() < m_limit ? 0 : count; 0670 } 0671 0672 Tellico::Data::EntryPtr IMDBFetcher::fetchEntryHook(uint uid_) { 0673 // if we already grabbed this one, then just pull it out of the dict 0674 Data::EntryPtr entry = m_entries[uid_]; 0675 if(entry) { 0676 return entry; 0677 } 0678 0679 if(!m_matches.contains(uid_) && !m_allMatches.contains(uid_)) { 0680 myLog() << "no url found"; 0681 return Data::EntryPtr(); 0682 } 0683 QUrl url = m_matches.contains(uid_) ? m_matches[uid_] 0684 : m_allMatches[uid_]; 0685 static const QRegularExpression ttEndRx(QStringLiteral("/tt\\d+/$")); 0686 if(m_lang == EN && url.path().contains(ttEndRx)) { 0687 url.setPath(url.path() + QStringLiteral("reference")); 0688 } 0689 0690 QUrl origURL = m_url; // keep to switch back 0691 QString results; 0692 // if the url matches the current one, no need to redownload it 0693 if(url == m_url) { 0694 results = Tellico::decodeHTML(m_text); 0695 } else { 0696 // now it's synchronous 0697 // be quiet about failure 0698 QPointer<KIO::StoredTransferJob> getJob = KIO::storedGet(url, KIO::NoReload, KIO::HideProgressInfo); 0699 configureJob(getJob); 0700 if(!getJob->exec()) { 0701 myWarning() << "...unable to read" << url; 0702 return Data::EntryPtr(); 0703 } 0704 results = Tellico::fromHtmlData(getJob->data(), "UTF-8"); 0705 m_url = url; // needed for processing 0706 #if 0 0707 myWarning() << "Remove debug from imdbfetcher.cpp for /tmp/testimdbresult.html"; 0708 myDebug() << m_url; 0709 QFile f(QStringLiteral("/tmp/testimdbresult.html")); 0710 if(f.open(QIODevice::WriteOnly)) { 0711 QTextStream t(&f); 0712 t << results; 0713 } 0714 f.close(); 0715 #endif 0716 results = Tellico::decodeHTML(results); 0717 } 0718 if(results.isEmpty()) { 0719 myLog() << "no text results"; 0720 m_url = origURL; 0721 return Data::EntryPtr(); 0722 } 0723 0724 entry = parseEntry(results); 0725 m_url = origURL; 0726 if(!entry) { 0727 myDebug() << "error in processing entry"; 0728 return Data::EntryPtr(); 0729 } 0730 m_entries.insert(uid_, entry); // keep for later 0731 return entry; 0732 } 0733 0734 Tellico::Data::EntryPtr IMDBFetcher::parseEntry(const QString& str_) { 0735 Data::CollPtr coll(new Data::VideoCollection(true)); 0736 Data::EntryPtr entry(new Data::Entry(coll)); 0737 0738 doJson(str_, entry); 0739 0740 doTitle(str_, entry); 0741 doRunningTime(str_, entry); 0742 doAspectRatio(str_, entry); 0743 doAlsoKnownAs(str_, entry); 0744 doPlot(str_, entry, m_url); 0745 if(m_lang == EN) { 0746 doLists(str_, entry); 0747 } else { 0748 doLists2(str_, entry); 0749 } 0750 doStudio(str_, entry); 0751 doPerson(str_, entry, langData(m_lang).director, QStringLiteral("director")); 0752 doPerson(str_, entry, langData(m_lang).writer, QStringLiteral("writer")); 0753 doPerson(str_, entry, langData(m_lang).composer, QStringLiteral("composer")); 0754 doRating(str_, entry); 0755 doCast(str_, entry, m_url); 0756 if(m_fetchImages) { 0757 // needs base URL 0758 doCover(str_, entry, m_url); 0759 } 0760 if(optionalFields().contains(QStringLiteral("episode"))) { 0761 doEpisodes(str_, entry, m_url); 0762 } 0763 0764 const QString imdb = QStringLiteral("imdb"); 0765 if(!coll->hasField(imdb) && optionalFields().contains(imdb)) { 0766 coll->addField(Data::Field::createDefaultField(Data::Field::ImdbField)); 0767 } 0768 if(coll->hasField(imdb) && coll->fieldByName(imdb)->type() == Data::Field::URL) { 0769 m_url.setQuery(QString()); 0770 // we want to strip the "/reference" from the url 0771 QString url = m_url.url(); 0772 if(url.endsWith(QStringLiteral("/reference"))) { 0773 url = m_url.adjusted(QUrl::RemoveFilename).url(); 0774 } 0775 entry->setField(imdb, url); 0776 } 0777 return entry; 0778 } 0779 0780 void IMDBFetcher::doJson(const QString& str_, Tellico::Data::EntryPtr entry_) { 0781 static const QRegularExpression jsonRx(QStringLiteral("<script[^>]+?type=\"application/ld\\+json\".*?>(.+?)</script>")); 0782 QRegularExpressionMatch jsonMatch = jsonRx.match(str_); 0783 if(!jsonMatch.hasMatch()) { 0784 return; 0785 } 0786 0787 QJsonParseError parseError; 0788 QJsonDocument doc = QJsonDocument::fromJson(jsonMatch.captured(1).toUtf8(), &parseError); 0789 if(doc.isNull()) { 0790 myDebug() << "Bad json data:" << parseError.errorString(); 0791 return; 0792 } 0793 0794 QVariantMap objectMap = doc.object().toVariantMap(); 0795 entry_->setField(QStringLiteral("title"), mapValue(objectMap, "name")); 0796 entry_->setField(QStringLiteral("director"), mapValue(objectMap, "director", "name")); 0797 entry_->setField(QStringLiteral("plot"), mapValue(objectMap, "description")); 0798 entry_->setField(QStringLiteral("genre"), mapValue(objectMap, "genre")); 0799 0800 QStringList writers; 0801 foreach(QVariant v, objectMap.value(QStringLiteral("creator")).toList()) { 0802 auto vmap = v.toMap(); 0803 if(vmap.value(QLatin1String("@type")) == QLatin1String("Person")) { 0804 writers += vmap.value(QLatin1String("name")).toString(); 0805 } 0806 } 0807 entry_->setField(QStringLiteral("writer"), writers.join(FieldFormat::delimiterString())); 0808 0809 QString cert = mapValue(objectMap, "contentRating"); 0810 // set default certification, assuming US for now 0811 if(cert == QStringLiteral("Unrated")) { 0812 cert = QLatin1Char('U'); 0813 } 0814 cert += QStringLiteral(" (USA)"); 0815 const QStringList& certsAllowed = entry_->collection()->fieldByName(QStringLiteral("certification"))->allowed(); 0816 if(certsAllowed.contains(cert)) { 0817 entry_->setField(QStringLiteral("certification"), cert); 0818 } 0819 0820 const QString imageUrl = mapValue(objectMap,"image"); 0821 if(!imageUrl.isEmpty()) { 0822 QString id = ImageFactory::addImage(QUrl::fromUserInput(imageUrl), true); 0823 if(!id.isEmpty()) { 0824 entry_->setField(QStringLiteral("cover"), id); 0825 } 0826 } 0827 0828 if(optionalFields().contains(QStringLiteral("imdb-rating"))) { 0829 if(!entry_->collection()->hasField(QStringLiteral("imdb-rating"))) { 0830 Data::FieldPtr f(new Data::Field(QStringLiteral("imdb-rating"), i18n("IMDb Rating"), Data::Field::Rating)); 0831 f->setCategory(i18n("General")); 0832 f->setProperty(QStringLiteral("maximum"), QStringLiteral("10")); 0833 entry_->collection()->addField(f); 0834 } 0835 0836 const QString ratingString = mapValue(objectMap, "aggregateRating", "ratingValue"); 0837 bool ok = true; 0838 float value = ratingString.toFloat(&ok); 0839 if(!ok) { 0840 value = QLocale().toFloat(ratingString, &ok); 0841 } 0842 if(ok) { 0843 entry_->setField(QStringLiteral("imdb-rating"), QString::number(value)); 0844 } 0845 } 0846 } 0847 0848 void IMDBFetcher::doTitle(const QString& str_, Tellico::Data::EntryPtr entry_) { 0849 if(s_titleRx->indexIn(str_) > -1) { 0850 const QString cap1 = s_titleRx->cap(1); 0851 // titles always have parentheses 0852 int pPos = cap1.indexOf(QLatin1Char('(')); 0853 QString title = cap1.left(pPos).trimmed(); 0854 // remove first and last quotes is there 0855 if(title.startsWith(QLatin1Char('"')) && title.endsWith(QLatin1Char('"'))) { 0856 title = title.mid(1, title.length()-2); 0857 } 0858 entry_->setField(QStringLiteral("title"), title); 0859 0860 // now for movies with original non-english titles, the <title> is english 0861 // but the page header is the original title. Grab the orig title 0862 static const QRegularExpression h3TitleRx(QStringLiteral("<h3[^>]+itemprop=\"name\"\\s*>(.*?)<"), 0863 QRegularExpression::DotMatchesEverythingOption); 0864 auto h3Match = h3TitleRx.match(str_); 0865 if(h3Match.hasMatch()) { 0866 QString possibleOrigTitle; 0867 const QString h3Title = h3Match.captured(1).trimmed(); 0868 if(h3Title == title) { 0869 // some tv series have a original title label 0870 static const QRegularExpression origTitleRx(QLatin1String("/h3>(.*?)<span class=\"titlereference-original-title-label"), 0871 QRegularExpression::DotMatchesEverythingOption); 0872 auto origTitleMatch = origTitleRx.match(str_); 0873 if(origTitleMatch.hasMatch()) { 0874 possibleOrigTitle = origTitleMatch.captured(1).trimmed(); 0875 } 0876 } else { 0877 // mis-matching titles. If the user has requested original title, 0878 // put it in origtitle field and keep english as title 0879 // otherwise replace 0880 if(optionalFields().contains(QStringLiteral("origtitle"))) { 0881 possibleOrigTitle = h3Title; 0882 } else { 0883 entry_->setField(QStringLiteral("title"), h3Title); 0884 } 0885 } 0886 if(!possibleOrigTitle.isEmpty() && optionalFields().contains(QStringLiteral("origtitle"))) { 0887 Data::FieldPtr f(new Data::Field(QStringLiteral("origtitle"), i18n("Original Title"))); 0888 f->setFormatType(FieldFormat::FormatTitle); 0889 entry_->collection()->addField(f); 0890 entry_->setField(QStringLiteral("origtitle"), possibleOrigTitle); 0891 } 0892 } 0893 0894 // remove parentheses and extract year, tv shows can have (TV Series 2002-2003) for example 0895 int pPos2 = pPos+1; 0896 // find the closing parenthesis 0897 while(pPos2 < cap1.length() && cap1[pPos2] != QLatin1Char(')')) { 0898 ++pPos2; 0899 } 0900 const auto inParentheses = cap1.midRef(pPos+1, pPos2-pPos-1); 0901 if(!inParentheses.isEmpty()) { 0902 static const QRegularExpression yearRx(QLatin1String("\\d{4}")); // ignore ending year for tv series 0903 auto match = yearRx.match(inParentheses); 0904 if(match.hasMatch()) { 0905 entry_->setField(QStringLiteral("year"), match.captured()); 0906 } 0907 } 0908 } 0909 } 0910 0911 void IMDBFetcher::doRunningTime(const QString& str_, Tellico::Data::EntryPtr entry_) { 0912 // running time 0913 QRegExp runtimeRx(langData(m_lang).runtime, Qt::CaseInsensitive); 0914 runtimeRx.setMinimal(true); 0915 0916 QString text = str_; 0917 text.remove(*s_tagRx); 0918 if(runtimeRx.indexIn(text) > -1) { 0919 if(m_lang == EN) { 0920 entry_->setField(QStringLiteral("running-time"), runtimeRx.cap(1)); 0921 } 0922 else { 0923 const int hours = runtimeRx.cap(1).toInt(); 0924 const int minutes = runtimeRx.cap(2).toInt(); 0925 entry_->setField(QStringLiteral("running-time"), QString::number(hours*60+minutes)); 0926 } 0927 } 0928 } 0929 0930 void IMDBFetcher::doAspectRatio(const QString& str_, Tellico::Data::EntryPtr entry_) { 0931 QRegExp rx(QStringLiteral("%1.*([\\d\\.\\,]+\\s*:\\s*[\\d\\.\\,]+)").arg(langData(m_lang).aspect_ratio), Qt::CaseInsensitive); 0932 rx.setMinimal(true); 0933 0934 if(rx.indexIn(str_) > -1) { 0935 entry_->setField(QStringLiteral("aspect-ratio"), rx.cap(1).trimmed()); 0936 } 0937 } 0938 0939 void IMDBFetcher::doAlsoKnownAs(const QString& str_, Tellico::Data::EntryPtr entry_) { 0940 if(!optionalFields().contains(QStringLiteral("alttitle"))) { 0941 return; 0942 } 0943 0944 // match until next b tag 0945 // QRegExp akaRx(QStringLiteral("also known as(.*)<b(?:\\s.*)?>")); 0946 QRegExp akaRx(QStringLiteral("%1(.*)(<a|<span)[>\\s/]").arg(langData(m_lang).also_known_as), Qt::CaseInsensitive); 0947 akaRx.setMinimal(true); 0948 0949 if(akaRx.indexIn(str_) > -1 && !akaRx.cap(1).isEmpty()) { 0950 Data::FieldPtr f = entry_->collection()->fieldByName(QStringLiteral("alttitle")); 0951 if(!f) { 0952 f = new Data::Field(QStringLiteral("alttitle"), i18n("Alternative Titles"), Data::Field::Table); 0953 f->setFormatType(FieldFormat::FormatTitle); 0954 entry_->collection()->addField(f); 0955 } 0956 0957 // split by </li> 0958 QStringList list = akaRx.cap(1).split(QStringLiteral("</li>")); 0959 // lang could be included with [fr] 0960 // const QRegExp parRx(QStringLiteral("\\(.+\\)")); 0961 const QRegExp brackRx(QStringLiteral("\\[\\w+\\]")); 0962 const QRegExp countryRx(QStringLiteral("\\s*\\(.+\\)\\s*$")); 0963 QStringList values; 0964 for(QStringList::Iterator it = list.begin(); it != list.end(); ++it) { 0965 // sometimes the regexp doesn't work and grabs too much text 0966 // limit to reasonable length 0967 QString s = (*it).left(1000); 0968 // sometimes, the word "more" gets linked to the releaseinfo page, check that 0969 if(s.contains(QStringLiteral("releaseinfo"))) { 0970 continue; 0971 } 0972 s.remove(*s_tagRx); 0973 s.remove(brackRx); 0974 // remove country 0975 s.remove(countryRx); 0976 s.remove(QLatin1Char('"')); 0977 s = s.trimmed(); 0978 // the first value ends up being or starting with the colon after "Also known as" 0979 // I'm too lazy to figure out a better regexp 0980 if(s.startsWith(QLatin1Char(':'))) { 0981 s = s.mid(1); 0982 s = s.trimmed(); 0983 } 0984 if(!s.isEmpty()) { 0985 values += s; 0986 } 0987 } 0988 if(!values.isEmpty()) { 0989 entry_->setField(QStringLiteral("alttitle"), values.join(FieldFormat::rowDelimiterString())); 0990 } 0991 // } else { 0992 // myLog() << "'Also Known As' not found"; 0993 } 0994 } 0995 0996 void IMDBFetcher::doPlot(const QString& str_, Tellico::Data::EntryPtr entry_, const QUrl& baseURL_) { 0997 if(!entry_->field(QStringLiteral("plot")).isEmpty()) return; 0998 // before using localized plot string, look for DOM component 0999 const QRegularExpression sectionRx(QStringLiteral("<section class=\"titlereference-section-overview\">(.+?)</div"), 1000 QRegularExpression::DotMatchesEverythingOption); 1001 auto sectionMatch = sectionRx.match(str_); 1002 if(sectionMatch.hasMatch()) { 1003 QString thisPlot = sectionMatch.captured(1); 1004 // TV Series include the episode link first, before the plot, so don't be fooled 1005 if(!thisPlot.contains(QLatin1String("<a href"))) { 1006 thisPlot.remove(*s_tagRx); // remove HTML tags 1007 entry_->setField(QStringLiteral("plot"), thisPlot.simplified()); 1008 return; 1009 } 1010 } 1011 1012 // plot summaries provided by users are on a separate page 1013 // should those be preferred? 1014 bool useUserSummary = false; 1015 1016 // match until next <p> tag 1017 QString plotRxStr = langData(m_lang).plot + QStringLiteral("(.*)</(p|div|li)"); 1018 QRegExp plotRx(plotRxStr, Qt::CaseInsensitive); 1019 plotRx.setMinimal(true); 1020 const QRegularExpression plotUrlRx(QStringLiteral("<a\\s+?[^>]*href\\s*=\\s*\"[^\"]*?/title/[^\"]*?/plotsummary\""), 1021 QRegularExpression::CaseInsensitiveOption); 1022 if(plotRx.indexIn(str_) > -1) { 1023 QString thisPlot = plotRx.cap(2); 1024 // if ends with "Written by", remove it. It has an em tag 1025 thisPlot.remove(QRegExp(QStringLiteral("<em class=\"nobr\".*</em>"))); 1026 thisPlot.remove(*s_tagRx); // remove HTML tags 1027 thisPlot = thisPlot.simplified(); 1028 // if thisPlot ends with (more) or contains 1029 // a url that ends with plotsummary, then we'll grab it, otherwise not 1030 if(thisPlot.isEmpty() || 1031 plotRx.cap(0).endsWith(QStringLiteral("(more)</")) || 1032 plotRx.cap(0).contains(plotUrlRx)) { 1033 useUserSummary = true; 1034 } else { 1035 entry_->setField(QStringLiteral("plot"), thisPlot); 1036 } 1037 } else { 1038 useUserSummary = true; 1039 } 1040 1041 if(useUserSummary) { 1042 auto idMatch = s_titleIdRx->match(baseURL_.path()); 1043 Q_ASSERT(idMatch.hasMatch()); 1044 QUrl plotURL = baseURL_; 1045 plotURL.setPath(QStringLiteral("/title/") + idMatch.captured(1) + QStringLiteral("/plotsummary")); 1046 QPointer<KIO::StoredTransferJob> getJob = KIO::storedGet(plotURL, KIO::NoReload, KIO::HideProgressInfo); 1047 configureJob(getJob); 1048 if(!getJob->exec()) { 1049 myWarning() << "...unable to read" << plotURL; 1050 } 1051 QString plotPage = Tellico::fromHtmlData(getJob->data(), "UTF-8"); 1052 1053 if(!plotPage.isEmpty()) { 1054 const QRegularExpression plotRx1(QStringLiteral("id=\"plot-summaries-content\">(.+)</p"), 1055 QRegularExpression::DotMatchesEverythingOption); 1056 QString userPlot; 1057 auto plotMatch = plotRx1.match(plotPage); 1058 if(plotMatch.hasMatch()) { 1059 userPlot = plotMatch.captured(1); 1060 } else { 1061 const QRegularExpression plotRx2(QStringLiteral("<div\\s+id\\s*=\\s*\"swiki.2.1\">(.+?)</d"), 1062 QRegularExpression::DotMatchesEverythingOption); 1063 plotMatch = plotRx2.match(plotPage); 1064 if(plotMatch.hasMatch()) { 1065 userPlot = plotMatch.captured(1); 1066 } 1067 } 1068 userPlot.remove(*s_tagRx); // remove HTML tags 1069 // remove last little "written by", if there 1070 userPlot.remove(QRegExp(QStringLiteral("\\s*written by.*$"), Qt::CaseInsensitive)); 1071 if(!userPlot.isEmpty()) { 1072 entry_->setField(QStringLiteral("plot"), Tellico::decodeHTML(userPlot.simplified())); 1073 } 1074 } 1075 } 1076 // myDebug() << "Plot:" << entry_->field(QStringLiteral("plot")); 1077 } 1078 1079 void IMDBFetcher::doStudio(const QString& str_, Tellico::Data::EntryPtr entry_) { 1080 // match until next opening tag 1081 // QRegExp productionRx(langData(m_lang).studio, Qt::CaseInsensitive); 1082 QRegExp productionRx(langData(m_lang).studio); 1083 productionRx.setMinimal(true); 1084 1085 const int pos1 = str_.indexOf(productionRx); 1086 if(pos1 == -1) { 1087 // myLog() << "No studio found"; 1088 return; 1089 } 1090 1091 int pos2 = str_.indexOf(QStringLiteral("blackcatheader"), pos1, Qt::CaseInsensitive); 1092 if(pos2 == -1) { 1093 pos2 = str_.length(); 1094 } 1095 // stop matching when getting to Distributors 1096 int pos3 = str_.indexOf(QStringLiteral("Distributors"), pos1); 1097 if(pos3 > -1 && pos3 < pos2) { 1098 pos2 = pos3; 1099 } 1100 1101 const QString text = str_.mid(pos1, pos2-pos1); 1102 const QString company = QStringLiteral("/company/"); 1103 QStringList studios; 1104 for(int pos = s_anchorRx->indexIn(text); pos > -1; pos = s_anchorRx->indexIn(text, pos+s_anchorRx->matchedLength())) { 1105 const QString cap1 = s_anchorRx->cap(1); 1106 if(cap1.contains(company)) { 1107 studios += s_anchorRx->cap(2).trimmed(); 1108 } 1109 } 1110 1111 entry_->setField(QStringLiteral("studio"), studios.join(FieldFormat::delimiterString())); 1112 } 1113 1114 void IMDBFetcher::doPerson(const QString& str_, Tellico::Data::EntryPtr entry_, 1115 const QString& imdbHeader_, const QString& fieldName_) { 1116 // only read if the field value is currently empty 1117 if(!entry_->field(fieldName_).isEmpty()) return; 1118 QRegExp br2Rx(QStringLiteral("<br[\\s/]*>\\s*<br[\\s/]*>"), Qt::CaseInsensitive); 1119 br2Rx.setMinimal(true); 1120 QRegExp divRx(QStringLiteral("<div\\s[^>]*class\\s*=\\s*\"(?:ipl-header__content|info|txt-block)\"[^>]*>(.*)</table"), Qt::CaseInsensitive); 1121 divRx.setMinimal(true); 1122 1123 const QString name = QStringLiteral("/name/"); 1124 QStringList people; 1125 for(int pos = str_.indexOf(divRx); pos > -1; pos = str_.indexOf(divRx, pos+divRx.matchedLength())) { 1126 const QString infoBlock = divRx.cap(1); 1127 if(infoBlock.contains(imdbHeader_, Qt::CaseInsensitive)) { 1128 int pos2 = s_anchorRx->indexIn(infoBlock); 1129 while(pos2 > -1) { 1130 if(s_anchorRx->cap(1).contains(name)) { 1131 people += s_anchorRx->cap(2).trimmed(); 1132 } 1133 pos2 = s_anchorRx->indexIn(infoBlock, pos2+s_anchorRx->matchedLength()); 1134 } 1135 break; 1136 } 1137 } 1138 if(!people.isEmpty()) { 1139 people.removeDuplicates(); 1140 entry_->setField(fieldName_, people.join(FieldFormat::delimiterString())); 1141 } 1142 } 1143 1144 void IMDBFetcher::doCast(const QString& str_, Tellico::Data::EntryPtr entry_, const QUrl& baseURL_) { 1145 // the extended cast list is on a separate page 1146 // that's usually a lot of people 1147 // but since it can be in billing order, the main actors might not 1148 // be in the short list 1149 auto idMatch = s_titleIdRx->match(baseURL_.path()); 1150 Q_ASSERT(idMatch.hasMatch()); 1151 QUrl castURL = baseURL_; 1152 castURL.setPath(QStringLiteral("/title/") + idMatch.captured(1) + QStringLiteral("/fullcredits")); 1153 1154 // be quiet about failure and be sure to translate entities 1155 QPointer<KIO::StoredTransferJob> getJob = KIO::storedGet(castURL, KIO::NoReload, KIO::HideProgressInfo); 1156 configureJob(getJob); 1157 if(!getJob->exec()) { 1158 myWarning() << "...unable to read" << castURL; 1159 } 1160 const QString castPage = Tellico::decodeHTML(Tellico::fromHtmlData(getJob->data(), "UTF-8")); 1161 #if 0 1162 myWarning() << "Remove debug from imdbfetcher.cpp (/tmp/testimdbcast.html)"; 1163 QFile f(QString::fromLatin1("/tmp/testimdbcast.html")); 1164 if(f.open(QIODevice::WriteOnly)) { 1165 QTextStream t(&f); 1166 t << castPage; 1167 } 1168 f.close(); 1169 #endif 1170 1171 const LangData& data = langData(m_lang); 1172 1173 int pos = -1; 1174 // the text to search, depends on which page is being read 1175 QString castText = castPage; 1176 if(castText.isEmpty()) { 1177 // fall back to short list 1178 castText = str_; 1179 pos = castText.indexOf(data.cast1, 0, Qt::CaseInsensitive); 1180 if(pos == -1) { 1181 pos = castText.indexOf(data.cast2, 0, Qt::CaseInsensitive); 1182 } 1183 } else { 1184 // first look for anchor 1185 QRegExp castAnchorRx(QStringLiteral("<a\\s+name\\s*=\\s*\"cast\""), Qt::CaseInsensitive); 1186 pos = castAnchorRx.indexIn(castText); 1187 if(pos < 0) { 1188 QRegExp tableClassRx(QStringLiteral("<table\\s+class\\s*=\\s*\"cast_list\""), Qt::CaseInsensitive); 1189 pos = tableClassRx.indexIn(castText); 1190 if(pos < 0) { 1191 // fragile, the word "cast" appears in the title, but need to find 1192 // the one right above the actual cast table 1193 // for TV shows, there's a link on the sidebar for "episodes case" 1194 // so need to not match that one 1195 const QString castEnd = data.cast + QStringLiteral("</"); 1196 pos = castText.indexOf(castEnd, 0, Qt::CaseInsensitive); 1197 if(pos > 9) { 1198 // back up 9 places 1199 if(castText.midRef(pos-9, 9).startsWith(data.episodes)) { 1200 // find next cast list 1201 pos = castText.indexOf(castEnd, pos+6, Qt::CaseInsensitive); 1202 } 1203 } 1204 } 1205 } 1206 } 1207 if(pos == -1) { // no cast list found 1208 myLog() << "no cast list found"; 1209 return; 1210 } 1211 // loop until closing table tag 1212 int endPos = castText.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive); 1213 castText = castText.mid(pos, endPos-pos+1); 1214 1215 QStringList actorList, characterList; 1216 QRegularExpression tdActorRx(QStringLiteral("<td>.*?<a href=\"/name.+?\".*?>(.+?)</a"), 1217 QRegularExpression::DotMatchesEverythingOption); 1218 QRegularExpression tdCharRx(QStringLiteral("<td class=\"character\">(.+?)</"), 1219 QRegularExpression::DotMatchesEverythingOption); 1220 1221 QRegularExpressionMatchIterator i = tdActorRx.globalMatch(castText); 1222 while(i.hasNext()) { 1223 QRegularExpressionMatch match = i.next(); 1224 actorList += match.captured(1).simplified(); 1225 } 1226 i = tdCharRx.globalMatch(castText); 1227 while(i.hasNext()) { 1228 QRegularExpressionMatch match = i.next(); 1229 characterList += match.captured(1).remove(*s_tagRx).simplified(); 1230 } 1231 1232 // sanity check 1233 while(characterList.length() > actorList.length()) { 1234 myDebug() << "Too many characters"; 1235 characterList.removeLast(); 1236 } 1237 while(characterList.length() < actorList.length()) { 1238 characterList += QString(); 1239 } 1240 1241 QStringList cast; 1242 cast.reserve(actorList.size()); 1243 for(int i = 0; i < actorList.size(); ++i) { 1244 cast += actorList.at(i) 1245 + FieldFormat::columnDelimiterString() 1246 + characterList.at(i); 1247 if(cast.count() >= m_numCast) { 1248 break; 1249 } 1250 } 1251 1252 if(cast.isEmpty()) { 1253 QRegExp tdRx(QStringLiteral("<td[^>]*>(.*)</td>"), Qt::CaseInsensitive); 1254 tdRx.setMinimal(true); 1255 1256 QRegExp tdActorRx(QStringLiteral("<td\\s+[^>]*itemprop=\"actor\"[^>]*>(.*)</td>"), Qt::CaseInsensitive); 1257 tdActorRx.setMinimal(true); 1258 1259 QRegExp tdCharRx(QStringLiteral("<td\\s+[^>]*class=\"character\"[^>]*>(.*)</td>"), Qt::CaseInsensitive); 1260 tdCharRx.setMinimal(true); 1261 1262 pos = tdActorRx.indexIn(castText); 1263 while(pos > -1 && cast.count() < m_numCast) { 1264 QString actorText = tdActorRx.cap(1).remove(*s_tagRx).simplified(); 1265 const int pos2 = tdCharRx.indexIn(castText, pos+1); 1266 if(pos2 > -1) { 1267 cast += actorText 1268 + FieldFormat::columnDelimiterString() 1269 + tdCharRx.cap(1).remove(*s_tagRx).simplified(); 1270 } 1271 pos = tdActorRx.indexIn(castText, qMax(pos+1, pos2)); 1272 } 1273 } 1274 1275 if(!cast.isEmpty()) { 1276 entry_->setField(QStringLiteral("cast"), cast.join(FieldFormat::rowDelimiterString())); 1277 } 1278 1279 // also do other items from fullcredits page, like producer 1280 pos = castPage.indexOf(QLatin1String("id=\"producer\""), 0, Qt::CaseInsensitive); 1281 if(pos > -1) { 1282 int endPos = castPage.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive); 1283 if(endPos == -1) { 1284 endPos = castPage.length(); 1285 } 1286 const QString prodText = castPage.mid(pos, endPos-pos+1); 1287 QRegExp tdCharRx(QStringLiteral("<td\\s+[^>]*class=\"credit\"[^>]*>(.*)</td>")); 1288 tdCharRx.setMinimal(true); 1289 1290 QStringList producers; 1291 pos = s_anchorNameRx->indexIn(prodText); 1292 while(pos > -1 && producers.count() < IMDB_MAX_PERSON_COUNT) { 1293 const int pos2 = tdCharRx.indexIn(prodText, pos+1); 1294 const QString credit = tdCharRx.cap(1).trimmed(); 1295 if(pos2 > -1 && (credit.startsWith(QStringLiteral("producer")) || 1296 credit.startsWith(QStringLiteral("co-producer")) || 1297 credit.startsWith(QStringLiteral("associate producer")))) { 1298 producers += s_anchorNameRx->cap(2).trimmed(); 1299 } 1300 pos = s_anchorNameRx->indexIn(prodText, pos+1); 1301 } 1302 if(!producers.isEmpty()) { 1303 entry_->setField(QStringLiteral("producer"), producers.join(FieldFormat::delimiterString())); 1304 } 1305 } 1306 1307 const QString director = QStringLiteral("director"); 1308 // only try to read director if its already empty, which means it wasn't found on main page 1309 if(entry_->field(director).isEmpty()) { 1310 QStringList directors; 1311 pos = castPage.indexOf(QLatin1String("id=\"director\""), 0, Qt::CaseInsensitive); 1312 if(pos > -1 && directors.count() < IMDB_MAX_PERSON_COUNT) { 1313 int endPos = castPage.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive); 1314 if(endPos == -1) { 1315 endPos = castPage.length(); 1316 } 1317 const QString midText = castPage.mid(pos, endPos-pos+1); 1318 pos = s_anchorNameRx->indexIn(midText); 1319 while(pos > -1) { 1320 directors += s_anchorNameRx->cap(2).trimmed(); 1321 pos = s_anchorNameRx->indexIn(midText, pos+1); 1322 } 1323 } 1324 if(!directors.isEmpty()) { 1325 entry_->setField(director, directors.join(FieldFormat::delimiterString())); 1326 } 1327 } 1328 1329 const QString writer = QStringLiteral("writer"); 1330 // only try to read director if its already empty, which means it wasn't found on main page 1331 if(entry_->field(writer).isEmpty()) { 1332 QStringList writers; 1333 pos = castPage.indexOf(QLatin1String("id=\"writer\""), 0, Qt::CaseInsensitive); 1334 if(pos > -1 && writers.count() < IMDB_MAX_PERSON_COUNT) { 1335 int endPos = castPage.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive); 1336 if(endPos == -1) { 1337 endPos = castPage.length(); 1338 } 1339 const QString midText = castPage.mid(pos, endPos-pos+1); 1340 pos = s_anchorNameRx->indexIn(midText); 1341 while(pos > -1) { 1342 writers += s_anchorNameRx->cap(2).trimmed(); 1343 pos = s_anchorNameRx->indexIn(midText, pos+1); 1344 } 1345 } 1346 writers.removeDuplicates(); // some editor/writer duplicates 1347 if(!writers.isEmpty()) { 1348 entry_->setField(writer, writers.join(FieldFormat::delimiterString())); 1349 } 1350 } 1351 1352 const QString composer = QStringLiteral("composer"); 1353 // only try to read director if its already empty, which means it wasn't found on main page 1354 if(entry_->field(composer).isEmpty()) { 1355 QStringList composers; 1356 pos = castPage.indexOf(QLatin1String("id=\"composer\""), 0, Qt::CaseInsensitive); 1357 if(pos > -1 && composers.count() < IMDB_MAX_PERSON_COUNT) { 1358 int endPos = castPage.indexOf(QStringLiteral("</table"), pos, Qt::CaseInsensitive); 1359 if(endPos == -1) { 1360 endPos = castPage.length(); 1361 } 1362 const QString midText = castPage.mid(pos, endPos-pos+1); 1363 pos = s_anchorNameRx->indexIn(midText); 1364 while(pos > -1) { 1365 composers += s_anchorNameRx->cap(2).trimmed(); 1366 pos = s_anchorNameRx->indexIn(midText, pos+1); 1367 } 1368 } 1369 if(!composers.isEmpty()) { 1370 entry_->setField(composer, composers.join(FieldFormat::delimiterString())); 1371 } 1372 } 1373 } 1374 1375 void IMDBFetcher::doRating(const QString& str_, Tellico::Data::EntryPtr entry_) { 1376 if(!optionalFields().contains(QStringLiteral("imdb-rating"))) { 1377 return; 1378 } 1379 1380 QRegExp divRx(QStringLiteral("<div class=\"ipl-rating-star[\\s\"]+>(.*)</div"), Qt::CaseInsensitive); 1381 divRx.setMinimal(true); 1382 1383 if(divRx.indexIn(str_) > -1) { 1384 if(!entry_->collection()->hasField(QStringLiteral("imdb-rating"))) { 1385 Data::FieldPtr f(new Data::Field(QStringLiteral("imdb-rating"), i18n("IMDb Rating"), Data::Field::Rating)); 1386 f->setCategory(i18n("General")); 1387 f->setProperty(QStringLiteral("maximum"), QStringLiteral("10")); 1388 entry_->collection()->addField(f); 1389 } 1390 1391 QString text = divRx.cap(0); 1392 text.remove(*s_tagRx); 1393 1394 QRegExp ratingRx(QStringLiteral("\\s(\\d+.?\\d*)\\s")); 1395 if(ratingRx.indexIn(text) > -1) { 1396 bool ok; 1397 float value = ratingRx.cap(1).toFloat(&ok); 1398 if(!ok) { 1399 value = QLocale().toFloat(ratingRx.cap(1), &ok); 1400 } 1401 if(ok) { 1402 entry_->setField(QStringLiteral("imdb-rating"), QString::number(value)); 1403 } 1404 } 1405 } 1406 } 1407 1408 void IMDBFetcher::doCover(const QString& str_, Tellico::Data::EntryPtr entry_, const QUrl& baseURL_) { 1409 QRegExp imgRx(QStringLiteral("<img\\s+[^>]*src\\s*=\\s*\"([^\"]*)\"[^>]*>"), Qt::CaseInsensitive); 1410 imgRx.setMinimal(true); 1411 1412 QRegExp posterRx(QStringLiteral("<a\\s+[^>]*name\\s*=\\s*\"poster\"[^>]*>(.*)</a>"), Qt::CaseInsensitive); 1413 posterRx.setMinimal(true); 1414 1415 const QString cover = QStringLiteral("cover"); 1416 1417 int pos = posterRx.indexIn(str_); 1418 while(pos > -1) { 1419 if(posterRx.cap(1).contains(imgRx)) { 1420 QUrl u = QUrl(baseURL_).resolved(QUrl(imgRx.cap(1))); 1421 QString id = ImageFactory::addImage(u, true); 1422 if(!id.isEmpty()) { 1423 entry_->setField(cover, id); 1424 return; 1425 } 1426 } 1427 pos = posterRx.indexIn(str_, pos+posterRx.matchedLength()); 1428 } 1429 1430 // <link rel='image_src' 1431 const QRegularExpression linkRx(QStringLiteral("<link (.+?)>")); 1432 const QRegularExpression hrefRx(QStringLiteral("href=['\"](.+?)['\"]")); 1433 1434 const QString src = QStringLiteral("image_src"); 1435 auto i = linkRx.globalMatch(str_); 1436 while(i.hasNext()) { 1437 auto match = i.next(); 1438 const auto tag = match.capturedRef(1); 1439 if(tag.contains(src, Qt::CaseInsensitive)) { 1440 auto hrefMatch = hrefRx.match(tag); 1441 if(hrefMatch.hasMatch()) { 1442 QUrl u = QUrl(baseURL_).resolved(QUrl(hrefMatch.captured(1))); 1443 // imdb uses amazon media image, where the img src "encodes" requests for image sizing and cropping 1444 // strip everything after the "@." and add UY64 to limit the max image dimension to 640 1445 int n = u.url().indexOf(QStringLiteral("@.")); 1446 if(n > -1) { 1447 const QString newLink = u.url().left(n) + QStringLiteral("@.UY640.jpg"); 1448 const QString id = ImageFactory::addImage(QUrl(newLink), true); 1449 if(!id.isEmpty()) { 1450 entry_->setField(cover, id); 1451 return; 1452 } 1453 } 1454 const QString id = ImageFactory::addImage(u, true); 1455 if(!id.isEmpty()) { 1456 entry_->setField(cover, id); 1457 return; 1458 } 1459 } 1460 } 1461 } 1462 1463 // <img alt="poster" 1464 posterRx.setPattern(QStringLiteral("<img\\s+[^>]*alt\\s*=\\s*\"poster\"[^>]+src\\s*=\\s*\"([^\"]+)\"")); 1465 pos = posterRx.indexIn(str_); 1466 if(pos > -1) { 1467 QUrl u = QUrl(baseURL_).resolved(QUrl(posterRx.cap(1))); 1468 QString id = ImageFactory::addImage(u, true); 1469 if(!id.isEmpty()) { 1470 entry_->setField(cover, id); 1471 return; 1472 } 1473 } 1474 1475 // didn't find the cover, IMDb also used to put "cover" inside the url 1476 // cover is the img with the "cover" alt text 1477 pos = imgRx.indexIn(str_); 1478 while(pos > -1) { 1479 const QString url = imgRx.cap(0).toLower(); 1480 if(url.contains(cover)) { 1481 QUrl u = QUrl(baseURL_).resolved(QUrl(imgRx.cap(1))); 1482 QString id = ImageFactory::addImage(u, true); 1483 if(!id.isEmpty()) { 1484 entry_->setField(cover, id); 1485 return; 1486 } 1487 } 1488 pos = imgRx.indexIn(str_, pos+imgRx.matchedLength()); 1489 } 1490 } 1491 1492 void IMDBFetcher::doLists2(const QString& str_, Tellico::Data::EntryPtr entry_) { 1493 QRegExp divInfoRx(QStringLiteral("<li role=\"presentation\".*>(.*)</div"), Qt::CaseInsensitive); 1494 divInfoRx.setMinimal(true); 1495 1496 const LangData& data = langData(m_lang); 1497 1498 QStringList genres, countries, langs, certs, tracks; 1499 for(int pos = divInfoRx.indexIn(str_); pos > -1; pos = divInfoRx.indexIn(str_, pos+divInfoRx.matchedLength())) { 1500 QString divMatch = divInfoRx.cap(1); 1501 int pos2 = 0; 1502 if((pos2=s_anchorRx->indexIn(divMatch)) == -1) continue; 1503 const QString text = divMatch.remove(*s_tagRx); 1504 QString value = s_anchorRx->cap(2); 1505 1506 if(text.startsWith(data.genre)) { 1507 foreach(const QString& token, value.split(QLatin1Char('|'))) { 1508 genres << token.trimmed(); 1509 } 1510 } else if(text.startsWith(data.language)) { 1511 foreach(const QString& token, value.split(QRegExp(QLatin1String("[,|]")))) { 1512 langs << token.trimmed(); 1513 } 1514 } else if(text.startsWith(data.sound)) { 1515 foreach(const QString& token, value.split(QLatin1Char('|'))) { 1516 tracks << token.trimmed(); 1517 } 1518 } else if(text.startsWith(data.country)) { 1519 countries << value; 1520 } else if(text.startsWith(data.certification)) { 1521 foreach(const QString& token, value.split(QLatin1Char('|'))) { 1522 certs << token.trimmed(); 1523 } 1524 } else if(text.startsWith(data.color)) { 1525 // cut off any parentheses 1526 value = value.section(QLatin1Char('('), 0, 0).trimmed(); 1527 // change "black and white" to "black & white" 1528 value.replace(QStringLiteral("and"), QStringLiteral("&")); 1529 if(value == data.color) { 1530 entry_->setField(QStringLiteral("color"), i18n("Color")); 1531 } else { 1532 entry_->setField(QStringLiteral("color"), value); 1533 } 1534 } 1535 } 1536 1537 if(!genres.isEmpty()) { 1538 entry_->setField(QStringLiteral("genre"), genres.join(FieldFormat::delimiterString())); 1539 } 1540 if(!countries.isEmpty()) { 1541 entry_->setField(QStringLiteral("nationality"), countries.join(FieldFormat::delimiterString())); 1542 } 1543 if(!langs.isEmpty()) { 1544 entry_->setField(QStringLiteral("language"), langs.join(FieldFormat::delimiterString())); 1545 } 1546 if(!tracks.isEmpty()) { 1547 entry_->setField(QStringLiteral("audio-track"), tracks.join(FieldFormat::delimiterString())); 1548 } 1549 if(!certs.isEmpty()) { 1550 // first try to set default certification 1551 const QStringList& certsAllowed = entry_->collection()->fieldByName(QStringLiteral("certification"))->allowed(); 1552 foreach(const QString& cert, certs) { 1553 QString country = cert.section(QLatin1Char(':'), 0, 0); 1554 QString lcert = cert.section(QLatin1Char(':'), 1, 1); 1555 if(lcert == QStringLiteral("Unrated")) { 1556 lcert = QLatin1Char('U'); 1557 } 1558 lcert += QStringLiteral(" (") + country + QLatin1Char(')'); 1559 if(certsAllowed.contains(lcert)) { 1560 entry_->setField(QStringLiteral("certification"), lcert); 1561 break; 1562 } 1563 } 1564 1565 // now add new field for all certifications 1566 const QString allc = QStringLiteral("allcertification"); 1567 if(optionalFields().contains(allc)) { 1568 Data::FieldPtr f = entry_->collection()->fieldByName(allc); 1569 if(!f) { 1570 f = new Data::Field(allc, i18n("Certifications"), Data::Field::Table); 1571 f->setFlags(Data::Field::AllowGrouped); 1572 entry_->collection()->addField(f); 1573 } 1574 entry_->setField(QStringLiteral("allcertification"), certs.join(FieldFormat::rowDelimiterString())); 1575 } 1576 } 1577 } 1578 1579 // look at every anchor tag in the string 1580 void IMDBFetcher::doLists(const QString& str_, Tellico::Data::EntryPtr entry_) { 1581 const QString genre = QStringLiteral("/Genres/"); 1582 const QString genre2 = QStringLiteral("/genre/"); 1583 const QString country = QStringLiteral("/country/"); 1584 const QString lang = QStringLiteral("/language/"); 1585 const QString colorInfo = QStringLiteral("colors="); 1586 const QString cert = QStringLiteral("certificates="); 1587 const QString soundMix = QStringLiteral("sound_mixes="); 1588 const QString year = QStringLiteral("/Years/"); 1589 1590 // if we reach faqs or user comments, we can stop 1591 const QString faqs = QStringLiteral("/faq"); 1592 const QString users = QStringLiteral("/user/"); 1593 // IMdb also has links with the word "sections" in them, remove that 1594 // for genres and nationalities 1595 1596 int startPos = str_.indexOf(QStringLiteral("<div id=\"pagecontent\">")); 1597 if(startPos == -1) { 1598 startPos = 0; 1599 } 1600 1601 QStringList genres, countries, langs, certs, tracks; 1602 for(int pos = s_anchorRx->indexIn(str_, startPos); pos > -1; pos = s_anchorRx->indexIn(str_, pos+s_anchorRx->matchedLength())) { 1603 const QString cap1 = s_anchorRx->cap(1); 1604 if(cap1.contains(genre) || cap1.contains(genre2)) { 1605 const QString g = s_anchorRx->cap(2); 1606 if(!g.contains(QStringLiteral(" section"), Qt::CaseInsensitive) && 1607 !g.contains(QStringLiteral(" genre"), Qt::CaseInsensitive)) { 1608 // ignore "Most Popular by Genre" 1609 genres += g.trimmed(); 1610 } 1611 } else if(cap1.contains(country)) { 1612 if(!s_anchorRx->cap(2).contains(QStringLiteral(" section"), Qt::CaseInsensitive)) { 1613 countries += s_anchorRx->cap(2).trimmed(); 1614 } 1615 } else if(cap1.contains(lang) && !cap1.contains(QStringLiteral("contribute"))) { 1616 langs += s_anchorRx->cap(2).trimmed(); 1617 } else if(cap1.contains(colorInfo)) { 1618 QString value = s_anchorRx->cap(2); 1619 // cut off any parentheses 1620 value = value.section(QLatin1Char('('), 0, 0).trimmed(); 1621 // change "black and white" to "black & white" 1622 value.replace(QStringLiteral("and"), QStringLiteral("&")); 1623 entry_->setField(QStringLiteral("color"), value.trimmed()); 1624 } else if(cap1.contains(cert)) { 1625 certs += s_anchorRx->cap(2).trimmed(); 1626 } else if(cap1.contains(soundMix)) { 1627 tracks += s_anchorRx->cap(2).trimmed(); 1628 // if year field wasn't set before, do it now 1629 } else if(entry_->field(QStringLiteral("year")).isEmpty() && cap1.contains(year)) { 1630 entry_->setField(QStringLiteral("year"), s_anchorRx->cap(2).trimmed()); 1631 } else if((cap1.contains(faqs) || cap1.contains(users)) && !genres.isEmpty()) { 1632 break; 1633 } 1634 } 1635 1636 // since we have multiple genre search strings 1637 genres.removeDuplicates(); 1638 1639 entry_->setField(QStringLiteral("genre"), genres.join(FieldFormat::delimiterString())); 1640 entry_->setField(QStringLiteral("nationality"), countries.join(FieldFormat::delimiterString())); 1641 entry_->setField(QStringLiteral("language"), langs.join(FieldFormat::delimiterString())); 1642 entry_->setField(QStringLiteral("audio-track"), tracks.join(FieldFormat::delimiterString())); 1643 if(!certs.isEmpty()) { 1644 // first try to set default certification 1645 const QStringList& certsAllowed = entry_->collection()->fieldByName(QStringLiteral("certification"))->allowed(); 1646 foreach(const QString& cert, certs) { 1647 QString country = cert.section(QLatin1Char(':'), 0, 0); 1648 if(country == QStringLiteral("United States")) { 1649 country = QStringLiteral("USA"); 1650 } 1651 QString lcert = cert.section(QLatin1Char(':'), 1, 1); 1652 if(lcert == QStringLiteral("Unrated")) { 1653 lcert = QLatin1Char('U'); 1654 } 1655 lcert += QStringLiteral(" (") + country + QLatin1Char(')'); 1656 if(certsAllowed.contains(lcert)) { 1657 entry_->setField(QStringLiteral("certification"), lcert); 1658 break; 1659 } 1660 } 1661 1662 // now add new field for all certifications 1663 const QString allc = QStringLiteral("allcertification"); 1664 if(optionalFields().contains(allc)) { 1665 Data::FieldPtr f = entry_->collection()->fieldByName(allc); 1666 if(!f) { 1667 f = new Data::Field(allc, i18n("Certifications"), Data::Field::Table); 1668 f->setFlags(Data::Field::AllowGrouped); 1669 entry_->collection()->addField(f); 1670 } 1671 entry_->setField(QStringLiteral("allcertification"), certs.join(FieldFormat::rowDelimiterString())); 1672 } 1673 } 1674 } 1675 1676 void IMDBFetcher::doEpisodes(const QString& str_, Tellico::Data::EntryPtr entry_, const QUrl& baseURL_) { 1677 if(!str_.contains(QStringLiteral("video.tv_show"))) { 1678 // depend on meta data to indicate TV series 1679 // should include <meta property='og:type' content="video.tv_show" /> in the reference view 1680 return; 1681 } 1682 const QString episode = QStringLiteral("episode"); 1683 if(!entry_->collection()->hasField(episode)) { 1684 entry_->collection()->addField(Data::Field::createDefaultField(Data::Field::EpisodeField)); 1685 } 1686 1687 int currentSeason = 1; 1688 int totalSeasons = -1; 1689 QStringList episodes; 1690 1691 // the episode list is on a separate page 1692 auto idMatch = s_titleIdRx->match(baseURL_.path()); 1693 Q_ASSERT(idMatch.hasMatch()); 1694 1695 const QRegularExpression episodeRx(QStringLiteral("itemtype=\"http://schema.org/TVEpisode\"")); 1696 const QRegularExpression anchorEpisodeRx(QStringLiteral("<a href=\"/title/.+?_ep(\\d+)\"\\s+title=\"(.+?)\""), 1697 QRegularExpression::DotMatchesEverythingOption); 1698 QUrl episodeUrl = baseURL_; 1699 episodeUrl.setPath(QStringLiteral("/title/") + idMatch.captured(1) + QStringLiteral("/episodes/_ajax")); 1700 QUrlQuery q; 1701 // loop over the total number of seasons 1702 do { 1703 q.clear(); 1704 q.addQueryItem(QLatin1String("season"), QString::number(currentSeason)); 1705 episodeUrl.setQuery(q); 1706 1707 QPointer<KIO::StoredTransferJob> getJob = KIO::storedGet(episodeUrl, KIO::NoReload, KIO::HideProgressInfo); 1708 configureJob(getJob); 1709 if(!getJob->exec()) { 1710 myWarning() << "...unable to read" << episodeUrl; 1711 } 1712 const QString episodeText = Tellico::fromHtmlData(getJob->data(), "UTF-8"); 1713 #if 0 1714 myWarning() << "Remove debug from imdbfetcher.cpp (/tmp/testimdbepisodes.html)"; 1715 QFile f(QString::fromLatin1("/tmp/testimdbepisodes.html")); 1716 if(f.open(QIODevice::WriteOnly)) { 1717 QTextStream t(&f); 1718 t << castPage; 1719 } 1720 f.close(); 1721 #endif 1722 1723 if(totalSeasons == -1) { 1724 // assume never more than 99 seasons, alternative is 4-digit years 1725 static const QRegularExpression optionRx(QStringLiteral("<option\\s+value=\"(\\d\\d?)\"")); 1726 auto iOption = optionRx.globalMatch(episodeText); 1727 while(iOption.hasNext()) { 1728 auto optionMatch = iOption.next(); 1729 const int value = optionMatch.captured(1).toInt(); 1730 if(value > totalSeasons) totalSeasons = value; 1731 } 1732 totalSeasons = qMin(totalSeasons, IMDB_MAX_SEASON_COUNT); 1733 // ok if totalSeasons remains == -1 1734 // myDebug() << "Total seasons:" << totalSeasons; 1735 } 1736 1737 auto i = episodeRx.globalMatch(episodeText); 1738 while(i.hasNext()) { 1739 auto match = i.next(); 1740 auto anchorMatch = anchorEpisodeRx.match(episodeText, match.capturedEnd()); 1741 if(anchorMatch.hasMatch()) { 1742 // myDebug() << "found episode" << anchorMatch.captured(1) << anchorMatch.captured(2); 1743 episodes << anchorMatch.captured(2) + FieldFormat::columnDelimiterString() + 1744 QString::number(currentSeason) + FieldFormat::columnDelimiterString() + 1745 anchorMatch.captured(1); 1746 } 1747 } 1748 ++currentSeason; 1749 } while (totalSeasons > 0 && currentSeason < totalSeasons); 1750 1751 entry_->setField(episode, episodes.join(FieldFormat::rowDelimiterString())); 1752 } 1753 1754 Tellico::Fetch::FetchRequest IMDBFetcher::updateRequest(Data::EntryPtr entry_) { 1755 QUrl link = QUrl::fromUserInput(entry_->field(QStringLiteral("imdb"))); 1756 1757 if(!link.isEmpty() && link.isValid()) { 1758 if(link.host() != m_host) { 1759 // myLog() << "switching hosts to " << m_host; 1760 link.setHost(m_host); 1761 } 1762 return FetchRequest(Fetch::Raw, link.url()); 1763 } 1764 1765 // optimistically try searching for title and rely on Collection::sameEntry() to figure things out 1766 const QString t = entry_->field(QStringLiteral("title")); 1767 if(!t.isEmpty()) { 1768 return FetchRequest(Fetch::Title, t); 1769 } 1770 return FetchRequest(); 1771 } 1772 1773 void IMDBFetcher::configureJob(QPointer<KIO::StoredTransferJob> job_) { 1774 KJobWidgets::setWindow(job_, GUI::Proxy::widget()); 1775 switch(m_lang) { 1776 case EN: 1777 job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("en-US")); break; 1778 case FR: 1779 job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("fr-FR")); break; 1780 case ES: 1781 job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("es-ES")); break; 1782 case DE: 1783 job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("de-DE")); break; 1784 case IT: 1785 job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("it-IT")); break; 1786 case PT: 1787 job_->addMetaData(QStringLiteral("Languages"), QStringLiteral("pt-PT")); break; 1788 } 1789 } 1790 1791 QString IMDBFetcher::defaultName() { 1792 return i18n("Internet Movie Database"); 1793 } 1794 1795 QString IMDBFetcher::defaultIcon() { 1796 return favIcon("https://www.imdb.com"); 1797 } 1798 1799 //static 1800 Tellico::StringHash IMDBFetcher::allOptionalFields() { 1801 StringHash hash; 1802 hash[QStringLiteral("imdb")] = i18n("IMDb Link"); 1803 hash[QStringLiteral("imdb-rating")] = i18n("IMDb Rating"); 1804 hash[QStringLiteral("alttitle")] = i18n("Alternative Titles"); 1805 hash[QStringLiteral("allcertification")] = i18n("Certifications"); 1806 hash[QStringLiteral("origtitle")] = i18n("Original Title"); 1807 hash[QStringLiteral("episode")] = i18n("Episodes"); 1808 return hash; 1809 } 1810 1811 Tellico::Fetch::ConfigWidget* IMDBFetcher::configWidget(QWidget* parent_) const { 1812 return new IMDBFetcher::ConfigWidget(parent_, this); 1813 } 1814 1815 IMDBFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const IMDBFetcher* fetcher_/*=0*/) 1816 : Fetch::ConfigWidget(parent_) { 1817 QGridLayout* l = new QGridLayout(optionsWidget()); 1818 l->setSpacing(4); 1819 l->setColumnStretch(1, 10); 1820 1821 int row = -1; 1822 1823 QLabel* label = new QLabel(i18n("&Maximum cast: "), optionsWidget()); 1824 l->addWidget(label, ++row, 0); 1825 m_numCast = new QSpinBox(optionsWidget()); 1826 m_numCast->setMaximum(99); 1827 m_numCast->setMinimum(0); 1828 m_numCast->setValue(IMDB_DEFAULT_CAST_SIZE); 1829 #if (QT_VERSION < QT_VERSION_CHECK(5, 14, 0)) 1830 void (QSpinBox::* textChanged)(const QString&) = &QSpinBox::valueChanged; 1831 #else 1832 void (QSpinBox::* textChanged)(const QString&) = &QSpinBox::textChanged; 1833 #endif 1834 connect(m_numCast, textChanged, this, &ConfigWidget::slotSetModified); 1835 l->addWidget(m_numCast, row, 1); 1836 QString w = i18n("The list of cast members may include many people. Set the maximum number returned from the search."); 1837 label->setWhatsThis(w); 1838 m_numCast->setWhatsThis(w); 1839 label->setBuddy(m_numCast); 1840 1841 m_fetchImageCheck = new QCheckBox(i18n("Download cover &image"), optionsWidget()); 1842 connect(m_fetchImageCheck, &QAbstractButton::clicked, this, &ConfigWidget::slotSetModified); 1843 ++row; 1844 l->addWidget(m_fetchImageCheck, row, 0, 1, 2); 1845 w = i18n("The cover image may be downloaded as well. However, too many large images in the " 1846 "collection may degrade performance."); 1847 m_fetchImageCheck->setWhatsThis(w); 1848 1849 l->setRowStretch(++row, 10); 1850 1851 // now add additional fields widget 1852 addFieldsWidget(IMDBFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList()); 1853 KAcceleratorManager::manage(optionsWidget()); 1854 1855 if(fetcher_) { 1856 m_numCast->setValue(fetcher_->m_numCast); 1857 m_fetchImageCheck->setChecked(fetcher_->m_fetchImages); 1858 } else { //defaults 1859 m_fetchImageCheck->setChecked(true); 1860 } 1861 } 1862 1863 void IMDBFetcher::ConfigWidget::saveConfigHook(KConfigGroup& config_) { 1864 config_.writeEntry("Host", QString()); // clear old host entry 1865 config_.writeEntry("Max Cast", m_numCast->value()); 1866 config_.writeEntry("Fetch Images", m_fetchImageCheck->isChecked()); 1867 } 1868 1869 QString IMDBFetcher::ConfigWidget::preferredName() const { 1870 return IMDBFetcher::langData(EN).siteTitle; 1871 } 1872 1873 void IMDBFetcher::ConfigWidget::slotSiteChanged() { 1874 emit signalName(preferredName()); 1875 }