File indexing completed on 2024-05-12 05:09:38
0001 /*************************************************************************** 0002 Copyright (C) 2017 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "kinoteatrfetcher.h" 0026 #include "../utils/guiproxy.h" 0027 #include "../utils/string_utils.h" 0028 #include "../collections/videocollection.h" 0029 #include "../entry.h" 0030 #include "../fieldformat.h" 0031 #include "../core/filehandler.h" 0032 #include "../images/imagefactory.h" 0033 #include "../tellico_debug.h" 0034 0035 #include <KLocalizedString> 0036 #include <KIO/Job> 0037 #include <KJobUiDelegate> 0038 #include <KJobWidgets/KJobWidgets> 0039 0040 #include <QRegularExpression> 0041 #include <QLabel> 0042 #include <QFile> 0043 #include <QTextStream> 0044 #include <QVBoxLayout> 0045 #include <QUrlQuery> 0046 0047 namespace { 0048 static const char* KINOTEATR_SEARCH_URL = "https://kino-teatr.ua/uk/main/films.phtml"; 0049 } 0050 0051 using namespace Tellico; 0052 using Tellico::Fetch::KinoTeatrFetcher; 0053 0054 KinoTeatrFetcher::KinoTeatrFetcher(QObject* parent_) 0055 : Fetcher(parent_), m_started(false) { 0056 } 0057 0058 KinoTeatrFetcher::~KinoTeatrFetcher() { 0059 } 0060 0061 QString KinoTeatrFetcher::source() const { 0062 return m_name.isEmpty() ? defaultName() : m_name; 0063 } 0064 0065 bool KinoTeatrFetcher::canFetch(int type) const { 0066 return type == Data::Collection::Video; 0067 } 0068 0069 bool KinoTeatrFetcher::canSearch(Fetch::FetchKey k) const { 0070 return k == Title; 0071 } 0072 0073 void KinoTeatrFetcher::readConfigHook(const KConfigGroup& config_) { 0074 Q_UNUSED(config_); 0075 } 0076 0077 void KinoTeatrFetcher::search() { 0078 m_started = true; 0079 m_matches.clear(); 0080 0081 QUrl u(QString::fromLatin1(KINOTEATR_SEARCH_URL)); 0082 QUrlQuery q; 0083 0084 switch(request().key()) { 0085 case Title: 0086 // TODO: allow year in search query and parse it out? 0087 //q.addQueryItem(QStringLiteral("year"), QStringLiteral("yes")); 0088 q.addQueryItem(QStringLiteral("title"), request().value()); 0089 break; 0090 0091 default: 0092 myWarning() << source() << "- key not recognized:" << request().key(); 0093 stop(); 0094 return; 0095 } 0096 u.setQuery(q); 0097 // myDebug() << "url: " << u.url(); 0098 0099 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0100 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0101 connect(m_job.data(), &KJob::result, this, &KinoTeatrFetcher::slotComplete); 0102 } 0103 0104 void KinoTeatrFetcher::stop() { 0105 if(!m_started) { 0106 return; 0107 } 0108 0109 if(m_job) { 0110 m_job->kill(); 0111 m_job = nullptr; 0112 } 0113 m_started = false; 0114 emit signalDone(this); 0115 } 0116 0117 void KinoTeatrFetcher::slotComplete(KJob*) { 0118 if(m_job->error()) { 0119 m_job->uiDelegate()->showErrorMessage(); 0120 stop(); 0121 return; 0122 } 0123 0124 QByteArray data = m_job->data(); 0125 if(data.isEmpty()) { 0126 myDebug() << "no data"; 0127 stop(); 0128 return; 0129 } 0130 0131 const QString output = Tellico::decodeHTML(data); 0132 #if 0 0133 myWarning() << "Remove debug from kinoteatrfetcher.cpp"; 0134 QFile f(QStringLiteral("/tmp/test1.html")); 0135 if(f.open(QIODevice::WriteOnly)) { 0136 QTextStream t(&f); 0137 t.setCodec("UTF-8"); 0138 t << output; 0139 } 0140 f.close(); 0141 #endif 0142 0143 // look for a specific div, with an href and title, sometime uses single-quote, sometimes double-quotes 0144 QRegularExpression resultRx(QStringLiteral("<a class=\"uk-margin-small-bottom\" href=\"(.+?)\".+?</a>"), 0145 QRegularExpression::DotMatchesEverythingOption); 0146 QRegularExpression titleRx(QStringLiteral("<h2 class=\"uk-h4\">(.+?)</")); 0147 // the year is within the searchItemText as a 4-digit number, starting with 1 or 2 0148 QRegularExpression yearRx(QStringLiteral(" ([12]\\d\\d\\d)[ \"]")); 0149 0150 QString href, title, year; 0151 QRegularExpressionMatchIterator i = resultRx.globalMatch(output); 0152 while(i.hasNext() && m_started) { 0153 QRegularExpressionMatch topMatch = i.next(); 0154 const QString resultText = topMatch.captured(); 0155 href = topMatch.captured(1); 0156 QRegularExpressionMatch match = titleRx.match(resultText); 0157 if(match.hasMatch()) { 0158 title = match.captured(1); 0159 } 0160 // there can be multiple 0161 match = yearRx.match(resultText); 0162 if(match.hasMatch()) { 0163 year = match.captured(1); 0164 } 0165 if(!href.isEmpty()) { 0166 QUrl url(QString::fromLatin1(KINOTEATR_SEARCH_URL)); 0167 url = url.resolved(QUrl(href)); 0168 // myDebug() << url << title << year; 0169 FetchResult* r = new FetchResult(this, title, year); 0170 m_matches.insert(r->uid, url); 0171 emit signalResultFound(r); 0172 } 0173 } 0174 0175 // since the fetch is done, don't worry about holding the job pointer 0176 m_job = nullptr; 0177 stop(); 0178 } 0179 0180 Tellico::Data::EntryPtr KinoTeatrFetcher::fetchEntryHook(uint uid_) { 0181 // if we already grabbed this one, then just pull it out of the dict 0182 Data::EntryPtr entry = m_entries[uid_]; 0183 if(entry) { 0184 return entry; 0185 } 0186 0187 QUrl url = m_matches[uid_]; 0188 if(url.isEmpty()) { 0189 myWarning() << "no url in map"; 0190 return Data::EntryPtr(); 0191 } 0192 0193 const QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true)); 0194 if(results.isEmpty()) { 0195 myDebug() << "no text results"; 0196 return Data::EntryPtr(); 0197 } 0198 0199 #if 0 0200 myDebug() << url.url(); 0201 myWarning() << "Remove debug2 from kinoteatrfetcher.cpp"; 0202 QFile f(QStringLiteral("/tmp/test-kinoteatr.html")); 0203 if(f.open(QIODevice::WriteOnly)) { 0204 QTextStream t(&f); 0205 t.setCodec("UTF-8"); 0206 t << results; 0207 } 0208 f.close(); 0209 #endif 0210 0211 entry = parseEntry(results); 0212 if(!entry) { 0213 myDebug() << "error in processing entry"; 0214 return Data::EntryPtr(); 0215 } 0216 0217 QString newPath(url.path()); 0218 newPath.replace(QLatin1String("/film/"), QLatin1String("/film-persons/")); 0219 QUrl personUrl(url); 0220 personUrl.setPath(newPath); 0221 0222 const QString personsText = Tellico::decodeHTML(FileHandler::readDataFile(personUrl, true)); 0223 if(!personsText.isEmpty()) { 0224 parsePeople(entry, personsText); 0225 #if 0 0226 myWarning() << "Remove persons debug from kinoteatrfetcher.cpp"; 0227 myDebug() << personUrl.url(); 0228 QFile f2(QStringLiteral("/tmp/test-kinoteatr-persons.html")); 0229 if(f2.open(QIODevice::WriteOnly)) { 0230 QTextStream t(&f2); 0231 t.setCodec("UTF-8"); 0232 t << personsText; 0233 } 0234 f2.close(); 0235 #endif 0236 } 0237 0238 if(optionalFields().contains(QStringLiteral("kinoteatr"))) { 0239 Data::FieldPtr field(new Data::Field(QStringLiteral("kinoteatr"), i18n("Kino-Teatr Link"), Data::Field::URL)); 0240 field->setCategory(i18n("General")); 0241 entry->collection()->addField(field); 0242 entry->setField(QStringLiteral("kinoteatr"), url.url()); 0243 } 0244 0245 m_entries.insert(uid_, entry); // keep for later 0246 return entry; 0247 } 0248 0249 Tellico::Data::EntryPtr KinoTeatrFetcher::parseEntry(const QString& str_) { 0250 Data::CollPtr coll(new Data::VideoCollection(true)); 0251 Data::EntryPtr entry(new Data::Entry(coll)); 0252 coll->addEntries(entry); 0253 0254 const QRegularExpression tagRx(QLatin1String("<.*?>")); 0255 const QRegularExpression anchorRx(QStringLiteral("<a.+?href=[\"'].+?[\"'].*?>(.*?)</")); 0256 0257 QRegularExpression titleRx(QStringLiteral("<span itemprop=[\"']name[\"']>(.+?)</span")); 0258 QRegularExpressionMatch match = titleRx.match(str_); 0259 if(match.hasMatch()) { 0260 entry->setField(QStringLiteral("title"), match.captured(1).simplified()); 0261 } 0262 0263 if(optionalFields().contains(QStringLiteral("origtitle"))) { 0264 Data::FieldPtr f(new Data::Field(QStringLiteral("origtitle"), i18n("Original Title"))); 0265 f->setFormatType(FieldFormat::FormatTitle); 0266 coll->addField(f); 0267 0268 QRegularExpression origTitleRx(QStringLiteral("itemprop=\"alternativeHeadline\".*?>(.+?)</")); 0269 match = origTitleRx.match(str_); 0270 if(match.hasMatch()) { 0271 entry->setField(QStringLiteral("origtitle"), match.captured(1).simplified()); 0272 } 0273 } 0274 0275 QRegularExpression yearRx(QStringLiteral("Рік:.*?([12]\\d\\d\\d).*?</a"), 0276 QRegularExpression::DotMatchesEverythingOption); 0277 match = yearRx.match(str_); 0278 if(match.hasMatch()) { 0279 entry->setField(QStringLiteral("year"), match.captured(1)); 0280 } 0281 0282 QRegularExpression countryRx(QStringLiteral("Країна:(.*?)<br"), 0283 QRegularExpression::DotMatchesEverythingOption); 0284 match = countryRx.match(str_); 0285 if(match.hasMatch()) { 0286 const QString innerText = match.captured(1); 0287 QStringList countries; 0288 QRegularExpressionMatchIterator i = anchorRx.globalMatch(innerText); 0289 while(i.hasNext()) { 0290 match = i.next(); 0291 const QString s = match.captured(1).simplified(); 0292 if(!s.isEmpty()) { 0293 countries += s; 0294 } 0295 } 0296 if(!countries.isEmpty()) { 0297 countries.removeDuplicates(); 0298 entry->setField(QStringLiteral("nationality"), countries.join(Tellico::FieldFormat::delimiterString())); 0299 } 0300 } 0301 0302 QRegularExpression genreRx(QStringLiteral("itemprop=\"genre\">(.*?)<br"), 0303 QRegularExpression::DotMatchesEverythingOption); 0304 match = genreRx.match(str_); 0305 if(match.hasMatch()) { 0306 const QString innerText = match.captured(1); 0307 QStringList genres; 0308 QRegularExpressionMatchIterator i = anchorRx.globalMatch(innerText); 0309 while(i.hasNext()) { 0310 match = i.next(); 0311 const QString s = match.captured(1).simplified(); 0312 if(!s.isEmpty()) { 0313 genres += s; 0314 } 0315 } 0316 if(!genres.isEmpty()) { 0317 genres.removeDuplicates(); 0318 entry->setField(QStringLiteral("genre"), genres.join(Tellico::FieldFormat::delimiterString())); 0319 } 0320 } 0321 0322 QRegularExpression directorRx(QStringLiteral("itemprop=\"director\".*?>(.*?)<br"), 0323 QRegularExpression::DotMatchesEverythingOption); 0324 match = directorRx.match(str_); 0325 if(match.hasMatch()) { 0326 const QString innerText = match.captured(1); 0327 QStringList directors; 0328 QRegularExpressionMatchIterator i = anchorRx.globalMatch(innerText); 0329 while(i.hasNext()) { 0330 match = i.next(); 0331 QString s = match.captured(1).simplified(); 0332 if(!s.isEmpty()) { 0333 directors += s.remove(tagRx); 0334 } 0335 } 0336 if(!directors.isEmpty()) { 0337 entry->setField(QStringLiteral("director"), directors.join(Tellico::FieldFormat::delimiterString())); 0338 } 0339 } 0340 0341 QRegularExpression runtimeRx(QStringLiteral("Тривалість:.*?(\\d+).*?хв<br>"), 0342 QRegularExpression::DotMatchesEverythingOption); 0343 match = runtimeRx.match(str_); 0344 if(match.hasMatch()) { 0345 entry->setField(QStringLiteral("running-time"), match.captured(1)); 0346 } 0347 0348 QRegularExpression plotRx(QStringLiteral("itemprop=[\"']description[\"'].*?>(.+?)</div"), 0349 QRegularExpression::DotMatchesEverythingOption); 0350 match = plotRx.match(str_); 0351 if(match.hasMatch()) { 0352 entry->setField(QStringLiteral("plot"), Tellico::decodeHTML(match.captured(1).simplified())); 0353 } else { 0354 plotRx.setPattern(QStringLiteral("<meta name=\"og:description\" content=\"(.+?)\"")); 0355 match = plotRx.match(str_); 0356 if(match.hasMatch()) { 0357 entry->setField(QStringLiteral("plot"), Tellico::decodeHTML(match.captured(1))); 0358 } 0359 } 0360 0361 QString cover; 0362 QRegularExpression coverRx(QStringLiteral("<img\\s.*?src=[\"'](.+?)[\"'].+?itemprop=[\"']image[\"']")); 0363 match = coverRx.match(str_); 0364 if(match.hasMatch()) { 0365 cover = match.captured(1); 0366 } else { 0367 coverRx.setPattern(QStringLiteral("<meta property=\"og:image\" content=\"(.+?)\"")); 0368 match = coverRx.match(str_); 0369 if(match.hasMatch()) { 0370 cover = match.captured(1); 0371 } 0372 } 0373 if(!cover.isEmpty()) { 0374 // myDebug() << "cover:" << cover; 0375 const QString id = ImageFactory::addImage(QUrl::fromUserInput(cover), true /* quiet */); 0376 if(id.isEmpty()) { 0377 message(i18n("The cover image could not be loaded."), MessageHandler::Warning); 0378 } 0379 // empty image ID is ok 0380 entry->setField(QStringLiteral("cover"), id); 0381 } 0382 0383 return entry; 0384 } 0385 0386 void KinoTeatrFetcher::parsePeople(Data::EntryPtr entry_, const QString& str_) { 0387 if(!entry_) { 0388 myDebug() << "no entry"; 0389 return; 0390 } 0391 0392 QRegularExpression nameDivRx(QStringLiteral("<div.*?>(.+?)</div"), 0393 QRegularExpression::DotMatchesEverythingOption); 0394 QRegularExpression anchorRx(QStringLiteral("<a[^>]+?person[^>]+?>(.+?)</a")); 0395 QRegularExpression roleRx(QStringLiteral("<br>(.+?)$")); 0396 0397 QRegularExpression castRx(QStringLiteral("Актори(.+?)<(header|/section)"), 0398 QRegularExpression::DotMatchesEverythingOption); 0399 auto match = castRx.match(str_); 0400 if(match.hasMatch()) { 0401 const QString innerText = match.captured(1); 0402 QStringList actors, roles; 0403 auto i = nameDivRx.globalMatch(innerText); 0404 while(i.hasNext()) { 0405 match = i.next(); 0406 QRegularExpressionMatch anchorMatch = anchorRx.match(match.captured(1)); 0407 if(anchorMatch.hasMatch()) { 0408 actors += anchorMatch.captured(1).simplified(); 0409 auto roleMatch = roleRx.match(match.captured(1)); 0410 roles += roleMatch.hasMatch() ? roleMatch.captured(1).simplified() : QString(); 0411 } 0412 } 0413 // interleave actors and roles 0414 QStringList cast; 0415 for(int i = 0; i< actors.length(); ++i) { 0416 QString row = actors.at(i); 0417 if(!roles.at(i).isEmpty()) { 0418 row += FieldFormat::columnDelimiterString() + roles.at(i); 0419 } 0420 cast += row; 0421 } 0422 if(!cast.isEmpty()) { 0423 // myDebug() << cast; 0424 entry_->setField(QStringLiteral("cast"), cast.join(FieldFormat::rowDelimiterString())); 0425 } 0426 } 0427 0428 QRegularExpression writerRx(QStringLiteral("Сценаристи(.+?)<(header|/section)"), 0429 QRegularExpression::DotMatchesEverythingOption); 0430 match = writerRx.match(str_); 0431 if(match.hasMatch()) { 0432 const QString innerText = match.captured(1); 0433 QStringList writers; 0434 auto i = nameDivRx.globalMatch(innerText); 0435 while(i.hasNext()) { 0436 match = i.next(); 0437 auto anchorMatch = anchorRx.match(match.captured(1)); 0438 if(anchorMatch.hasMatch()) { 0439 writers += anchorMatch.captured(1).simplified(); 0440 } 0441 } 0442 if(!writers.isEmpty()) { 0443 entry_->setField(QStringLiteral("writer"), writers.join(FieldFormat::delimiterString())); 0444 } 0445 } 0446 0447 QRegularExpression producerRx(QStringLiteral("Продюсери(.+?)<(header|/section)"), 0448 QRegularExpression::DotMatchesEverythingOption); 0449 match = producerRx.match(str_); 0450 if(match.hasMatch()) { 0451 const QString innerText = match.captured(1); 0452 QStringList producers; 0453 auto i = nameDivRx.globalMatch(innerText); 0454 while(i.hasNext()) { 0455 match = i.next(); 0456 auto anchorMatch = anchorRx.match(match.captured(1)); 0457 if(anchorMatch.hasMatch()) { 0458 producers += anchorMatch.captured(1).simplified(); 0459 } 0460 } 0461 if(!producers.isEmpty()) { 0462 entry_->setField(QStringLiteral("producer"), producers.join(FieldFormat::delimiterString())); 0463 } 0464 } 0465 0466 QRegularExpression composerRx(QStringLiteral("Композитори(.+?)<(header|/section)"), 0467 QRegularExpression::DotMatchesEverythingOption); 0468 match = composerRx.match(str_); 0469 if(match.hasMatch()) { 0470 const QString innerText = match.captured(1); 0471 QStringList composers; 0472 auto i = nameDivRx.globalMatch(innerText); 0473 while(i.hasNext()) { 0474 match = i.next(); 0475 auto anchorMatch = anchorRx.match(match.captured(1)); 0476 if(anchorMatch.hasMatch()) { 0477 composers += anchorMatch.captured(1).simplified(); 0478 } 0479 } 0480 if(!composers.isEmpty()) { 0481 entry_->setField(QStringLiteral("composer"), composers.join(FieldFormat::delimiterString())); 0482 } 0483 } 0484 } 0485 0486 Tellico::Fetch::FetchRequest KinoTeatrFetcher::updateRequest(Data::EntryPtr entry_) { 0487 QString t = entry_->field(QStringLiteral("title")); 0488 if(!t.isEmpty()) { 0489 return FetchRequest(Fetch::Title, t); 0490 } 0491 return FetchRequest(); 0492 } 0493 0494 Tellico::Fetch::ConfigWidget* KinoTeatrFetcher::configWidget(QWidget* parent_) const { 0495 return new KinoTeatrFetcher::ConfigWidget(parent_); 0496 } 0497 0498 QString KinoTeatrFetcher::defaultName() { 0499 return QStringLiteral("Кіно-Театр (kino-teatr.ua)"); 0500 } 0501 0502 QString KinoTeatrFetcher::defaultIcon() { 0503 return favIcon("https://kino-teatr.ua"); 0504 } 0505 0506 Tellico::StringHash KinoTeatrFetcher::allOptionalFields() { 0507 StringHash hash; 0508 hash[QStringLiteral("origtitle")] = i18n("Original Title"); 0509 hash[QStringLiteral("kinoteatr")] = i18n("Kino-Teatr Link"); 0510 return hash; 0511 } 0512 0513 KinoTeatrFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const KinoTeatrFetcher* fetcher_) 0514 : Fetch::ConfigWidget(parent_) { 0515 QVBoxLayout* l = new QVBoxLayout(optionsWidget()); 0516 l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget())); 0517 l->addStretch(); 0518 0519 addFieldsWidget(KinoTeatrFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList()); 0520 } 0521 0522 QString KinoTeatrFetcher::ConfigWidget::preferredName() const { 0523 return KinoTeatrFetcher::defaultName(); 0524 }