File indexing completed on 2024-05-12 05:09:38

0001 /***************************************************************************
0002     Copyright (C) 2017 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "kinoteatrfetcher.h"
0026 #include "../utils/guiproxy.h"
0027 #include "../utils/string_utils.h"
0028 #include "../collections/videocollection.h"
0029 #include "../entry.h"
0030 #include "../fieldformat.h"
0031 #include "../core/filehandler.h"
0032 #include "../images/imagefactory.h"
0033 #include "../tellico_debug.h"
0034 
0035 #include <KLocalizedString>
0036 #include <KIO/Job>
0037 #include <KJobUiDelegate>
0038 #include <KJobWidgets/KJobWidgets>
0039 
0040 #include <QRegularExpression>
0041 #include <QLabel>
0042 #include <QFile>
0043 #include <QTextStream>
0044 #include <QVBoxLayout>
0045 #include <QUrlQuery>
0046 
namespace {
  // URL of the kino-teatr.ua film search page. It is the target of title searches
  // and the base against which result links are resolved.
  // The pointer itself is const so the constant can not be re-seated by accident;
  // the anonymous namespace already gives it internal linkage.
  const char* const KINOTEATR_SEARCH_URL = "https://kino-teatr.ua/uk/main/films.phtml";
}
0050 
0051 using namespace Tellico;
0052 using Tellico::Fetch::KinoTeatrFetcher;
0053 
0054 KinoTeatrFetcher::KinoTeatrFetcher(QObject* parent_)
0055     : Fetcher(parent_), m_started(false) {
0056 }
0057 
0058 KinoTeatrFetcher::~KinoTeatrFetcher() {
0059 }
0060 
0061 QString KinoTeatrFetcher::source() const {
0062   return m_name.isEmpty() ? defaultName() : m_name;
0063 }
0064 
0065 bool KinoTeatrFetcher::canFetch(int type) const {
0066   return type == Data::Collection::Video;
0067 }
0068 
0069 bool KinoTeatrFetcher::canSearch(Fetch::FetchKey k) const {
0070   return k == Title;
0071 }
0072 
0073 void KinoTeatrFetcher::readConfigHook(const KConfigGroup& config_) {
0074   Q_UNUSED(config_);
0075 }
0076 
0077 void KinoTeatrFetcher::search() {
0078   m_started = true;
0079   m_matches.clear();
0080 
0081   QUrl u(QString::fromLatin1(KINOTEATR_SEARCH_URL));
0082   QUrlQuery q;
0083 
0084   switch(request().key()) {
0085     case Title:
0086       // TODO: allow year in search query and parse it out?
0087       //q.addQueryItem(QStringLiteral("year"), QStringLiteral("yes"));
0088       q.addQueryItem(QStringLiteral("title"), request().value());
0089       break;
0090 
0091     default:
0092       myWarning() << source() << "- key not recognized:" << request().key();
0093       stop();
0094       return;
0095   }
0096   u.setQuery(q);
0097 //  myDebug() << "url: " << u.url();
0098 
0099   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0100   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0101   connect(m_job.data(), &KJob::result, this, &KinoTeatrFetcher::slotComplete);
0102 }
0103 
0104 void KinoTeatrFetcher::stop() {
0105   if(!m_started) {
0106     return;
0107   }
0108 
0109   if(m_job) {
0110     m_job->kill();
0111     m_job = nullptr;
0112   }
0113   m_started = false;
0114   emit signalDone(this);
0115 }
0116 
0117 void KinoTeatrFetcher::slotComplete(KJob*) {
0118   if(m_job->error()) {
0119     m_job->uiDelegate()->showErrorMessage();
0120     stop();
0121     return;
0122   }
0123 
0124   QByteArray data = m_job->data();
0125   if(data.isEmpty()) {
0126     myDebug() << "no data";
0127     stop();
0128     return;
0129   }
0130 
0131   const QString output = Tellico::decodeHTML(data);
0132 #if 0
0133   myWarning() << "Remove debug from kinoteatrfetcher.cpp";
0134   QFile f(QStringLiteral("/tmp/test1.html"));
0135   if(f.open(QIODevice::WriteOnly)) {
0136     QTextStream t(&f);
0137     t.setCodec("UTF-8");
0138     t << output;
0139   }
0140   f.close();
0141 #endif
0142 
0143   // look for a specific div, with an href and title, sometime uses single-quote, sometimes double-quotes
0144   QRegularExpression resultRx(QStringLiteral("<a class=\"uk-margin-small-bottom\" href=\"(.+?)\".+?</a>"),
0145                               QRegularExpression::DotMatchesEverythingOption);
0146   QRegularExpression titleRx(QStringLiteral("<h2 class=\"uk-h4\">(.+?)</"));
0147   // the year is within the searchItemText as a 4-digit number, starting with 1 or 2
0148   QRegularExpression yearRx(QStringLiteral(" ([12]\\d\\d\\d)[ \"]"));
0149 
0150   QString href, title, year;
0151   QRegularExpressionMatchIterator i = resultRx.globalMatch(output);
0152   while(i.hasNext() && m_started) {
0153     QRegularExpressionMatch topMatch = i.next();
0154     const QString resultText = topMatch.captured();
0155     href = topMatch.captured(1);
0156     QRegularExpressionMatch match = titleRx.match(resultText);
0157     if(match.hasMatch()) {
0158       title = match.captured(1);
0159     }
0160     // there can be multiple
0161     match = yearRx.match(resultText);
0162     if(match.hasMatch()) {
0163       year = match.captured(1);
0164     }
0165     if(!href.isEmpty()) {
0166       QUrl url(QString::fromLatin1(KINOTEATR_SEARCH_URL));
0167       url = url.resolved(QUrl(href));
0168 //      myDebug() << url << title << year;
0169       FetchResult* r = new FetchResult(this, title, year);
0170       m_matches.insert(r->uid, url);
0171       emit signalResultFound(r);
0172     }
0173   }
0174 
0175   // since the fetch is done, don't worry about holding the job pointer
0176   m_job = nullptr;
0177   stop();
0178 }
0179 
0180 Tellico::Data::EntryPtr KinoTeatrFetcher::fetchEntryHook(uint uid_) {
0181   // if we already grabbed this one, then just pull it out of the dict
0182   Data::EntryPtr entry = m_entries[uid_];
0183   if(entry) {
0184     return entry;
0185   }
0186 
0187   QUrl url = m_matches[uid_];
0188   if(url.isEmpty()) {
0189     myWarning() << "no url in map";
0190     return Data::EntryPtr();
0191   }
0192 
0193   const QString results = Tellico::decodeHTML(FileHandler::readDataFile(url, true));
0194   if(results.isEmpty()) {
0195     myDebug() << "no text results";
0196     return Data::EntryPtr();
0197   }
0198 
0199 #if 0
0200   myDebug() << url.url();
0201   myWarning() << "Remove debug2 from kinoteatrfetcher.cpp";
0202   QFile f(QStringLiteral("/tmp/test-kinoteatr.html"));
0203   if(f.open(QIODevice::WriteOnly)) {
0204     QTextStream t(&f);
0205     t.setCodec("UTF-8");
0206     t << results;
0207   }
0208   f.close();
0209 #endif
0210 
0211   entry = parseEntry(results);
0212   if(!entry) {
0213     myDebug() << "error in processing entry";
0214     return Data::EntryPtr();
0215   }
0216 
0217   QString newPath(url.path());
0218   newPath.replace(QLatin1String("/film/"), QLatin1String("/film-persons/"));
0219   QUrl personUrl(url);
0220   personUrl.setPath(newPath);
0221 
0222   const QString personsText = Tellico::decodeHTML(FileHandler::readDataFile(personUrl, true));
0223   if(!personsText.isEmpty()) {
0224     parsePeople(entry, personsText);
0225 #if 0
0226     myWarning() << "Remove persons debug from kinoteatrfetcher.cpp";
0227     myDebug() << personUrl.url();
0228     QFile f2(QStringLiteral("/tmp/test-kinoteatr-persons.html"));
0229     if(f2.open(QIODevice::WriteOnly)) {
0230       QTextStream t(&f2);
0231       t.setCodec("UTF-8");
0232       t << personsText;
0233     }
0234     f2.close();
0235 #endif
0236   }
0237 
0238   if(optionalFields().contains(QStringLiteral("kinoteatr"))) {
0239     Data::FieldPtr field(new Data::Field(QStringLiteral("kinoteatr"), i18n("Kino-Teatr Link"), Data::Field::URL));
0240     field->setCategory(i18n("General"));
0241     entry->collection()->addField(field);
0242     entry->setField(QStringLiteral("kinoteatr"), url.url());
0243   }
0244 
0245   m_entries.insert(uid_, entry); // keep for later
0246   return entry;
0247 }
0248 
0249 Tellico::Data::EntryPtr KinoTeatrFetcher::parseEntry(const QString& str_) {
0250   Data::CollPtr coll(new Data::VideoCollection(true));
0251   Data::EntryPtr entry(new Data::Entry(coll));
0252   coll->addEntries(entry);
0253 
0254   const QRegularExpression tagRx(QLatin1String("<.*?>"));
0255   const QRegularExpression anchorRx(QStringLiteral("<a.+?href=[\"'].+?[\"'].*?>(.*?)</"));
0256 
0257   QRegularExpression titleRx(QStringLiteral("<span itemprop=[\"']name[\"']>(.+?)</span"));
0258   QRegularExpressionMatch match = titleRx.match(str_);
0259   if(match.hasMatch()) {
0260     entry->setField(QStringLiteral("title"), match.captured(1).simplified());
0261   }
0262 
0263   if(optionalFields().contains(QStringLiteral("origtitle"))) {
0264     Data::FieldPtr f(new Data::Field(QStringLiteral("origtitle"), i18n("Original Title")));
0265     f->setFormatType(FieldFormat::FormatTitle);
0266     coll->addField(f);
0267 
0268     QRegularExpression origTitleRx(QStringLiteral("itemprop=\"alternativeHeadline\".*?>(.+?)</"));
0269     match = origTitleRx.match(str_);
0270     if(match.hasMatch()) {
0271       entry->setField(QStringLiteral("origtitle"), match.captured(1).simplified());
0272     }
0273   }
0274 
0275   QRegularExpression yearRx(QStringLiteral("Рік:.*?([12]\\d\\d\\d).*?</a"),
0276                             QRegularExpression::DotMatchesEverythingOption);
0277   match = yearRx.match(str_);
0278   if(match.hasMatch()) {
0279     entry->setField(QStringLiteral("year"), match.captured(1));
0280   }
0281 
0282   QRegularExpression countryRx(QStringLiteral("Країна:(.*?)<br"),
0283                                QRegularExpression::DotMatchesEverythingOption);
0284   match = countryRx.match(str_);
0285   if(match.hasMatch()) {
0286     const QString innerText = match.captured(1);
0287     QStringList countries;
0288     QRegularExpressionMatchIterator i = anchorRx.globalMatch(innerText);
0289     while(i.hasNext()) {
0290       match = i.next();
0291       const QString s = match.captured(1).simplified();
0292       if(!s.isEmpty()) {
0293         countries += s;
0294       }
0295     }
0296     if(!countries.isEmpty()) {
0297       countries.removeDuplicates();
0298       entry->setField(QStringLiteral("nationality"), countries.join(Tellico::FieldFormat::delimiterString()));
0299     }
0300   }
0301 
0302   QRegularExpression genreRx(QStringLiteral("itemprop=\"genre\">(.*?)<br"),
0303                              QRegularExpression::DotMatchesEverythingOption);
0304   match = genreRx.match(str_);
0305   if(match.hasMatch()) {
0306     const QString innerText = match.captured(1);
0307     QStringList genres;
0308     QRegularExpressionMatchIterator i = anchorRx.globalMatch(innerText);
0309     while(i.hasNext()) {
0310       match = i.next();
0311       const QString s = match.captured(1).simplified();
0312       if(!s.isEmpty()) {
0313         genres += s;
0314       }
0315     }
0316     if(!genres.isEmpty()) {
0317       genres.removeDuplicates();
0318       entry->setField(QStringLiteral("genre"), genres.join(Tellico::FieldFormat::delimiterString()));
0319     }
0320   }
0321 
0322   QRegularExpression directorRx(QStringLiteral("itemprop=\"director\".*?>(.*?)<br"),
0323                                 QRegularExpression::DotMatchesEverythingOption);
0324   match = directorRx.match(str_);
0325   if(match.hasMatch()) {
0326     const QString innerText = match.captured(1);
0327     QStringList directors;
0328     QRegularExpressionMatchIterator i = anchorRx.globalMatch(innerText);
0329     while(i.hasNext()) {
0330       match = i.next();
0331       QString s = match.captured(1).simplified();
0332       if(!s.isEmpty()) {
0333         directors += s.remove(tagRx);
0334       }
0335     }
0336     if(!directors.isEmpty()) {
0337       entry->setField(QStringLiteral("director"), directors.join(Tellico::FieldFormat::delimiterString()));
0338     }
0339   }
0340 
0341   QRegularExpression runtimeRx(QStringLiteral("Тривалість:.*?(\\d+).*?хв<br>"),
0342                                QRegularExpression::DotMatchesEverythingOption);
0343   match = runtimeRx.match(str_);
0344   if(match.hasMatch()) {
0345     entry->setField(QStringLiteral("running-time"), match.captured(1));
0346   }
0347 
0348   QRegularExpression plotRx(QStringLiteral("itemprop=[\"']description[\"'].*?>(.+?)</div"),
0349                             QRegularExpression::DotMatchesEverythingOption);
0350   match = plotRx.match(str_);
0351   if(match.hasMatch()) {
0352     entry->setField(QStringLiteral("plot"), Tellico::decodeHTML(match.captured(1).simplified()));
0353   } else {
0354     plotRx.setPattern(QStringLiteral("<meta name=\"og:description\" content=\"(.+?)\""));
0355     match = plotRx.match(str_);
0356     if(match.hasMatch()) {
0357       entry->setField(QStringLiteral("plot"), Tellico::decodeHTML(match.captured(1)));
0358     }
0359   }
0360 
0361   QString cover;
0362   QRegularExpression coverRx(QStringLiteral("<img\\s.*?src=[\"'](.+?)[\"'].+?itemprop=[\"']image[\"']"));
0363   match = coverRx.match(str_);
0364   if(match.hasMatch()) {
0365     cover = match.captured(1);
0366   } else {
0367     coverRx.setPattern(QStringLiteral("<meta property=\"og:image\" content=\"(.+?)\""));
0368     match = coverRx.match(str_);
0369     if(match.hasMatch()) {
0370       cover = match.captured(1);
0371     }
0372   }
0373   if(!cover.isEmpty()) {
0374 //    myDebug() << "cover:" << cover;
0375     const QString id = ImageFactory::addImage(QUrl::fromUserInput(cover), true /* quiet */);
0376     if(id.isEmpty()) {
0377       message(i18n("The cover image could not be loaded."), MessageHandler::Warning);
0378     }
0379     // empty image ID is ok
0380     entry->setField(QStringLiteral("cover"), id);
0381   }
0382 
0383   return entry;
0384 }
0385 
0386 void KinoTeatrFetcher::parsePeople(Data::EntryPtr entry_, const QString& str_) {
0387   if(!entry_) {
0388     myDebug() << "no entry";
0389     return;
0390   }
0391 
0392   QRegularExpression nameDivRx(QStringLiteral("<div.*?>(.+?)</div"),
0393                                QRegularExpression::DotMatchesEverythingOption);
0394   QRegularExpression anchorRx(QStringLiteral("<a[^>]+?person[^>]+?>(.+?)</a"));
0395   QRegularExpression roleRx(QStringLiteral("<br>(.+?)$"));
0396 
0397   QRegularExpression castRx(QStringLiteral("Актори(.+?)<(header|/section)"),
0398                             QRegularExpression::DotMatchesEverythingOption);
0399   auto match = castRx.match(str_);
0400   if(match.hasMatch()) {
0401     const QString innerText = match.captured(1);
0402     QStringList actors, roles;
0403     auto i = nameDivRx.globalMatch(innerText);
0404     while(i.hasNext()) {
0405       match = i.next();
0406       QRegularExpressionMatch anchorMatch = anchorRx.match(match.captured(1));
0407       if(anchorMatch.hasMatch()) {
0408         actors += anchorMatch.captured(1).simplified();
0409         auto roleMatch = roleRx.match(match.captured(1));
0410         roles += roleMatch.hasMatch() ? roleMatch.captured(1).simplified() : QString();
0411       }
0412     }
0413     // interleave actors and roles
0414     QStringList cast;
0415     for(int i = 0; i< actors.length(); ++i) {
0416       QString row = actors.at(i);
0417       if(!roles.at(i).isEmpty()) {
0418         row += FieldFormat::columnDelimiterString() + roles.at(i);
0419       }
0420       cast += row;
0421     }
0422     if(!cast.isEmpty()) {
0423 //      myDebug() << cast;
0424       entry_->setField(QStringLiteral("cast"), cast.join(FieldFormat::rowDelimiterString()));
0425     }
0426   }
0427 
0428   QRegularExpression writerRx(QStringLiteral("Сценаристи(.+?)<(header|/section)"),
0429                               QRegularExpression::DotMatchesEverythingOption);
0430   match = writerRx.match(str_);
0431   if(match.hasMatch()) {
0432     const QString innerText = match.captured(1);
0433     QStringList writers;
0434     auto i = nameDivRx.globalMatch(innerText);
0435     while(i.hasNext()) {
0436       match = i.next();
0437       auto anchorMatch = anchorRx.match(match.captured(1));
0438       if(anchorMatch.hasMatch()) {
0439         writers += anchorMatch.captured(1).simplified();
0440       }
0441     }
0442     if(!writers.isEmpty()) {
0443       entry_->setField(QStringLiteral("writer"), writers.join(FieldFormat::delimiterString()));
0444     }
0445   }
0446 
0447   QRegularExpression producerRx(QStringLiteral("Продюсери(.+?)<(header|/section)"),
0448                                 QRegularExpression::DotMatchesEverythingOption);
0449   match = producerRx.match(str_);
0450   if(match.hasMatch()) {
0451     const QString innerText = match.captured(1);
0452     QStringList producers;
0453     auto i = nameDivRx.globalMatch(innerText);
0454     while(i.hasNext()) {
0455       match = i.next();
0456       auto anchorMatch = anchorRx.match(match.captured(1));
0457       if(anchorMatch.hasMatch()) {
0458         producers += anchorMatch.captured(1).simplified();
0459       }
0460     }
0461     if(!producers.isEmpty()) {
0462       entry_->setField(QStringLiteral("producer"), producers.join(FieldFormat::delimiterString()));
0463     }
0464   }
0465 
0466   QRegularExpression composerRx(QStringLiteral("Композитори(.+?)<(header|/section)"),
0467                                 QRegularExpression::DotMatchesEverythingOption);
0468   match = composerRx.match(str_);
0469   if(match.hasMatch()) {
0470     const QString innerText = match.captured(1);
0471     QStringList composers;
0472     auto i = nameDivRx.globalMatch(innerText);
0473     while(i.hasNext()) {
0474       match = i.next();
0475       auto anchorMatch = anchorRx.match(match.captured(1));
0476       if(anchorMatch.hasMatch()) {
0477         composers += anchorMatch.captured(1).simplified();
0478       }
0479     }
0480     if(!composers.isEmpty()) {
0481       entry_->setField(QStringLiteral("composer"), composers.join(FieldFormat::delimiterString()));
0482     }
0483   }
0484 }
0485 
0486 Tellico::Fetch::FetchRequest KinoTeatrFetcher::updateRequest(Data::EntryPtr entry_) {
0487   QString t = entry_->field(QStringLiteral("title"));
0488   if(!t.isEmpty()) {
0489     return FetchRequest(Fetch::Title, t);
0490   }
0491   return FetchRequest();
0492 }
0493 
0494 Tellico::Fetch::ConfigWidget* KinoTeatrFetcher::configWidget(QWidget* parent_) const {
0495   return new KinoTeatrFetcher::ConfigWidget(parent_);
0496 }
0497 
0498 QString KinoTeatrFetcher::defaultName() {
0499   return QStringLiteral("Кіно-Театр (kino-teatr.ua)");
0500 }
0501 
0502 QString KinoTeatrFetcher::defaultIcon() {
0503   return favIcon("https://kino-teatr.ua");
0504 }
0505 
0506 Tellico::StringHash KinoTeatrFetcher::allOptionalFields() {
0507   StringHash hash;
0508   hash[QStringLiteral("origtitle")] = i18n("Original Title");
0509   hash[QStringLiteral("kinoteatr")] = i18n("Kino-Teatr Link");
0510   return hash;
0511 }
0512 
0513 KinoTeatrFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const KinoTeatrFetcher* fetcher_)
0514     : Fetch::ConfigWidget(parent_) {
0515   QVBoxLayout* l = new QVBoxLayout(optionsWidget());
0516   l->addWidget(new QLabel(i18n("This source has no options."), optionsWidget()));
0517   l->addStretch();
0518 
0519   addFieldsWidget(KinoTeatrFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList());
0520 }
0521 
0522 QString KinoTeatrFetcher::ConfigWidget::preferredName() const {
0523   return KinoTeatrFetcher::defaultName();
0524 }