File indexing completed on 2024-05-19 16:18:42

0001 /***************************************************************************
0002     Copyright (C) 2005-2020 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "entrezfetcher.h"
0026 #include "../utils/guiproxy.h"
0027 #include "../collection.h"
0028 #include "../entry.h"
0029 #include "../fieldformat.h"
0030 #include "../core/filehandler.h"
0031 #include "../translators/xslthandler.h"
0032 #include "../translators/tellicoimporter.h"
0033 #include "../utils/datafileregistry.h"
0034 #include "../tellico_debug.h"
0035 
0036 #include <KLocalizedString>
0037 #include <KIO/Job>
0038 #include <KIO/JobUiDelegate>
0039 #include <KConfigGroup>
0040 #include <KJobWidgets/KJobWidgets>
0041 
0042 #include <QDomDocument>
0043 #include <QLabel>
0044 #include <QFile>
0045 #include <QTextStream>
0046 #include <QGridLayout>
0047 #include <QLineEdit>
0048 #include <QUrlQuery>
0049 #include <QThread>
0050 #include <QJsonDocument>
0051 #include <QJsonObject>
0052 
// File-local constants describing the NCBI Entrez E-utilities endpoints.
namespace {
  // upper bound on the number of document summaries requested per call
  static const int ENTREZ_MAX_RETURNS_TOTAL = 25;
  // use HTTPS so that the search term and the user's API key are never sent in clear text
  static const char* ENTREZ_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";
  // script names, appended to the base URL path
  static const char* ENTREZ_SEARCH_CGI = "esearch.fcgi";
  static const char* ENTREZ_SUMMARY_CGI = "esummary.fcgi";
  static const char* ENTREZ_FETCH_CGI = "efetch.fcgi";
  static const char* ENTREZ_LINK_CGI = "elink.fcgi";
  // database queried when the configuration does not name one
  static const char* ENTREZ_DEFAULT_DATABASE = "pubmed";
}
0062 
0063 using namespace Tellico;
0064 using namespace Tellico::Fetch;
0065 using Tellico::Fetch::EntrezFetcher;
0066 
0067 EntrezFetcher::EntrezFetcher(QObject* parent_) : Fetcher(parent_), m_xsltHandler(nullptr),
0068     m_start(1), m_total(-1), m_step(Begin), m_started(false) {
0069   m_idleTime.start();
0070 }
0071 
0072 EntrezFetcher::~EntrezFetcher() {
0073 }
0074 
0075 QString EntrezFetcher::source() const {
0076   return m_name.isEmpty() ? defaultName() : m_name;
0077 }
0078 
0079 bool EntrezFetcher::canSearch(Fetch::FetchKey k) const {
0080   return k == Title || k == Person || k == Keyword || k == Raw || k == PubmedID || k == DOI;
0081 }
0082 
0083 bool EntrezFetcher::canFetch(int type) const {
0084   return type == Data::Collection::Bibtex;
0085 }
0086 
0087 void EntrezFetcher::readConfigHook(const KConfigGroup& config_) {
0088   QString s = config_.readEntry("Database", ENTREZ_DEFAULT_DATABASE); // default to pubmed
0089   if(!s.isEmpty()) {
0090     m_dbname = s;
0091   }
0092   QString k = config_.readEntry("API Key");
0093   if(!k.isEmpty()) {
0094     m_apiKey = k;
0095   }
0096 }
0097 
0098 void EntrezFetcher::search() {
0099   m_started = true;
0100   m_start = 1;
0101   m_total = -1;
0102 
0103   if(m_dbname.isEmpty()) {
0104     m_dbname = QLatin1String(ENTREZ_DEFAULT_DATABASE);
0105   }
0106 
0107   QUrl u(QString::fromLatin1(ENTREZ_BASE_URL));
0108   u.setPath(u.path() + QLatin1String(ENTREZ_SEARCH_CGI));
0109   QUrlQuery q;
0110   q.addQueryItem(QStringLiteral("tool"),       QStringLiteral("Tellico"));
0111   q.addQueryItem(QStringLiteral("retmode"),    QStringLiteral("xml"));
0112   q.addQueryItem(QStringLiteral("usehistory"), QStringLiteral("y"));
0113   q.addQueryItem(QStringLiteral("retmax"),     QStringLiteral("1")); // we're just getting the count
0114   q.addQueryItem(QStringLiteral("db"),         m_dbname);
0115   q.addQueryItem(QStringLiteral("term"),       request().value());
0116   switch(request().key()) {
0117     case Title:
0118       q.addQueryItem(QStringLiteral("field"), QStringLiteral("titl"));
0119       break;
0120 
0121     case Person:
0122       q.addQueryItem(QStringLiteral("field"), QStringLiteral("auth"));
0123       break;
0124 
0125     case Keyword:
0126       // for Tellico Keyword searches basically mean search for any field matching
0127 //      q.addQueryItem(QLatin1String("field"), QLatin1String("word"));
0128       break;
0129 
0130     case PubmedID:
0131       q.addQueryItem(QStringLiteral("field"), QStringLiteral("pmid"));
0132       break;
0133 
0134     case DOI:
0135     case Raw:
0136       // for DOI, enough to match any field to DOI value
0137       //q.setQuery(u.query() + QLatin1Char('&') + request().value());
0138       break;
0139 
0140     default:
0141       myWarning() << "key not supported:" << request().key();
0142       stop();
0143       return;
0144   }
0145   if(!m_apiKey.isEmpty()) {
0146     q.addQueryItem(QStringLiteral("api_key"), m_apiKey);
0147   }
0148   u.setQuery(q);
0149 
0150   m_step = Search;
0151 //  myLog() << "search url: " << u.url();
0152   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0153   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0154   connect(m_job.data(), &KJob::result,
0155           this, &EntrezFetcher::slotComplete);
0156   markTime();
0157 }
0158 
0159 void EntrezFetcher::continueSearch() {
0160   m_started = true;
0161   doSummary();
0162 }
0163 
0164 void EntrezFetcher::stop() {
0165   if(!m_started) {
0166     return;
0167   }
0168   if(m_job) {
0169     m_job->kill();
0170     m_job = nullptr;
0171   }
0172   m_started = false;
0173   m_step = Begin;
0174   emit signalDone(this);
0175 }
0176 
0177 void EntrezFetcher::slotComplete(KJob*) {
0178   Q_ASSERT(m_job);
0179   if(m_job->error()) {
0180     m_job->uiDelegate()->showErrorMessage();
0181     stop();
0182     return;
0183   }
0184 
0185   QByteArray data = m_job->data();
0186   if(data.isEmpty()) {
0187     myDebug() << "no data";
0188     stop();
0189     return;
0190   }
0191   // see bug 319662. If fetcher is cancelled, job is killed
0192   // if the pointer is retained, it gets double-deleted
0193   m_job = nullptr;
0194 
0195 #if 0
0196   myWarning() << "Remove debug from entrezfetcher.cpp: " << __LINE__;
0197   QFile f(QLatin1String("/tmp/test.xml"));
0198   if(f.open(QIODevice::WriteOnly)) {
0199     QTextStream t(&f);
0200     t.setCodec("UTF-8");
0201     t << data;
0202   }
0203   f.close();
0204 #endif
0205 
0206   switch(m_step) {
0207     case Search:
0208       searchResults(data);
0209       break;
0210     case Summary:
0211       summaryResults(data);
0212       break;
0213     case Begin:
0214     case Fetch:
0215     default:
0216       myLog() << "wrong step =" << m_step;
0217       stop();
0218       break;
0219   }
0220 }
0221 
0222 void EntrezFetcher::searchResults(const QByteArray& data_) {
0223   QDomDocument dom;
0224   if(!dom.setContent(data_, false)) {
0225     myWarning() << "server did not return valid XML.";
0226     stop();
0227     return;
0228   }
0229   // find Count, QueryKey, and WebEnv elements
0230   int count = 0;
0231   for(QDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) {
0232     QDomElement e = n.toElement();
0233     if(e.isNull()) {
0234       continue;
0235     }
0236     if(e.tagName() == QLatin1String("Count")) {
0237       m_total = e.text().toInt();
0238       ++count;
0239     } else if(e.tagName() == QLatin1String("QueryKey")) {
0240       m_queryKey = e.text();
0241       ++count;
0242     } else if(e.tagName() == QLatin1String("WebEnv")) {
0243       m_webEnv = e.text();
0244       ++count;
0245     }
0246     if(count >= 3) {
0247       break; // found them all
0248     }
0249   }
0250 
0251   doSummary();
0252 }
0253 
0254 void EntrezFetcher::doSummary() {
0255   QUrl u(QString::fromLatin1(ENTREZ_BASE_URL));
0256   u.setPath(u.path() + QLatin1String(ENTREZ_SUMMARY_CGI));
0257   QUrlQuery q;
0258   q.addQueryItem(QStringLiteral("tool"),       QStringLiteral("Tellico"));
0259   q.addQueryItem(QStringLiteral("retmode"),    QStringLiteral("xml"));
0260   if(m_start > 1) {
0261     q.addQueryItem(QStringLiteral("retstart"),   QString::number(m_start));
0262   }
0263   q.addQueryItem(QStringLiteral("retmax"),     QString::number(qMin(m_total-m_start-1, ENTREZ_MAX_RETURNS_TOTAL)));
0264   q.addQueryItem(QStringLiteral("usehistory"), QStringLiteral("y"));
0265   q.addQueryItem(QStringLiteral("db"),         m_dbname);
0266   q.addQueryItem(QStringLiteral("query_key"),  m_queryKey);
0267   q.addQueryItem(QStringLiteral("WebEnv"),     m_webEnv);
0268   if(!m_apiKey.isEmpty()) {
0269     q.addQueryItem(QStringLiteral("api_key"), m_apiKey);
0270   }
0271   u.setQuery(q);
0272 
0273   m_step = Summary;
0274 //  myLog() << "summary url:" << u.url();
0275   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0276   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0277   connect(m_job.data(), &KJob::result,
0278           this, &EntrezFetcher::slotComplete);
0279   markTime();
0280 }
0281 
0282 void EntrezFetcher::summaryResults(const QByteArray& data_) {
0283   QDomDocument dom;
0284   if(!dom.setContent(data_, false)) {
0285     myWarning() << "server did not return valid XML.";
0286     stop();
0287     return;
0288   }
0289   // top child is eSummaryResult
0290   // all children are DocSum
0291   for(QDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) {
0292     QDomElement e = n.toElement();
0293     if(e.isNull() || e.tagName() != QLatin1String("DocSum")) {
0294       continue;
0295     }
0296     QDomNodeList nodes = e.elementsByTagName(QStringLiteral("Id"));
0297     if(nodes.count() == 0) {
0298       myDebug() << "no Id elements";
0299       continue;
0300     }
0301     int id = nodes.item(0).toElement().text().toInt();
0302     QString title, pubdate, authors;
0303     nodes = e.elementsByTagName(QStringLiteral("Item"));
0304     for(int j = 0; j < nodes.count(); ++j) {
0305       if(nodes.item(j).toElement().attribute(QStringLiteral("Name")) == QLatin1String("Title")) {
0306         title = nodes.item(j).toElement().text();
0307       } else if(nodes.item(j).toElement().attribute(QStringLiteral("Name")) == QLatin1String("PubDate")) {
0308         pubdate = nodes.item(j).toElement().text();
0309       } else if(nodes.item(j).toElement().attribute(QStringLiteral("Name")) == QLatin1String("AuthorList")) {
0310         QStringList list;
0311         for(QDomNode aNode = nodes.item(j).firstChild(); !aNode.isNull(); aNode = aNode.nextSibling()) {
0312           // lazy, assume all children Items are authors
0313           if(aNode.nodeName() == QLatin1String("Item")) {
0314             list << aNode.toElement().text();
0315           }
0316         }
0317         authors = list.join(FieldFormat::delimiterString());
0318       }
0319       if(!title.isEmpty() && !pubdate.isEmpty() && !authors.isEmpty()) {
0320         break; // done now
0321       }
0322     }
0323     FetchResult* r = new FetchResult(this, title, pubdate + QLatin1Char('/') + authors);
0324     m_matches.insert(r->uid, id);
0325     emit signalResultFound(r);
0326   }
0327   m_start = m_matches.count() + 1;
0328   m_hasMoreResults = m_start <= m_total;
0329   stop(); // done searching
0330 }
0331 
0332 Tellico::Data::EntryPtr EntrezFetcher::fetchEntryHook(uint uid_) {
0333   // if we already grabbed this one, then just pull it out of the dict
0334   Data::EntryPtr entry = m_entries[uid_];
0335   if(entry) {
0336     return entry;
0337   }
0338 
0339   if(!m_matches.contains(uid_)) {
0340     return Data::EntryPtr();
0341   }
0342 
0343   if(!m_xsltHandler) {
0344     initXSLTHandler();
0345     if(!m_xsltHandler) { // probably an error somewhere in the stylesheet loading
0346       stop();
0347       return Data::EntryPtr();
0348     }
0349   }
0350 
0351   int id = m_matches[uid_];
0352 
0353   QUrl u(QString::fromLatin1(ENTREZ_BASE_URL));
0354   u.setPath(u.path() + QLatin1String(ENTREZ_FETCH_CGI));
0355   QUrlQuery q;
0356   q.addQueryItem(QStringLiteral("tool"),       QStringLiteral("Tellico"));
0357   q.addQueryItem(QStringLiteral("retmode"),    QStringLiteral("xml"));
0358   q.addQueryItem(QStringLiteral("rettype"),    QStringLiteral("abstract"));
0359   q.addQueryItem(QStringLiteral("db"),         m_dbname);
0360   q.addQueryItem(QStringLiteral("id"),         QString::number(id));
0361   if(!m_apiKey.isEmpty()) {
0362     q.addQueryItem(QStringLiteral("api_key"), m_apiKey);
0363   }
0364   u.setQuery(q);
0365 
0366   // now it's synchronous
0367 //  myDebug() << "id url:" << u.url();
0368   markTime();
0369   QString xmlOutput = FileHandler::readXMLFile(u, true /*quiet*/);
0370   if(xmlOutput.isEmpty()) {
0371     myWarning() << "unable to download " << u;
0372     return Data::EntryPtr();
0373   }
0374 #if 0
0375   myWarning() << "turn me off in entrezfetcher.cpp!";
0376   QFile f1(QLatin1String("/tmp/test-entry.xml"));
0377   if(f1.open(QIODevice::WriteOnly)) {
0378     QTextStream t(&f1);
0379     t.setCodec("UTF-8");
0380     t << xmlOutput;
0381   }
0382   f1.close();
0383 #endif
0384   QString str = m_xsltHandler->applyStylesheet(xmlOutput);
0385   if(str.isEmpty()) {
0386     // might be an API error, and message is in JSON
0387     QJsonDocument doc = QJsonDocument::fromJson(xmlOutput.toUtf8());
0388     if(!doc.isNull() && doc.object().contains(QStringLiteral("error"))) {
0389       const QString error = doc.object().value(QStringLiteral("error")).toString();
0390       message(error, MessageHandler::Error);
0391       myLog() << "EntrezFetcher -" << error;
0392     }
0393     return Data::EntryPtr();
0394   }
0395   Import::TellicoImporter imp(str);
0396   Data::CollPtr coll = imp.collection();
0397   if(!coll) {
0398     myWarning() << "invalid collection";
0399     return Data::EntryPtr();
0400   }
0401   if(coll->entryCount() == 0) {
0402     myDebug() << "no entries in collection";
0403     return Data::EntryPtr();
0404   } else if(coll->entryCount() > 1) {
0405     myDebug() << "collection has multiple entries, taking first one";
0406   }
0407 
0408   Data::EntryPtr e = coll->entries().front();
0409 
0410   // try to get a link, but only if necessary
0411   if(optionalFields().contains(QStringLiteral("url"))) {
0412     QUrl link(QString::fromLatin1(ENTREZ_BASE_URL));
0413     link.setPath(link.path() + QLatin1String(ENTREZ_LINK_CGI));
0414     QUrlQuery q;
0415     q.addQueryItem(QStringLiteral("tool"),   QStringLiteral("Tellico"));
0416     q.addQueryItem(QStringLiteral("cmd"),    QStringLiteral("llinks"));
0417     q.addQueryItem(QStringLiteral("db"),     m_dbname);
0418     q.addQueryItem(QStringLiteral("dbfrom"), m_dbname);
0419     q.addQueryItem(QStringLiteral("id"),     QString::number(id));
0420     if(!m_apiKey.isEmpty()) {
0421       q.addQueryItem(QStringLiteral("api_key"), m_apiKey);
0422     }
0423     link.setQuery(q);
0424 
0425     markTime();
0426     QDomDocument linkDom = FileHandler::readXMLDocument(link, false /* namespace */, true /* quiet */);
0427     // need eLinkResult/LinkSet/IdUrlList/IdUrlSet/ObjUrl/Url
0428     QDomNode linkNode = linkDom.namedItem(QStringLiteral("eLinkResult"))
0429                                .namedItem(QStringLiteral("LinkSet"))
0430                                .namedItem(QStringLiteral("IdUrlList"))
0431                                .namedItem(QStringLiteral("IdUrlSet"))
0432                                .namedItem(QStringLiteral("ObjUrl"))
0433                                .namedItem(QStringLiteral("Url"));
0434     if(!linkNode.isNull()) {
0435       QString u = linkNode.toElement().text();
0436 //      myDebug() << u;
0437       if(!u.isEmpty()) {
0438         if(!coll->hasField(QStringLiteral("url"))) {
0439           Data::FieldPtr field(new Data::Field(QStringLiteral("url"), i18n("URL"), Data::Field::URL));
0440           field->setCategory(i18n("Miscellaneous"));
0441           coll->addField(field);
0442         }
0443         e->setField(QStringLiteral("url"), u);
0444       }
0445     }
0446   }
0447 
0448   m_entries.insert(uid_, e);
0449   return e;
0450 }
0451 
0452 void EntrezFetcher::initXSLTHandler() {
0453   QString xsltfile = DataFileRegistry::self()->locate(QStringLiteral("pubmed2tellico.xsl"));
0454   if(xsltfile.isEmpty()) {
0455     myWarning() << "can not locate pubmed2tellico.xsl.";
0456     return;
0457   }
0458 
0459   QUrl u = QUrl::fromLocalFile(xsltfile);
0460 
0461   if(!m_xsltHandler) {
0462     m_xsltHandler = new XSLTHandler(u);
0463   }
0464   if(!m_xsltHandler->isValid()) {
0465     myWarning() << "error in pubmed2tellico.xsl.";
0466     delete m_xsltHandler;
0467     m_xsltHandler = nullptr;
0468     return;
0469   }
0470 }
0471 
0472 // without an API key, limit is 3 searches per second
0473 // with a key, limit is 10
0474 // https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
0475 void EntrezFetcher::markTime() {
0476   // not exactly the way to monitor rate over 3 or 10 calls, just a constant rate
0477   const int wait = m_apiKey.isEmpty() ? 350 : 110;
0478   while(m_idleTime.elapsed() < wait) {
0479     QThread::msleep(100);
0480   }
0481   m_idleTime.restart();
0482 }
0483 
0484 Tellico::Fetch::FetchRequest EntrezFetcher::updateRequest(Data::EntryPtr entry_) {
0485   QString s = entry_->field(QStringLiteral("pmid"));
0486   if(!s.isEmpty()) {
0487     return FetchRequest(PubmedID, s);
0488   }
0489 
0490   s = entry_->field(QStringLiteral("doi"));
0491   if(!s.isEmpty()) {
0492     return FetchRequest(DOI, s);
0493   }
0494 
0495   s = entry_->field(QStringLiteral("title"));
0496   if(!s.isEmpty()) {
0497     return FetchRequest(Title, s);
0498   }
0499   return FetchRequest();
0500 }
0501 
0502 QString EntrezFetcher::defaultName() {
0503   return i18n("Entrez Database");
0504 }
0505 
0506 QString EntrezFetcher::defaultIcon() {
0507   return favIcon("http://www.ncbi.nlm.nih.gov");
0508 }
0509 
0510 //static
0511 Tellico::StringHash EntrezFetcher::allOptionalFields() {
0512   StringHash hash;
0513   hash[QStringLiteral("institution")] = i18n("Institution");
0514   hash[QStringLiteral("abstract")]    = i18n("Abstract");
0515   hash[QStringLiteral("url")]         = i18n("URL");
0516   return hash;
0517 }
0518 
0519 Tellico::Fetch::ConfigWidget* EntrezFetcher::configWidget(QWidget* parent_) const {
0520   return new EntrezFetcher::ConfigWidget(parent_, this);
0521 }
0522 
0523 EntrezFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const EntrezFetcher* fetcher_/*=0*/)
0524     : Fetch::ConfigWidget(parent_) {
0525   QGridLayout* l = new QGridLayout(optionsWidget());
0526   l->setSpacing(4);
0527   l->setColumnStretch(1, 10);
0528 
0529   int row = -1;
0530 
0531   QLabel* label = new QLabel(i18n("Access key: "), optionsWidget());
0532   l->addWidget(label, ++row, 0);
0533 
0534   m_apiKeyEdit = new QLineEdit(optionsWidget());
0535   connect(m_apiKeyEdit, &QLineEdit::textChanged, this, &ConfigWidget::slotSetModified);
0536   l->addWidget(m_apiKeyEdit, row, 1);
0537   QString w = i18n("The default Tellico key may be used, but searching may fail due to reaching access limits.");
0538   label->setWhatsThis(w);
0539   m_apiKeyEdit->setWhatsThis(w);
0540   label->setBuddy(m_apiKeyEdit);
0541 
0542   l->setRowStretch(++row, 10);
0543 
0544   // now add additional fields widget
0545   addFieldsWidget(EntrezFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList());
0546 
0547   if(fetcher_) {
0548     m_apiKeyEdit->setText(fetcher_->m_apiKey);
0549   }
0550 }
0551 
0552 void EntrezFetcher::ConfigWidget::saveConfigHook(KConfigGroup& config_) {
0553   QString apiKey = m_apiKeyEdit->text().trimmed();
0554   if(!apiKey.isEmpty()) {
0555     config_.writeEntry("API Key", apiKey);
0556   }
0557 }
0558 
0559 QString EntrezFetcher::ConfigWidget::preferredName() const {
0560   return EntrezFetcher::defaultName();
0561 }