File indexing completed on 2024-05-12 05:09:31

0001 /***************************************************************************
0002     Copyright (C) 2005-2020 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "entrezfetcher.h"
0026 #include "../utils/guiproxy.h"
0027 #include "../collection.h"
0028 #include "../entry.h"
0029 #include "../fieldformat.h"
0030 #include "../core/filehandler.h"
0031 #include "../translators/xslthandler.h"
0032 #include "../translators/tellicoimporter.h"
0033 #include "../utils/datafileregistry.h"
0034 #include "../tellico_debug.h"
0035 
0036 #include <KLocalizedString>
0037 #include <KIO/Job>
0038 #include <KIO/JobUiDelegate>
0039 #include <KConfigGroup>
0040 #include <KJobWidgets/KJobWidgets>
0041 
0042 #include <QDomDocument>
0043 #include <QLabel>
0044 #include <QFile>
0045 #include <QTextStream>
0046 #include <QGridLayout>
0047 #include <QLineEdit>
0048 #include <QUrlQuery>
0049 #include <QThread>
0050 #include <QJsonDocument>
0051 #include <QJsonObject>
0052 
// File-local constants describing the NCBI Entrez E-utilities web service.
namespace {
  // upper bound on the number of summaries requested per page
  static const int ENTREZ_MAX_RETURNS_TOTAL = 25;
  // use the TLS endpoint: the query string may carry the user's API key,
  // which must not travel in clear text
  static const char* ENTREZ_BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";
  // CGI names, each appended to the base URL path
  static const char* ENTREZ_SEARCH_CGI = "esearch.fcgi";
  static const char* ENTREZ_SUMMARY_CGI = "esummary.fcgi";
  static const char* ENTREZ_FETCH_CGI = "efetch.fcgi";
  static const char* ENTREZ_LINK_CGI = "elink.fcgi";
  // database queried when the configuration does not name one
  static const char* ENTREZ_DEFAULT_DATABASE = "pubmed";
}
0062 
0063 using namespace Tellico;
0064 using namespace Tellico::Fetch;
0065 using Tellico::Fetch::EntrezFetcher;
0066 
0067 EntrezFetcher::EntrezFetcher(QObject* parent_) : Fetcher(parent_), m_xsltHandler(nullptr),
0068     m_start(1), m_total(-1), m_step(Step::Begin), m_started(false) {
0069   m_idleTime.start();
0070 }
0071 
0072 EntrezFetcher::~EntrezFetcher() {
0073 }
0074 
0075 QString EntrezFetcher::source() const {
0076   return m_name.isEmpty() ? defaultName() : m_name;
0077 }
0078 
0079 bool EntrezFetcher::canSearch(Fetch::FetchKey k) const {
0080   return k == Title || k == Person || k == Keyword || k == Raw || k == PubmedID || k == DOI;
0081 }
0082 
0083 bool EntrezFetcher::canFetch(int type) const {
0084   return type == Data::Collection::Bibtex;
0085 }
0086 
0087 void EntrezFetcher::readConfigHook(const KConfigGroup& config_) {
0088   QString s = config_.readEntry("Database", ENTREZ_DEFAULT_DATABASE); // default to pubmed
0089   if(!s.isEmpty()) {
0090     m_dbname = s;
0091   }
0092   QString k = config_.readEntry("API Key");
0093   if(!k.isEmpty()) {
0094     m_apiKey = k;
0095   }
0096 }
0097 
0098 void EntrezFetcher::search() {
0099   m_started = true;
0100   m_start = 1;
0101   m_total = -1;
0102 
0103   if(m_dbname.isEmpty()) {
0104     m_dbname = QLatin1String(ENTREZ_DEFAULT_DATABASE);
0105   }
0106 
0107   QUrl u(QString::fromLatin1(ENTREZ_BASE_URL));
0108   u.setPath(u.path() + QLatin1String(ENTREZ_SEARCH_CGI));
0109   QUrlQuery q;
0110   q.addQueryItem(QStringLiteral("tool"),       QStringLiteral("Tellico"));
0111   q.addQueryItem(QStringLiteral("retmode"),    QStringLiteral("xml"));
0112   q.addQueryItem(QStringLiteral("usehistory"), QStringLiteral("y"));
0113   q.addQueryItem(QStringLiteral("retmax"),     QStringLiteral("1")); // we're just getting the count
0114   q.addQueryItem(QStringLiteral("db"),         m_dbname);
0115   q.addQueryItem(QStringLiteral("term"),       request().value());
0116   switch(request().key()) {
0117     case Title:
0118       q.addQueryItem(QStringLiteral("field"), QStringLiteral("titl"));
0119       break;
0120 
0121     case Person:
0122       q.addQueryItem(QStringLiteral("field"), QStringLiteral("auth"));
0123       break;
0124 
0125     case Keyword:
0126       // for Tellico Keyword searches basically mean search for any field matching
0127 //      q.addQueryItem(QLatin1String("field"), QLatin1String("word"));
0128       break;
0129 
0130     case PubmedID:
0131       q.addQueryItem(QStringLiteral("field"), QStringLiteral("pmid"));
0132       break;
0133 
0134     case DOI:
0135     case Raw:
0136       // for DOI, enough to match any field to DOI value
0137       //q.setQuery(u.query() + QLatin1Char('&') + request().value());
0138       break;
0139 
0140     default:
0141       myWarning() << source() << "- key not recognized:" << request().key();
0142       stop();
0143       return;
0144   }
0145   if(!m_apiKey.isEmpty()) {
0146     q.addQueryItem(QStringLiteral("api_key"), m_apiKey);
0147   }
0148   u.setQuery(q);
0149 
0150   m_step = Step::Search;
0151 //  myLog() << "search url: " << u.url();
0152   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0153   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0154   connect(m_job.data(), &KJob::result,
0155           this, &EntrezFetcher::slotComplete);
0156   markTime();
0157 }
0158 
0159 void EntrezFetcher::continueSearch() {
0160   m_started = true;
0161   doSummary();
0162 }
0163 
0164 void EntrezFetcher::stop() {
0165   if(!m_started) {
0166     return;
0167   }
0168   if(m_job) {
0169     m_job->kill();
0170     m_job = nullptr;
0171   }
0172   m_started = false;
0173   m_step = Step::Begin;
0174   emit signalDone(this);
0175 }
0176 
0177 void EntrezFetcher::slotComplete(KJob*) {
0178   Q_ASSERT(m_job);
0179   if(m_job->error()) {
0180     m_job->uiDelegate()->showErrorMessage();
0181     stop();
0182     return;
0183   }
0184 
0185   QByteArray data = m_job->data();
0186   if(data.isEmpty()) {
0187     myDebug() << "no data";
0188     stop();
0189     return;
0190   }
0191   // see bug 319662. If fetcher is cancelled, job is killed
0192   // if the pointer is retained, it gets double-deleted
0193   m_job = nullptr;
0194 
0195 #if 0
0196   myWarning() << "Remove debug from entrezfetcher.cpp: " << __LINE__;
0197   QFile f(QLatin1String("/tmp/test.xml"));
0198   if(f.open(QIODevice::WriteOnly)) {
0199     QTextStream t(&f);
0200     t.setCodec("UTF-8");
0201     t << data;
0202   }
0203   f.close();
0204 #endif
0205 
0206   switch(m_step) {
0207     case Step::Search:
0208       searchResults(data);
0209       break;
0210     case Step::Summary:
0211       summaryResults(data);
0212       break;
0213     case Step::Begin:
0214     case Step::Fetch:
0215     default:
0216       myLog() << "wrong step =" << int(m_step);
0217       stop();
0218       break;
0219   }
0220 }
0221 
0222 void EntrezFetcher::searchResults(const QByteArray& data_) {
0223   QDomDocument dom;
0224   if(!dom.setContent(data_, false)) {
0225     myWarning() << "server did not return valid XML.";
0226     stop();
0227     return;
0228   }
0229   // find Count, QueryKey, and WebEnv elements
0230   int count = 0;
0231   for(QDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) {
0232     QDomElement e = n.toElement();
0233     if(e.isNull()) {
0234       continue;
0235     }
0236     if(e.tagName() == QLatin1String("Count")) {
0237       m_total = e.text().toInt();
0238       ++count;
0239     } else if(e.tagName() == QLatin1String("QueryKey")) {
0240       m_queryKey = e.text();
0241       ++count;
0242     } else if(e.tagName() == QLatin1String("WebEnv")) {
0243       m_webEnv = e.text();
0244       ++count;
0245     }
0246     if(count >= 3) {
0247       break; // found them all
0248     }
0249   }
0250 
0251   doSummary();
0252 }
0253 
0254 void EntrezFetcher::doSummary() {
0255   QUrl u(QString::fromLatin1(ENTREZ_BASE_URL));
0256   u.setPath(u.path() + QLatin1String(ENTREZ_SUMMARY_CGI));
0257   QUrlQuery q;
0258   q.addQueryItem(QStringLiteral("tool"),       QStringLiteral("Tellico"));
0259   q.addQueryItem(QStringLiteral("retmode"),    QStringLiteral("xml"));
0260   if(m_start > 1) {
0261     q.addQueryItem(QStringLiteral("retstart"),   QString::number(m_start));
0262   }
0263   q.addQueryItem(QStringLiteral("retmax"),     QString::number(qMin(m_total-m_start-1, ENTREZ_MAX_RETURNS_TOTAL)));
0264   q.addQueryItem(QStringLiteral("usehistory"), QStringLiteral("y"));
0265   q.addQueryItem(QStringLiteral("db"),         m_dbname);
0266   q.addQueryItem(QStringLiteral("query_key"),  m_queryKey);
0267   q.addQueryItem(QStringLiteral("WebEnv"),     m_webEnv);
0268   if(!m_apiKey.isEmpty()) {
0269     q.addQueryItem(QStringLiteral("api_key"), m_apiKey);
0270   }
0271   u.setQuery(q);
0272 
0273   m_step = Step::Summary;
0274 //  myLog() << "summary url:" << u.url();
0275   m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo);
0276   KJobWidgets::setWindow(m_job, GUI::Proxy::widget());
0277   connect(m_job.data(), &KJob::result,
0278           this, &EntrezFetcher::slotComplete);
0279   markTime();
0280 }
0281 
0282 void EntrezFetcher::summaryResults(const QByteArray& data_) {
0283   QDomDocument dom;
0284   if(!dom.setContent(data_, false)) {
0285     myWarning() << "server did not return valid XML.";
0286     stop();
0287     return;
0288   }
0289   // top child is eSummaryResult
0290   // all children are DocSum
0291   for(QDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) {
0292     QDomElement e = n.toElement();
0293     if(e.isNull() || e.tagName() != QLatin1String("DocSum")) {
0294       continue;
0295     }
0296     QDomNodeList nodes = e.elementsByTagName(QStringLiteral("Id"));
0297     if(nodes.count() == 0) {
0298       myDebug() << "no Id elements";
0299       continue;
0300     }
0301     int id = nodes.item(0).toElement().text().toInt();
0302     QString title, pubdate, authors;
0303     nodes = e.elementsByTagName(QStringLiteral("Item"));
0304     for(int j = 0; j < nodes.count(); ++j) {
0305       const auto elem = nodes.item(j).toElement();
0306       if(elem.attribute(QStringLiteral("Name")) == QLatin1String("Title")) {
0307         title = elem.text();
0308       } else if(elem.attribute(QStringLiteral("Name")) == QLatin1String("PubDate")) {
0309         pubdate = elem.text();
0310       } else if(elem.attribute(QStringLiteral("Name")) == QLatin1String("AuthorList")) {
0311         QStringList list;
0312         for(QDomNode aNode = nodes.item(j).firstChild(); !aNode.isNull(); aNode = aNode.nextSibling()) {
0313           // lazy, assume all children Items are authors
0314           if(aNode.nodeName() == QLatin1String("Item")) {
0315             list << aNode.toElement().text();
0316           }
0317         }
0318         authors = list.join(FieldFormat::delimiterString());
0319       }
0320       if(!title.isEmpty() && !pubdate.isEmpty() && !authors.isEmpty()) {
0321         break; // done now
0322       }
0323     }
0324     FetchResult* r = new FetchResult(this, title, pubdate + QLatin1Char('/') + authors);
0325     m_matches.insert(r->uid, id);
0326     emit signalResultFound(r);
0327   }
0328   m_start = m_matches.count() + 1;
0329   m_hasMoreResults = m_start <= m_total;
0330   stop(); // done searching
0331 }
0332 
0333 Tellico::Data::EntryPtr EntrezFetcher::fetchEntryHook(uint uid_) {
0334   // if we already grabbed this one, then just pull it out of the dict
0335   Data::EntryPtr entry = m_entries[uid_];
0336   if(entry) {
0337     return entry;
0338   }
0339 
0340   if(!m_matches.contains(uid_)) {
0341     return Data::EntryPtr();
0342   }
0343 
0344   if(!m_xsltHandler) {
0345     initXSLTHandler();
0346     if(!m_xsltHandler) { // probably an error somewhere in the stylesheet loading
0347       stop();
0348       return Data::EntryPtr();
0349     }
0350   }
0351 
0352   int id = m_matches[uid_];
0353 
0354   QUrl u(QString::fromLatin1(ENTREZ_BASE_URL));
0355   u.setPath(u.path() + QLatin1String(ENTREZ_FETCH_CGI));
0356   QUrlQuery q;
0357   q.addQueryItem(QStringLiteral("tool"),       QStringLiteral("Tellico"));
0358   q.addQueryItem(QStringLiteral("retmode"),    QStringLiteral("xml"));
0359   q.addQueryItem(QStringLiteral("rettype"),    QStringLiteral("abstract"));
0360   q.addQueryItem(QStringLiteral("db"),         m_dbname);
0361   q.addQueryItem(QStringLiteral("id"),         QString::number(id));
0362   if(!m_apiKey.isEmpty()) {
0363     q.addQueryItem(QStringLiteral("api_key"), m_apiKey);
0364   }
0365   u.setQuery(q);
0366 
0367   // now it's synchronous
0368 //  myDebug() << "id url:" << u.url();
0369   markTime();
0370   QString xmlOutput = FileHandler::readXMLFile(u, true /*quiet*/);
0371   if(xmlOutput.isEmpty()) {
0372     myWarning() << "unable to download " << u;
0373     return Data::EntryPtr();
0374   }
0375 #if 0
0376   myWarning() << "turn me off in entrezfetcher.cpp!";
0377   QFile f1(QLatin1String("/tmp/test-entry.xml"));
0378   if(f1.open(QIODevice::WriteOnly)) {
0379     QTextStream t(&f1);
0380     t.setCodec("UTF-8");
0381     t << xmlOutput;
0382   }
0383   f1.close();
0384 #endif
0385   QString str = m_xsltHandler->applyStylesheet(xmlOutput);
0386   if(str.isEmpty()) {
0387     // might be an API error, and message is in JSON
0388     QJsonDocument doc = QJsonDocument::fromJson(xmlOutput.toUtf8());
0389     if(!doc.isNull() && doc.object().contains(QStringLiteral("error"))) {
0390       const QString error = doc.object().value(QStringLiteral("error")).toString();
0391       message(error, MessageHandler::Error);
0392       myLog() << "EntrezFetcher -" << error;
0393     }
0394     return Data::EntryPtr();
0395   }
0396   Import::TellicoImporter imp(str);
0397   Data::CollPtr coll = imp.collection();
0398   if(!coll) {
0399     myWarning() << "invalid collection";
0400     return Data::EntryPtr();
0401   }
0402   if(coll->entryCount() == 0) {
0403     myDebug() << "no entries in collection";
0404     return Data::EntryPtr();
0405   } else if(coll->entryCount() > 1) {
0406     myDebug() << "collection has multiple entries, taking first one";
0407   }
0408 
0409   Data::EntryPtr e = coll->entries().front();
0410 
0411   // try to get a link, but only if necessary
0412   if(optionalFields().contains(QStringLiteral("url"))) {
0413     QUrl link(QString::fromLatin1(ENTREZ_BASE_URL));
0414     link.setPath(link.path() + QLatin1String(ENTREZ_LINK_CGI));
0415     QUrlQuery q;
0416     q.addQueryItem(QStringLiteral("tool"),   QStringLiteral("Tellico"));
0417     q.addQueryItem(QStringLiteral("cmd"),    QStringLiteral("llinks"));
0418     q.addQueryItem(QStringLiteral("db"),     m_dbname);
0419     q.addQueryItem(QStringLiteral("dbfrom"), m_dbname);
0420     q.addQueryItem(QStringLiteral("id"),     QString::number(id));
0421     if(!m_apiKey.isEmpty()) {
0422       q.addQueryItem(QStringLiteral("api_key"), m_apiKey);
0423     }
0424     link.setQuery(q);
0425 
0426     markTime();
0427     QDomDocument linkDom = FileHandler::readXMLDocument(link, false /* namespace */, true /* quiet */);
0428     // need eLinkResult/LinkSet/IdUrlList/IdUrlSet/ObjUrl/Url
0429     QDomNode linkNode = linkDom.namedItem(QStringLiteral("eLinkResult"))
0430                                .namedItem(QStringLiteral("LinkSet"))
0431                                .namedItem(QStringLiteral("IdUrlList"))
0432                                .namedItem(QStringLiteral("IdUrlSet"))
0433                                .namedItem(QStringLiteral("ObjUrl"))
0434                                .namedItem(QStringLiteral("Url"));
0435     if(!linkNode.isNull()) {
0436       QString u = linkNode.toElement().text();
0437 //      myDebug() << u;
0438       if(!u.isEmpty()) {
0439         if(!coll->hasField(QStringLiteral("url"))) {
0440           Data::FieldPtr field(new Data::Field(QStringLiteral("url"), i18n("URL"), Data::Field::URL));
0441           field->setCategory(i18n("Miscellaneous"));
0442           coll->addField(field);
0443         }
0444         e->setField(QStringLiteral("url"), u);
0445       }
0446     }
0447   }
0448 
0449   m_entries.insert(uid_, e);
0450   return e;
0451 }
0452 
0453 void EntrezFetcher::initXSLTHandler() {
0454   QString xsltfile = DataFileRegistry::self()->locate(QStringLiteral("pubmed2tellico.xsl"));
0455   if(xsltfile.isEmpty()) {
0456     myWarning() << "can not locate pubmed2tellico.xsl.";
0457     return;
0458   }
0459 
0460   QUrl u = QUrl::fromLocalFile(xsltfile);
0461 
0462   if(!m_xsltHandler) {
0463     m_xsltHandler = new XSLTHandler(u);
0464   }
0465   if(!m_xsltHandler->isValid()) {
0466     myWarning() << "error in pubmed2tellico.xsl.";
0467     delete m_xsltHandler;
0468     m_xsltHandler = nullptr;
0469     return;
0470   }
0471 }
0472 
0473 // without an API key, limit is 3 searches per second
0474 // with a key, limit is 10
0475 // https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
0476 void EntrezFetcher::markTime() {
0477   // not exactly the way to monitor rate over 3 or 10 calls, just a constant rate
0478   const int wait = m_apiKey.isEmpty() ? 350 : 110;
0479   while(m_idleTime.elapsed() < wait) {
0480     QThread::msleep(100);
0481   }
0482   m_idleTime.restart();
0483 }
0484 
0485 Tellico::Fetch::FetchRequest EntrezFetcher::updateRequest(Data::EntryPtr entry_) {
0486   QString s = entry_->field(QStringLiteral("pmid"));
0487   if(!s.isEmpty()) {
0488     return FetchRequest(PubmedID, s);
0489   }
0490 
0491   s = entry_->field(QStringLiteral("doi"));
0492   if(!s.isEmpty()) {
0493     return FetchRequest(DOI, s);
0494   }
0495 
0496   s = entry_->field(QStringLiteral("title"));
0497   if(!s.isEmpty()) {
0498     return FetchRequest(Title, s);
0499   }
0500   return FetchRequest();
0501 }
0502 
0503 QString EntrezFetcher::defaultName() {
0504   return i18n("Entrez Database");
0505 }
0506 
0507 QString EntrezFetcher::defaultIcon() {
0508   return favIcon("http://www.ncbi.nlm.nih.gov");
0509 }
0510 
0511 //static
0512 Tellico::StringHash EntrezFetcher::allOptionalFields() {
0513   StringHash hash;
0514   hash[QStringLiteral("institution")] = i18n("Institution");
0515   hash[QStringLiteral("abstract")]    = i18n("Abstract");
0516   hash[QStringLiteral("url")]         = i18n("URL");
0517   return hash;
0518 }
0519 
0520 Tellico::Fetch::ConfigWidget* EntrezFetcher::configWidget(QWidget* parent_) const {
0521   return new EntrezFetcher::ConfigWidget(parent_, this);
0522 }
0523 
0524 EntrezFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const EntrezFetcher* fetcher_/*=0*/)
0525     : Fetch::ConfigWidget(parent_) {
0526   QGridLayout* l = new QGridLayout(optionsWidget());
0527   l->setSpacing(4);
0528   l->setColumnStretch(1, 10);
0529 
0530   int row = -1;
0531 
0532   QLabel* label = new QLabel(i18n("Access key: "), optionsWidget());
0533   l->addWidget(label, ++row, 0);
0534 
0535   m_apiKeyEdit = new QLineEdit(optionsWidget());
0536   connect(m_apiKeyEdit, &QLineEdit::textChanged, this, &ConfigWidget::slotSetModified);
0537   l->addWidget(m_apiKeyEdit, row, 1);
0538   QString w = i18n("The default Tellico key may be used, but searching may fail due to reaching access limits.");
0539   label->setWhatsThis(w);
0540   m_apiKeyEdit->setWhatsThis(w);
0541   label->setBuddy(m_apiKeyEdit);
0542 
0543   l->setRowStretch(++row, 10);
0544 
0545   // now add additional fields widget
0546   addFieldsWidget(EntrezFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList());
0547 
0548   if(fetcher_) {
0549     m_apiKeyEdit->setText(fetcher_->m_apiKey);
0550   }
0551 }
0552 
0553 void EntrezFetcher::ConfigWidget::saveConfigHook(KConfigGroup& config_) {
0554   QString apiKey = m_apiKeyEdit->text().trimmed();
0555   if(!apiKey.isEmpty()) {
0556     config_.writeEntry("API Key", apiKey);
0557   }
0558 }
0559 
0560 QString EntrezFetcher::ConfigWidget::preferredName() const {
0561   return EntrezFetcher::defaultName();
0562 }