File indexing completed on 2024-05-19 16:18:42
0001 /*************************************************************************** 0002 Copyright (C) 2005-2020 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "entrezfetcher.h" 0026 #include "../utils/guiproxy.h" 0027 #include "../collection.h" 0028 #include "../entry.h" 0029 #include "../fieldformat.h" 0030 #include "../core/filehandler.h" 0031 #include "../translators/xslthandler.h" 0032 #include "../translators/tellicoimporter.h" 0033 #include "../utils/datafileregistry.h" 0034 #include "../tellico_debug.h" 0035 0036 #include <KLocalizedString> 0037 #include <KIO/Job> 0038 #include <KIO/JobUiDelegate> 0039 #include <KConfigGroup> 0040 #include <KJobWidgets/KJobWidgets> 0041 0042 #include <QDomDocument> 0043 #include <QLabel> 0044 #include <QFile> 0045 #include <QTextStream> 0046 #include <QGridLayout> 0047 #include <QLineEdit> 0048 #include <QUrlQuery> 0049 #include <QThread> 0050 #include <QJsonDocument> 0051 #include <QJsonObject> 0052 0053 namespace { 0054 static const int ENTREZ_MAX_RETURNS_TOTAL = 25; 0055 static const char* ENTREZ_BASE_URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; 0056 static const char* ENTREZ_SEARCH_CGI = "esearch.fcgi"; 0057 static const char* ENTREZ_SUMMARY_CGI = "esummary.fcgi"; 0058 static const char* ENTREZ_FETCH_CGI = "efetch.fcgi"; 0059 static const char* ENTREZ_LINK_CGI = "elink.fcgi"; 0060 static const char* ENTREZ_DEFAULT_DATABASE = "pubmed"; 0061 } 0062 0063 using namespace Tellico; 0064 using namespace Tellico::Fetch; 0065 using Tellico::Fetch::EntrezFetcher; 0066 0067 EntrezFetcher::EntrezFetcher(QObject* parent_) : Fetcher(parent_), m_xsltHandler(nullptr), 0068 m_start(1), m_total(-1), m_step(Begin), m_started(false) { 0069 m_idleTime.start(); 0070 } 0071 0072 EntrezFetcher::~EntrezFetcher() { 0073 } 0074 0075 QString EntrezFetcher::source() const { 0076 return m_name.isEmpty() ? defaultName() : m_name; 0077 } 0078 0079 bool EntrezFetcher::canSearch(Fetch::FetchKey k) const { 0080 return k == Title || k == Person || k == Keyword || k == Raw || k == PubmedID || k == DOI; 0081 } 0082 0083 bool EntrezFetcher::canFetch(int type) const { 0084 return type == Data::Collection::Bibtex; 0085 } 0086 0087 void EntrezFetcher::readConfigHook(const KConfigGroup& config_) { 0088 QString s = config_.readEntry("Database", ENTREZ_DEFAULT_DATABASE); // default to pubmed 0089 if(!s.isEmpty()) { 0090 m_dbname = s; 0091 } 0092 QString k = config_.readEntry("API Key"); 0093 if(!k.isEmpty()) { 0094 m_apiKey = k; 0095 } 0096 } 0097 0098 void EntrezFetcher::search() { 0099 m_started = true; 0100 m_start = 1; 0101 m_total = -1; 0102 0103 if(m_dbname.isEmpty()) { 0104 m_dbname = QLatin1String(ENTREZ_DEFAULT_DATABASE); 0105 } 0106 0107 QUrl u(QString::fromLatin1(ENTREZ_BASE_URL)); 0108 u.setPath(u.path() + QLatin1String(ENTREZ_SEARCH_CGI)); 0109 QUrlQuery q; 0110 q.addQueryItem(QStringLiteral("tool"), QStringLiteral("Tellico")); 0111 q.addQueryItem(QStringLiteral("retmode"), QStringLiteral("xml")); 0112 q.addQueryItem(QStringLiteral("usehistory"), QStringLiteral("y")); 0113 q.addQueryItem(QStringLiteral("retmax"), QStringLiteral("1")); // we're just getting the count 0114 q.addQueryItem(QStringLiteral("db"), m_dbname); 0115 q.addQueryItem(QStringLiteral("term"), request().value()); 0116 switch(request().key()) { 0117 case Title: 0118 q.addQueryItem(QStringLiteral("field"), QStringLiteral("titl")); 0119 break; 0120 0121 case Person: 0122 q.addQueryItem(QStringLiteral("field"), QStringLiteral("auth")); 0123 break; 0124 0125 case Keyword: 0126 // for Tellico Keyword searches basically mean search for any field matching 0127 // q.addQueryItem(QLatin1String("field"), QLatin1String("word")); 0128 break; 0129 0130 case PubmedID: 0131 q.addQueryItem(QStringLiteral("field"), QStringLiteral("pmid")); 0132 break; 0133 0134 case DOI: 0135 case Raw: 0136 // for DOI, enough to match any field to DOI value 0137 //q.setQuery(u.query() + QLatin1Char('&') + request().value()); 0138 break; 0139 0140 default: 0141 myWarning() << "key not supported:" << request().key(); 0142 stop(); 0143 return; 0144 } 0145 if(!m_apiKey.isEmpty()) { 0146 q.addQueryItem(QStringLiteral("api_key"), m_apiKey); 0147 } 0148 u.setQuery(q); 0149 0150 m_step = Search; 0151 // myLog() << "search url: " << u.url(); 0152 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0153 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0154 connect(m_job.data(), &KJob::result, 0155 this, &EntrezFetcher::slotComplete); 0156 markTime(); 0157 } 0158 0159 void EntrezFetcher::continueSearch() { 0160 m_started = true; 0161 doSummary(); 0162 } 0163 0164 void EntrezFetcher::stop() { 0165 if(!m_started) { 0166 return; 0167 } 0168 if(m_job) { 0169 m_job->kill(); 0170 m_job = nullptr; 0171 } 0172 m_started = false; 0173 m_step = Begin; 0174 emit signalDone(this); 0175 } 0176 0177 void EntrezFetcher::slotComplete(KJob*) { 0178 Q_ASSERT(m_job); 0179 if(m_job->error()) { 0180 m_job->uiDelegate()->showErrorMessage(); 0181 stop(); 0182 return; 0183 } 0184 0185 QByteArray data = m_job->data(); 0186 if(data.isEmpty()) { 0187 myDebug() << "no data"; 0188 stop(); 0189 return; 0190 } 0191 // see bug 319662. If fetcher is cancelled, job is killed 0192 // if the pointer is retained, it gets double-deleted 0193 m_job = nullptr; 0194 0195 #if 0 0196 myWarning() << "Remove debug from entrezfetcher.cpp: " << __LINE__; 0197 QFile f(QLatin1String("/tmp/test.xml")); 0198 if(f.open(QIODevice::WriteOnly)) { 0199 QTextStream t(&f); 0200 t.setCodec("UTF-8"); 0201 t << data; 0202 } 0203 f.close(); 0204 #endif 0205 0206 switch(m_step) { 0207 case Search: 0208 searchResults(data); 0209 break; 0210 case Summary: 0211 summaryResults(data); 0212 break; 0213 case Begin: 0214 case Fetch: 0215 default: 0216 myLog() << "wrong step =" << m_step; 0217 stop(); 0218 break; 0219 } 0220 } 0221 0222 void EntrezFetcher::searchResults(const QByteArray& data_) { 0223 QDomDocument dom; 0224 if(!dom.setContent(data_, false)) { 0225 myWarning() << "server did not return valid XML."; 0226 stop(); 0227 return; 0228 } 0229 // find Count, QueryKey, and WebEnv elements 0230 int count = 0; 0231 for(QDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) { 0232 QDomElement e = n.toElement(); 0233 if(e.isNull()) { 0234 continue; 0235 } 0236 if(e.tagName() == QLatin1String("Count")) { 0237 m_total = e.text().toInt(); 0238 ++count; 0239 } else if(e.tagName() == QLatin1String("QueryKey")) { 0240 m_queryKey = e.text(); 0241 ++count; 0242 } else if(e.tagName() == QLatin1String("WebEnv")) { 0243 m_webEnv = e.text(); 0244 ++count; 0245 } 0246 if(count >= 3) { 0247 break; // found them all 0248 } 0249 } 0250 0251 doSummary(); 0252 } 0253 0254 void EntrezFetcher::doSummary() { 0255 QUrl u(QString::fromLatin1(ENTREZ_BASE_URL)); 0256 u.setPath(u.path() + QLatin1String(ENTREZ_SUMMARY_CGI)); 0257 QUrlQuery q; 0258 q.addQueryItem(QStringLiteral("tool"), QStringLiteral("Tellico")); 0259 q.addQueryItem(QStringLiteral("retmode"), QStringLiteral("xml")); 0260 if(m_start > 1) { 0261 q.addQueryItem(QStringLiteral("retstart"), QString::number(m_start)); 0262 } 0263 q.addQueryItem(QStringLiteral("retmax"), QString::number(qMin(m_total-m_start-1, ENTREZ_MAX_RETURNS_TOTAL))); 0264 q.addQueryItem(QStringLiteral("usehistory"), QStringLiteral("y")); 0265 q.addQueryItem(QStringLiteral("db"), m_dbname); 0266 q.addQueryItem(QStringLiteral("query_key"), m_queryKey); 0267 q.addQueryItem(QStringLiteral("WebEnv"), m_webEnv); 0268 if(!m_apiKey.isEmpty()) { 0269 q.addQueryItem(QStringLiteral("api_key"), m_apiKey); 0270 } 0271 u.setQuery(q); 0272 0273 m_step = Summary; 0274 // myLog() << "summary url:" << u.url(); 0275 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0276 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0277 connect(m_job.data(), &KJob::result, 0278 this, &EntrezFetcher::slotComplete); 0279 markTime(); 0280 } 0281 0282 void EntrezFetcher::summaryResults(const QByteArray& data_) { 0283 QDomDocument dom; 0284 if(!dom.setContent(data_, false)) { 0285 myWarning() << "server did not return valid XML."; 0286 stop(); 0287 return; 0288 } 0289 // top child is eSummaryResult 0290 // all children are DocSum 0291 for(QDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) { 0292 QDomElement e = n.toElement(); 0293 if(e.isNull() || e.tagName() != QLatin1String("DocSum")) { 0294 continue; 0295 } 0296 QDomNodeList nodes = e.elementsByTagName(QStringLiteral("Id")); 0297 if(nodes.count() == 0) { 0298 myDebug() << "no Id elements"; 0299 continue; 0300 } 0301 int id = nodes.item(0).toElement().text().toInt(); 0302 QString title, pubdate, authors; 0303 nodes = e.elementsByTagName(QStringLiteral("Item")); 0304 for(int j = 0; j < nodes.count(); ++j) { 0305 if(nodes.item(j).toElement().attribute(QStringLiteral("Name")) == QLatin1String("Title")) { 0306 title = nodes.item(j).toElement().text(); 0307 } else if(nodes.item(j).toElement().attribute(QStringLiteral("Name")) == QLatin1String("PubDate")) { 0308 pubdate = nodes.item(j).toElement().text(); 0309 } else if(nodes.item(j).toElement().attribute(QStringLiteral("Name")) == QLatin1String("AuthorList")) { 0310 QStringList list; 0311 for(QDomNode aNode = nodes.item(j).firstChild(); !aNode.isNull(); aNode = aNode.nextSibling()) { 0312 // lazy, assume all children Items are authors 0313 if(aNode.nodeName() == QLatin1String("Item")) { 0314 list << aNode.toElement().text(); 0315 } 0316 } 0317 authors = list.join(FieldFormat::delimiterString()); 0318 } 0319 if(!title.isEmpty() && !pubdate.isEmpty() && !authors.isEmpty()) { 0320 break; // done now 0321 } 0322 } 0323 FetchResult* r = new FetchResult(this, title, pubdate + QLatin1Char('/') + authors); 0324 m_matches.insert(r->uid, id); 0325 emit signalResultFound(r); 0326 } 0327 m_start = m_matches.count() + 1; 0328 m_hasMoreResults = m_start <= m_total; 0329 stop(); // done searching 0330 } 0331 0332 Tellico::Data::EntryPtr EntrezFetcher::fetchEntryHook(uint uid_) { 0333 // if we already grabbed this one, then just pull it out of the dict 0334 Data::EntryPtr entry = m_entries[uid_]; 0335 if(entry) { 0336 return entry; 0337 } 0338 0339 if(!m_matches.contains(uid_)) { 0340 return Data::EntryPtr(); 0341 } 0342 0343 if(!m_xsltHandler) { 0344 initXSLTHandler(); 0345 if(!m_xsltHandler) { // probably an error somewhere in the stylesheet loading 0346 stop(); 0347 return Data::EntryPtr(); 0348 } 0349 } 0350 0351 int id = m_matches[uid_]; 0352 0353 QUrl u(QString::fromLatin1(ENTREZ_BASE_URL)); 0354 u.setPath(u.path() + QLatin1String(ENTREZ_FETCH_CGI)); 0355 QUrlQuery q; 0356 q.addQueryItem(QStringLiteral("tool"), QStringLiteral("Tellico")); 0357 q.addQueryItem(QStringLiteral("retmode"), QStringLiteral("xml")); 0358 q.addQueryItem(QStringLiteral("rettype"), QStringLiteral("abstract")); 0359 q.addQueryItem(QStringLiteral("db"), m_dbname); 0360 q.addQueryItem(QStringLiteral("id"), QString::number(id)); 0361 if(!m_apiKey.isEmpty()) { 0362 q.addQueryItem(QStringLiteral("api_key"), m_apiKey); 0363 } 0364 u.setQuery(q); 0365 0366 // now it's synchronous 0367 // myDebug() << "id url:" << u.url(); 0368 markTime(); 0369 QString xmlOutput = FileHandler::readXMLFile(u, true /*quiet*/); 0370 if(xmlOutput.isEmpty()) { 0371 myWarning() << "unable to download " << u; 0372 return Data::EntryPtr(); 0373 } 0374 #if 0 0375 myWarning() << "turn me off in entrezfetcher.cpp!"; 0376 QFile f1(QLatin1String("/tmp/test-entry.xml")); 0377 if(f1.open(QIODevice::WriteOnly)) { 0378 QTextStream t(&f1); 0379 t.setCodec("UTF-8"); 0380 t << xmlOutput; 0381 } 0382 f1.close(); 0383 #endif 0384 QString str = m_xsltHandler->applyStylesheet(xmlOutput); 0385 if(str.isEmpty()) { 0386 // might be an API error, and message is in JSON 0387 QJsonDocument doc = QJsonDocument::fromJson(xmlOutput.toUtf8()); 0388 if(!doc.isNull() && doc.object().contains(QStringLiteral("error"))) { 0389 const QString error = doc.object().value(QStringLiteral("error")).toString(); 0390 message(error, MessageHandler::Error); 0391 myLog() << "EntrezFetcher -" << error; 0392 } 0393 return Data::EntryPtr(); 0394 } 0395 Import::TellicoImporter imp(str); 0396 Data::CollPtr coll = imp.collection(); 0397 if(!coll) { 0398 myWarning() << "invalid collection"; 0399 return Data::EntryPtr(); 0400 } 0401 if(coll->entryCount() == 0) { 0402 myDebug() << "no entries in collection"; 0403 return Data::EntryPtr(); 0404 } else if(coll->entryCount() > 1) { 0405 myDebug() << "collection has multiple entries, taking first one"; 0406 } 0407 0408 Data::EntryPtr e = coll->entries().front(); 0409 0410 // try to get a link, but only if necessary 0411 if(optionalFields().contains(QStringLiteral("url"))) { 0412 QUrl link(QString::fromLatin1(ENTREZ_BASE_URL)); 0413 link.setPath(link.path() + QLatin1String(ENTREZ_LINK_CGI)); 0414 QUrlQuery q; 0415 q.addQueryItem(QStringLiteral("tool"), QStringLiteral("Tellico")); 0416 q.addQueryItem(QStringLiteral("cmd"), QStringLiteral("llinks")); 0417 q.addQueryItem(QStringLiteral("db"), m_dbname); 0418 q.addQueryItem(QStringLiteral("dbfrom"), m_dbname); 0419 q.addQueryItem(QStringLiteral("id"), QString::number(id)); 0420 if(!m_apiKey.isEmpty()) { 0421 q.addQueryItem(QStringLiteral("api_key"), m_apiKey); 0422 } 0423 link.setQuery(q); 0424 0425 markTime(); 0426 QDomDocument linkDom = FileHandler::readXMLDocument(link, false /* namespace */, true /* quiet */); 0427 // need eLinkResult/LinkSet/IdUrlList/IdUrlSet/ObjUrl/Url 0428 QDomNode linkNode = linkDom.namedItem(QStringLiteral("eLinkResult")) 0429 .namedItem(QStringLiteral("LinkSet")) 0430 .namedItem(QStringLiteral("IdUrlList")) 0431 .namedItem(QStringLiteral("IdUrlSet")) 0432 .namedItem(QStringLiteral("ObjUrl")) 0433 .namedItem(QStringLiteral("Url")); 0434 if(!linkNode.isNull()) { 0435 QString u = linkNode.toElement().text(); 0436 // myDebug() << u; 0437 if(!u.isEmpty()) { 0438 if(!coll->hasField(QStringLiteral("url"))) { 0439 Data::FieldPtr field(new Data::Field(QStringLiteral("url"), i18n("URL"), Data::Field::URL)); 0440 field->setCategory(i18n("Miscellaneous")); 0441 coll->addField(field); 0442 } 0443 e->setField(QStringLiteral("url"), u); 0444 } 0445 } 0446 } 0447 0448 m_entries.insert(uid_, e); 0449 return e; 0450 } 0451 0452 void EntrezFetcher::initXSLTHandler() { 0453 QString xsltfile = DataFileRegistry::self()->locate(QStringLiteral("pubmed2tellico.xsl")); 0454 if(xsltfile.isEmpty()) { 0455 myWarning() << "can not locate pubmed2tellico.xsl."; 0456 return; 0457 } 0458 0459 QUrl u = QUrl::fromLocalFile(xsltfile); 0460 0461 if(!m_xsltHandler) { 0462 m_xsltHandler = new XSLTHandler(u); 0463 } 0464 if(!m_xsltHandler->isValid()) { 0465 myWarning() << "error in pubmed2tellico.xsl."; 0466 delete m_xsltHandler; 0467 m_xsltHandler = nullptr; 0468 return; 0469 } 0470 } 0471 0472 // without an API key, limit is 3 searches per second 0473 // with a key, limit is 10 0474 // https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/ 0475 void EntrezFetcher::markTime() { 0476 // not exactly the way to monitor rate over 3 or 10 calls, just a constant rate 0477 const int wait = m_apiKey.isEmpty() ? 350 : 110; 0478 while(m_idleTime.elapsed() < wait) { 0479 QThread::msleep(100); 0480 } 0481 m_idleTime.restart(); 0482 } 0483 0484 Tellico::Fetch::FetchRequest EntrezFetcher::updateRequest(Data::EntryPtr entry_) { 0485 QString s = entry_->field(QStringLiteral("pmid")); 0486 if(!s.isEmpty()) { 0487 return FetchRequest(PubmedID, s); 0488 } 0489 0490 s = entry_->field(QStringLiteral("doi")); 0491 if(!s.isEmpty()) { 0492 return FetchRequest(DOI, s); 0493 } 0494 0495 s = entry_->field(QStringLiteral("title")); 0496 if(!s.isEmpty()) { 0497 return FetchRequest(Title, s); 0498 } 0499 return FetchRequest(); 0500 } 0501 0502 QString EntrezFetcher::defaultName() { 0503 return i18n("Entrez Database"); 0504 } 0505 0506 QString EntrezFetcher::defaultIcon() { 0507 return favIcon("http://www.ncbi.nlm.nih.gov"); 0508 } 0509 0510 //static 0511 Tellico::StringHash EntrezFetcher::allOptionalFields() { 0512 StringHash hash; 0513 hash[QStringLiteral("institution")] = i18n("Institution"); 0514 hash[QStringLiteral("abstract")] = i18n("Abstract"); 0515 hash[QStringLiteral("url")] = i18n("URL"); 0516 return hash; 0517 } 0518 0519 Tellico::Fetch::ConfigWidget* EntrezFetcher::configWidget(QWidget* parent_) const { 0520 return new EntrezFetcher::ConfigWidget(parent_, this); 0521 } 0522 0523 EntrezFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const EntrezFetcher* fetcher_/*=0*/) 0524 : Fetch::ConfigWidget(parent_) { 0525 QGridLayout* l = new QGridLayout(optionsWidget()); 0526 l->setSpacing(4); 0527 l->setColumnStretch(1, 10); 0528 0529 int row = -1; 0530 0531 QLabel* label = new QLabel(i18n("Access key: "), optionsWidget()); 0532 l->addWidget(label, ++row, 0); 0533 0534 m_apiKeyEdit = new QLineEdit(optionsWidget()); 0535 connect(m_apiKeyEdit, &QLineEdit::textChanged, this, &ConfigWidget::slotSetModified); 0536 l->addWidget(m_apiKeyEdit, row, 1); 0537 QString w = i18n("The default Tellico key may be used, but searching may fail due to reaching access limits."); 0538 label->setWhatsThis(w); 0539 m_apiKeyEdit->setWhatsThis(w); 0540 label->setBuddy(m_apiKeyEdit); 0541 0542 l->setRowStretch(++row, 10); 0543 0544 // now add additional fields widget 0545 addFieldsWidget(EntrezFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList()); 0546 0547 if(fetcher_) { 0548 m_apiKeyEdit->setText(fetcher_->m_apiKey); 0549 } 0550 } 0551 0552 void EntrezFetcher::ConfigWidget::saveConfigHook(KConfigGroup& config_) { 0553 QString apiKey = m_apiKeyEdit->text().trimmed(); 0554 if(!apiKey.isEmpty()) { 0555 config_.writeEntry("API Key", apiKey); 0556 } 0557 } 0558 0559 QString EntrezFetcher::ConfigWidget::preferredName() const { 0560 return EntrezFetcher::defaultName(); 0561 }