File indexing completed on 2024-05-12 05:09:31
0001 /*************************************************************************** 0002 Copyright (C) 2005-2020 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "entrezfetcher.h" 0026 #include "../utils/guiproxy.h" 0027 #include "../collection.h" 0028 #include "../entry.h" 0029 #include "../fieldformat.h" 0030 #include "../core/filehandler.h" 0031 #include "../translators/xslthandler.h" 0032 #include "../translators/tellicoimporter.h" 0033 #include "../utils/datafileregistry.h" 0034 #include "../tellico_debug.h" 0035 0036 #include <KLocalizedString> 0037 #include <KIO/Job> 0038 #include <KIO/JobUiDelegate> 0039 #include <KConfigGroup> 0040 #include <KJobWidgets/KJobWidgets> 0041 0042 #include <QDomDocument> 0043 #include <QLabel> 0044 #include <QFile> 0045 #include <QTextStream> 0046 #include <QGridLayout> 0047 #include <QLineEdit> 0048 #include <QUrlQuery> 0049 #include <QThread> 0050 #include <QJsonDocument> 0051 #include <QJsonObject> 0052 0053 namespace { 0054 static const int ENTREZ_MAX_RETURNS_TOTAL = 25; 0055 static const char* ENTREZ_BASE_URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/"; 0056 static const char* ENTREZ_SEARCH_CGI = "esearch.fcgi"; 0057 static const char* ENTREZ_SUMMARY_CGI = "esummary.fcgi"; 0058 static const char* ENTREZ_FETCH_CGI = "efetch.fcgi"; 0059 static const char* ENTREZ_LINK_CGI = "elink.fcgi"; 0060 static const char* ENTREZ_DEFAULT_DATABASE = "pubmed"; 0061 } 0062 0063 using namespace Tellico; 0064 using namespace Tellico::Fetch; 0065 using Tellico::Fetch::EntrezFetcher; 0066 0067 EntrezFetcher::EntrezFetcher(QObject* parent_) : Fetcher(parent_), m_xsltHandler(nullptr), 0068 m_start(1), m_total(-1), m_step(Step::Begin), m_started(false) { 0069 m_idleTime.start(); 0070 } 0071 0072 EntrezFetcher::~EntrezFetcher() { 0073 } 0074 0075 QString EntrezFetcher::source() const { 0076 return m_name.isEmpty() ? defaultName() : m_name; 0077 } 0078 0079 bool EntrezFetcher::canSearch(Fetch::FetchKey k) const { 0080 return k == Title || k == Person || k == Keyword || k == Raw || k == PubmedID || k == DOI; 0081 } 0082 0083 bool EntrezFetcher::canFetch(int type) const { 0084 return type == Data::Collection::Bibtex; 0085 } 0086 0087 void EntrezFetcher::readConfigHook(const KConfigGroup& config_) { 0088 QString s = config_.readEntry("Database", ENTREZ_DEFAULT_DATABASE); // default to pubmed 0089 if(!s.isEmpty()) { 0090 m_dbname = s; 0091 } 0092 QString k = config_.readEntry("API Key"); 0093 if(!k.isEmpty()) { 0094 m_apiKey = k; 0095 } 0096 } 0097 0098 void EntrezFetcher::search() { 0099 m_started = true; 0100 m_start = 1; 0101 m_total = -1; 0102 0103 if(m_dbname.isEmpty()) { 0104 m_dbname = QLatin1String(ENTREZ_DEFAULT_DATABASE); 0105 } 0106 0107 QUrl u(QString::fromLatin1(ENTREZ_BASE_URL)); 0108 u.setPath(u.path() + QLatin1String(ENTREZ_SEARCH_CGI)); 0109 QUrlQuery q; 0110 q.addQueryItem(QStringLiteral("tool"), QStringLiteral("Tellico")); 0111 q.addQueryItem(QStringLiteral("retmode"), QStringLiteral("xml")); 0112 q.addQueryItem(QStringLiteral("usehistory"), QStringLiteral("y")); 0113 q.addQueryItem(QStringLiteral("retmax"), QStringLiteral("1")); // we're just getting the count 0114 q.addQueryItem(QStringLiteral("db"), m_dbname); 0115 q.addQueryItem(QStringLiteral("term"), request().value()); 0116 switch(request().key()) { 0117 case Title: 0118 q.addQueryItem(QStringLiteral("field"), QStringLiteral("titl")); 0119 break; 0120 0121 case Person: 0122 q.addQueryItem(QStringLiteral("field"), QStringLiteral("auth")); 0123 break; 0124 0125 case Keyword: 0126 // for Tellico Keyword searches basically mean search for any field matching 0127 // q.addQueryItem(QLatin1String("field"), QLatin1String("word")); 0128 break; 0129 0130 case PubmedID: 0131 q.addQueryItem(QStringLiteral("field"), QStringLiteral("pmid")); 0132 break; 0133 0134 case DOI: 0135 case Raw: 0136 // for DOI, enough to match any field to DOI value 0137 //q.setQuery(u.query() + QLatin1Char('&') + request().value()); 0138 break; 0139 0140 default: 0141 myWarning() << source() << "- key not recognized:" << request().key(); 0142 stop(); 0143 return; 0144 } 0145 if(!m_apiKey.isEmpty()) { 0146 q.addQueryItem(QStringLiteral("api_key"), m_apiKey); 0147 } 0148 u.setQuery(q); 0149 0150 m_step = Step::Search; 0151 // myLog() << "search url: " << u.url(); 0152 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0153 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0154 connect(m_job.data(), &KJob::result, 0155 this, &EntrezFetcher::slotComplete); 0156 markTime(); 0157 } 0158 0159 void EntrezFetcher::continueSearch() { 0160 m_started = true; 0161 doSummary(); 0162 } 0163 0164 void EntrezFetcher::stop() { 0165 if(!m_started) { 0166 return; 0167 } 0168 if(m_job) { 0169 m_job->kill(); 0170 m_job = nullptr; 0171 } 0172 m_started = false; 0173 m_step = Step::Begin; 0174 emit signalDone(this); 0175 } 0176 0177 void EntrezFetcher::slotComplete(KJob*) { 0178 Q_ASSERT(m_job); 0179 if(m_job->error()) { 0180 m_job->uiDelegate()->showErrorMessage(); 0181 stop(); 0182 return; 0183 } 0184 0185 QByteArray data = m_job->data(); 0186 if(data.isEmpty()) { 0187 myDebug() << "no data"; 0188 stop(); 0189 return; 0190 } 0191 // see bug 319662. If fetcher is cancelled, job is killed 0192 // if the pointer is retained, it gets double-deleted 0193 m_job = nullptr; 0194 0195 #if 0 0196 myWarning() << "Remove debug from entrezfetcher.cpp: " << __LINE__; 0197 QFile f(QLatin1String("/tmp/test.xml")); 0198 if(f.open(QIODevice::WriteOnly)) { 0199 QTextStream t(&f); 0200 t.setCodec("UTF-8"); 0201 t << data; 0202 } 0203 f.close(); 0204 #endif 0205 0206 switch(m_step) { 0207 case Step::Search: 0208 searchResults(data); 0209 break; 0210 case Step::Summary: 0211 summaryResults(data); 0212 break; 0213 case Step::Begin: 0214 case Step::Fetch: 0215 default: 0216 myLog() << "wrong step =" << int(m_step); 0217 stop(); 0218 break; 0219 } 0220 } 0221 0222 void EntrezFetcher::searchResults(const QByteArray& data_) { 0223 QDomDocument dom; 0224 if(!dom.setContent(data_, false)) { 0225 myWarning() << "server did not return valid XML."; 0226 stop(); 0227 return; 0228 } 0229 // find Count, QueryKey, and WebEnv elements 0230 int count = 0; 0231 for(QDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) { 0232 QDomElement e = n.toElement(); 0233 if(e.isNull()) { 0234 continue; 0235 } 0236 if(e.tagName() == QLatin1String("Count")) { 0237 m_total = e.text().toInt(); 0238 ++count; 0239 } else if(e.tagName() == QLatin1String("QueryKey")) { 0240 m_queryKey = e.text(); 0241 ++count; 0242 } else if(e.tagName() == QLatin1String("WebEnv")) { 0243 m_webEnv = e.text(); 0244 ++count; 0245 } 0246 if(count >= 3) { 0247 break; // found them all 0248 } 0249 } 0250 0251 doSummary(); 0252 } 0253 0254 void EntrezFetcher::doSummary() { 0255 QUrl u(QString::fromLatin1(ENTREZ_BASE_URL)); 0256 u.setPath(u.path() + QLatin1String(ENTREZ_SUMMARY_CGI)); 0257 QUrlQuery q; 0258 q.addQueryItem(QStringLiteral("tool"), QStringLiteral("Tellico")); 0259 q.addQueryItem(QStringLiteral("retmode"), QStringLiteral("xml")); 0260 if(m_start > 1) { 0261 q.addQueryItem(QStringLiteral("retstart"), QString::number(m_start)); 0262 } 0263 q.addQueryItem(QStringLiteral("retmax"), QString::number(qMin(m_total-m_start-1, ENTREZ_MAX_RETURNS_TOTAL))); 0264 q.addQueryItem(QStringLiteral("usehistory"), QStringLiteral("y")); 0265 q.addQueryItem(QStringLiteral("db"), m_dbname); 0266 q.addQueryItem(QStringLiteral("query_key"), m_queryKey); 0267 q.addQueryItem(QStringLiteral("WebEnv"), m_webEnv); 0268 if(!m_apiKey.isEmpty()) { 0269 q.addQueryItem(QStringLiteral("api_key"), m_apiKey); 0270 } 0271 u.setQuery(q); 0272 0273 m_step = Step::Summary; 0274 // myLog() << "summary url:" << u.url(); 0275 m_job = KIO::storedGet(u, KIO::NoReload, KIO::HideProgressInfo); 0276 KJobWidgets::setWindow(m_job, GUI::Proxy::widget()); 0277 connect(m_job.data(), &KJob::result, 0278 this, &EntrezFetcher::slotComplete); 0279 markTime(); 0280 } 0281 0282 void EntrezFetcher::summaryResults(const QByteArray& data_) { 0283 QDomDocument dom; 0284 if(!dom.setContent(data_, false)) { 0285 myWarning() << "server did not return valid XML."; 0286 stop(); 0287 return; 0288 } 0289 // top child is eSummaryResult 0290 // all children are DocSum 0291 for(QDomNode n = dom.documentElement().firstChild(); !n.isNull(); n = n.nextSibling()) { 0292 QDomElement e = n.toElement(); 0293 if(e.isNull() || e.tagName() != QLatin1String("DocSum")) { 0294 continue; 0295 } 0296 QDomNodeList nodes = e.elementsByTagName(QStringLiteral("Id")); 0297 if(nodes.count() == 0) { 0298 myDebug() << "no Id elements"; 0299 continue; 0300 } 0301 int id = nodes.item(0).toElement().text().toInt(); 0302 QString title, pubdate, authors; 0303 nodes = e.elementsByTagName(QStringLiteral("Item")); 0304 for(int j = 0; j < nodes.count(); ++j) { 0305 const auto elem = nodes.item(j).toElement(); 0306 if(elem.attribute(QStringLiteral("Name")) == QLatin1String("Title")) { 0307 title = elem.text(); 0308 } else if(elem.attribute(QStringLiteral("Name")) == QLatin1String("PubDate")) { 0309 pubdate = elem.text(); 0310 } else if(elem.attribute(QStringLiteral("Name")) == QLatin1String("AuthorList")) { 0311 QStringList list; 0312 for(QDomNode aNode = nodes.item(j).firstChild(); !aNode.isNull(); aNode = aNode.nextSibling()) { 0313 // lazy, assume all children Items are authors 0314 if(aNode.nodeName() == QLatin1String("Item")) { 0315 list << aNode.toElement().text(); 0316 } 0317 } 0318 authors = list.join(FieldFormat::delimiterString()); 0319 } 0320 if(!title.isEmpty() && !pubdate.isEmpty() && !authors.isEmpty()) { 0321 break; // done now 0322 } 0323 } 0324 FetchResult* r = new FetchResult(this, title, pubdate + QLatin1Char('/') + authors); 0325 m_matches.insert(r->uid, id); 0326 emit signalResultFound(r); 0327 } 0328 m_start = m_matches.count() + 1; 0329 m_hasMoreResults = m_start <= m_total; 0330 stop(); // done searching 0331 } 0332 0333 Tellico::Data::EntryPtr EntrezFetcher::fetchEntryHook(uint uid_) { 0334 // if we already grabbed this one, then just pull it out of the dict 0335 Data::EntryPtr entry = m_entries[uid_]; 0336 if(entry) { 0337 return entry; 0338 } 0339 0340 if(!m_matches.contains(uid_)) { 0341 return Data::EntryPtr(); 0342 } 0343 0344 if(!m_xsltHandler) { 0345 initXSLTHandler(); 0346 if(!m_xsltHandler) { // probably an error somewhere in the stylesheet loading 0347 stop(); 0348 return Data::EntryPtr(); 0349 } 0350 } 0351 0352 int id = m_matches[uid_]; 0353 0354 QUrl u(QString::fromLatin1(ENTREZ_BASE_URL)); 0355 u.setPath(u.path() + QLatin1String(ENTREZ_FETCH_CGI)); 0356 QUrlQuery q; 0357 q.addQueryItem(QStringLiteral("tool"), QStringLiteral("Tellico")); 0358 q.addQueryItem(QStringLiteral("retmode"), QStringLiteral("xml")); 0359 q.addQueryItem(QStringLiteral("rettype"), QStringLiteral("abstract")); 0360 q.addQueryItem(QStringLiteral("db"), m_dbname); 0361 q.addQueryItem(QStringLiteral("id"), QString::number(id)); 0362 if(!m_apiKey.isEmpty()) { 0363 q.addQueryItem(QStringLiteral("api_key"), m_apiKey); 0364 } 0365 u.setQuery(q); 0366 0367 // now it's synchronous 0368 // myDebug() << "id url:" << u.url(); 0369 markTime(); 0370 QString xmlOutput = FileHandler::readXMLFile(u, true /*quiet*/); 0371 if(xmlOutput.isEmpty()) { 0372 myWarning() << "unable to download " << u; 0373 return Data::EntryPtr(); 0374 } 0375 #if 0 0376 myWarning() << "turn me off in entrezfetcher.cpp!"; 0377 QFile f1(QLatin1String("/tmp/test-entry.xml")); 0378 if(f1.open(QIODevice::WriteOnly)) { 0379 QTextStream t(&f1); 0380 t.setCodec("UTF-8"); 0381 t << xmlOutput; 0382 } 0383 f1.close(); 0384 #endif 0385 QString str = m_xsltHandler->applyStylesheet(xmlOutput); 0386 if(str.isEmpty()) { 0387 // might be an API error, and message is in JSON 0388 QJsonDocument doc = QJsonDocument::fromJson(xmlOutput.toUtf8()); 0389 if(!doc.isNull() && doc.object().contains(QStringLiteral("error"))) { 0390 const QString error = doc.object().value(QStringLiteral("error")).toString(); 0391 message(error, MessageHandler::Error); 0392 myLog() << "EntrezFetcher -" << error; 0393 } 0394 return Data::EntryPtr(); 0395 } 0396 Import::TellicoImporter imp(str); 0397 Data::CollPtr coll = imp.collection(); 0398 if(!coll) { 0399 myWarning() << "invalid collection"; 0400 return Data::EntryPtr(); 0401 } 0402 if(coll->entryCount() == 0) { 0403 myDebug() << "no entries in collection"; 0404 return Data::EntryPtr(); 0405 } else if(coll->entryCount() > 1) { 0406 myDebug() << "collection has multiple entries, taking first one"; 0407 } 0408 0409 Data::EntryPtr e = coll->entries().front(); 0410 0411 // try to get a link, but only if necessary 0412 if(optionalFields().contains(QStringLiteral("url"))) { 0413 QUrl link(QString::fromLatin1(ENTREZ_BASE_URL)); 0414 link.setPath(link.path() + QLatin1String(ENTREZ_LINK_CGI)); 0415 QUrlQuery q; 0416 q.addQueryItem(QStringLiteral("tool"), QStringLiteral("Tellico")); 0417 q.addQueryItem(QStringLiteral("cmd"), QStringLiteral("llinks")); 0418 q.addQueryItem(QStringLiteral("db"), m_dbname); 0419 q.addQueryItem(QStringLiteral("dbfrom"), m_dbname); 0420 q.addQueryItem(QStringLiteral("id"), QString::number(id)); 0421 if(!m_apiKey.isEmpty()) { 0422 q.addQueryItem(QStringLiteral("api_key"), m_apiKey); 0423 } 0424 link.setQuery(q); 0425 0426 markTime(); 0427 QDomDocument linkDom = FileHandler::readXMLDocument(link, false /* namespace */, true /* quiet */); 0428 // need eLinkResult/LinkSet/IdUrlList/IdUrlSet/ObjUrl/Url 0429 QDomNode linkNode = linkDom.namedItem(QStringLiteral("eLinkResult")) 0430 .namedItem(QStringLiteral("LinkSet")) 0431 .namedItem(QStringLiteral("IdUrlList")) 0432 .namedItem(QStringLiteral("IdUrlSet")) 0433 .namedItem(QStringLiteral("ObjUrl")) 0434 .namedItem(QStringLiteral("Url")); 0435 if(!linkNode.isNull()) { 0436 QString u = linkNode.toElement().text(); 0437 // myDebug() << u; 0438 if(!u.isEmpty()) { 0439 if(!coll->hasField(QStringLiteral("url"))) { 0440 Data::FieldPtr field(new Data::Field(QStringLiteral("url"), i18n("URL"), Data::Field::URL)); 0441 field->setCategory(i18n("Miscellaneous")); 0442 coll->addField(field); 0443 } 0444 e->setField(QStringLiteral("url"), u); 0445 } 0446 } 0447 } 0448 0449 m_entries.insert(uid_, e); 0450 return e; 0451 } 0452 0453 void EntrezFetcher::initXSLTHandler() { 0454 QString xsltfile = DataFileRegistry::self()->locate(QStringLiteral("pubmed2tellico.xsl")); 0455 if(xsltfile.isEmpty()) { 0456 myWarning() << "can not locate pubmed2tellico.xsl."; 0457 return; 0458 } 0459 0460 QUrl u = QUrl::fromLocalFile(xsltfile); 0461 0462 if(!m_xsltHandler) { 0463 m_xsltHandler = new XSLTHandler(u); 0464 } 0465 if(!m_xsltHandler->isValid()) { 0466 myWarning() << "error in pubmed2tellico.xsl."; 0467 delete m_xsltHandler; 0468 m_xsltHandler = nullptr; 0469 return; 0470 } 0471 } 0472 0473 // without an API key, limit is 3 searches per second 0474 // with a key, limit is 10 0475 // https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/ 0476 void EntrezFetcher::markTime() { 0477 // not exactly the way to monitor rate over 3 or 10 calls, just a constant rate 0478 const int wait = m_apiKey.isEmpty() ? 350 : 110; 0479 while(m_idleTime.elapsed() < wait) { 0480 QThread::msleep(100); 0481 } 0482 m_idleTime.restart(); 0483 } 0484 0485 Tellico::Fetch::FetchRequest EntrezFetcher::updateRequest(Data::EntryPtr entry_) { 0486 QString s = entry_->field(QStringLiteral("pmid")); 0487 if(!s.isEmpty()) { 0488 return FetchRequest(PubmedID, s); 0489 } 0490 0491 s = entry_->field(QStringLiteral("doi")); 0492 if(!s.isEmpty()) { 0493 return FetchRequest(DOI, s); 0494 } 0495 0496 s = entry_->field(QStringLiteral("title")); 0497 if(!s.isEmpty()) { 0498 return FetchRequest(Title, s); 0499 } 0500 return FetchRequest(); 0501 } 0502 0503 QString EntrezFetcher::defaultName() { 0504 return i18n("Entrez Database"); 0505 } 0506 0507 QString EntrezFetcher::defaultIcon() { 0508 return favIcon("http://www.ncbi.nlm.nih.gov"); 0509 } 0510 0511 //static 0512 Tellico::StringHash EntrezFetcher::allOptionalFields() { 0513 StringHash hash; 0514 hash[QStringLiteral("institution")] = i18n("Institution"); 0515 hash[QStringLiteral("abstract")] = i18n("Abstract"); 0516 hash[QStringLiteral("url")] = i18n("URL"); 0517 return hash; 0518 } 0519 0520 Tellico::Fetch::ConfigWidget* EntrezFetcher::configWidget(QWidget* parent_) const { 0521 return new EntrezFetcher::ConfigWidget(parent_, this); 0522 } 0523 0524 EntrezFetcher::ConfigWidget::ConfigWidget(QWidget* parent_, const EntrezFetcher* fetcher_/*=0*/) 0525 : Fetch::ConfigWidget(parent_) { 0526 QGridLayout* l = new QGridLayout(optionsWidget()); 0527 l->setSpacing(4); 0528 l->setColumnStretch(1, 10); 0529 0530 int row = -1; 0531 0532 QLabel* label = new QLabel(i18n("Access key: "), optionsWidget()); 0533 l->addWidget(label, ++row, 0); 0534 0535 m_apiKeyEdit = new QLineEdit(optionsWidget()); 0536 connect(m_apiKeyEdit, &QLineEdit::textChanged, this, &ConfigWidget::slotSetModified); 0537 l->addWidget(m_apiKeyEdit, row, 1); 0538 QString w = i18n("The default Tellico key may be used, but searching may fail due to reaching access limits."); 0539 label->setWhatsThis(w); 0540 m_apiKeyEdit->setWhatsThis(w); 0541 label->setBuddy(m_apiKeyEdit); 0542 0543 l->setRowStretch(++row, 10); 0544 0545 // now add additional fields widget 0546 addFieldsWidget(EntrezFetcher::allOptionalFields(), fetcher_ ? fetcher_->optionalFields() : QStringList()); 0547 0548 if(fetcher_) { 0549 m_apiKeyEdit->setText(fetcher_->m_apiKey); 0550 } 0551 } 0552 0553 void EntrezFetcher::ConfigWidget::saveConfigHook(KConfigGroup& config_) { 0554 QString apiKey = m_apiKeyEdit->text().trimmed(); 0555 if(!apiKey.isEmpty()) { 0556 config_.writeEntry("API Key", apiKey); 0557 } 0558 } 0559 0560 QString EntrezFetcher::ConfigWidget::preferredName() const { 0561 return EntrezFetcher::defaultName(); 0562 }