File indexing completed on 2024-05-12 05:10:07

0001 /***************************************************************************
0002     Copyright (C) 2011 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "adsimporter.h"
0026 #include "../collections/bibtexcollection.h"
0027 #include "../entry.h"
0028 #include "../field.h"
0029 #include "../fieldformat.h"
0030 #include "../core/filehandler.h"
0031 #include "../tellico_debug.h"
0032 
0033 #include <QRegularExpression>
0034 #include <QTextStream>
0035 
0036 using Tellico::Import::ADSImporter;
0037 QHash<QString, QString>* ADSImporter::s_tagMap = nullptr;
0038 
0039 // static
0040 void ADSImporter::initTagMap() {
0041   if(!s_tagMap) {
0042     s_tagMap = new QHash<QString, QString>();
0043     s_tagMap->insert(QStringLiteral("A"), QStringLiteral("author"));
0044     s_tagMap->insert(QStringLiteral("J"), QStringLiteral("journal"));
0045     s_tagMap->insert(QStringLiteral("V"), QStringLiteral("volume"));
0046     s_tagMap->insert(QStringLiteral("D"), QStringLiteral("year"));
0047     s_tagMap->insert(QStringLiteral("T"), QStringLiteral("title"));
0048     s_tagMap->insert(QStringLiteral("K"), QStringLiteral("keyword"));
0049     s_tagMap->insert(QStringLiteral("Y"), QStringLiteral("doi"));
0050     s_tagMap->insert(QStringLiteral("L"), QStringLiteral("pages"));
0051     s_tagMap->insert(QStringLiteral("B"), QStringLiteral("abstract"));
0052     s_tagMap->insert(QStringLiteral("U"), QStringLiteral("url"));
0053   }
0054 }
0055 
0056 ADSImporter::ADSImporter(const QList<QUrl>& urls_) : Tellico::Import::Importer(urls_), m_coll(nullptr), m_cancelled(false) {
0057   initTagMap();
0058 }
0059 
0060 ADSImporter::ADSImporter(const QString& text_) : Tellico::Import::Importer(text_), m_coll(nullptr), m_cancelled(false) {
0061   initTagMap();
0062 }
0063 
0064 bool ADSImporter::canImport(int type) const {
0065   return type == Data::Collection::Bibtex;
0066 }
0067 
0068 Tellico::Data::CollPtr ADSImporter::collection() {
0069   if(m_coll) {
0070     return m_coll;
0071   }
0072 
0073   m_coll = new Data::BibtexCollection(true);
0074   emit signalTotalSteps(this, urls().count() * 100);
0075 
0076   if(text().isEmpty()) {
0077     int count = 0;
0078     foreach(const QUrl& url, urls()) {
0079       if(m_cancelled)  {
0080         break;
0081       }
0082       readURL(url, count);
0083       ++count;
0084     }
0085   } else {
0086     readText(text(), 0);
0087   }
0088 
0089   if(m_cancelled) {
0090     m_coll = Data::CollPtr();
0091   }
0092   return m_coll;
0093 }
0094 
0095 void ADSImporter::readURL(const QUrl& url_, int n) {
0096   QString str = FileHandler::readTextFile(url_);
0097   if(str.isEmpty()) {
0098     return;
0099   }
0100   readText(str, n);
0101 }
0102 
0103 void ADSImporter::readText(const QString& text_, int n) {
0104   QString text = text_;
0105   QTextStream t(&text);
0106 
0107   const uint length = text.length();
0108   const uint stepSize = qMax(s_stepSize, length/100);
0109   const bool showProgress = options() & ImportProgress;
0110 
0111   bool needToAdd = false;
0112 
0113   QString sp, ep;
0114 
0115   uint j = 0;
0116   Data::EntryPtr entry(new Data::Entry(m_coll));
0117   // all ADS entries are journal articles
0118   entry->setField(QStringLiteral("entry-type"), QStringLiteral("article"));
0119 
0120   // technically, the spec requires a space immediately after the hyphen
0121   // however, at least one website (Springer) outputs RIS with no space after the final "ER -"
0122   // so just strip the white space later
0123   // also be gracious and allow any amount of space before hyphen
0124   static const QRegularExpression rx(QLatin1String("^\\s*%(\\w)\\s+(.*)$"));
0125   QString currLine, nextLine;
0126   for(currLine = t.readLine(); !m_cancelled && !currLine.isNull(); currLine = nextLine, j += currLine.length()) {
0127     nextLine = t.readLine();
0128     QRegularExpressionMatch m = rx.match(currLine);
0129     QString tag = m.captured(1);
0130     QString value = m.captured(2).trimmed();
0131     if(tag.isEmpty()) {
0132       continue;
0133     }
0134 //    myDebug() << tag << ":" << value;
0135     // if the next line is not empty and does not match start regexp, append to value
0136     while(!nextLine.isEmpty() && !rx.match(nextLine).hasMatch()) {
0137       value += nextLine.trimmed();
0138       nextLine = t.readLine();
0139     }
0140 
0141     // every entry begins with "R"
0142     if(tag == QLatin1String("R")) {
0143       if(needToAdd) {
0144         m_coll->addEntries(entry);
0145       }
0146       entry = new Data::Entry(m_coll);
0147       entry->setField(QStringLiteral("entry-type"), QStringLiteral("article"));
0148       continue;
0149     } else if(tag == QLatin1String("P")) {
0150       sp = value;
0151       if(!ep.isEmpty()) {
0152         int startPage = sp.toInt();
0153         int endPage = ep.toInt();
0154         if(endPage > 0 && endPage < startPage) {
0155           myWarning() << "Assuming end page is really page count";
0156           ep = QString::number(startPage + endPage);
0157         }
0158         value = sp + QLatin1Char('-') + ep;
0159         tag = QStringLiteral("L");
0160         sp.clear();
0161         ep.clear();
0162       } else {
0163         // nothing else to do
0164         continue;
0165       }
0166     } else if(tag == QLatin1String("L")) {
0167       ep = value;
0168       if(!sp.isEmpty()) {
0169         int startPage = sp.toInt();
0170         int endPage = ep.toInt();
0171         if(endPage > 0 && endPage < startPage) {
0172           myWarning() << "Assuming end page is really page count";
0173           ep = QString::number(startPage + endPage);
0174         }
0175         value = sp + QLatin1Char('-') + ep;
0176         sp.clear();
0177         ep.clear();
0178       } else {
0179         continue;
0180       }
0181     } else if(tag == QLatin1String("D")) {  // for now, just grab the year
0182       value = value.section(QLatin1Char('/'), 1, 1);
0183     } else if(tag == QLatin1String("K")) {  // split the keywords
0184       value = value.split(QLatin1Char(',')).join(FieldFormat::delimiterString());
0185     } else if(tag == QLatin1String("Y")) {  // clean-up DOI
0186       static const QRegularExpression doiRx(QLatin1String("^\\s*DOI[\\s:]*"), QRegularExpression::CaseInsensitiveOption);
0187       value.remove(doiRx);
0188       value = value.section(QLatin1Char(';'), 0, 0);
0189     } else if(tag == QLatin1String("J")) {  // clean-up journal
0190       static const QRegularExpression commaRx(QLatin1String("\\s*,\\s*"));
0191       QStringList tokens = value.split(commaRx);
0192       if(!tokens.isEmpty()) {
0193         value = tokens.first();
0194       }
0195     }
0196 
0197     // the lookup scheme is:
0198     // 1. any field has an RIS property that matches the tag name
0199     // 2. default field mapping tag -> field name
0200     Data::FieldPtr f = fieldByTag(tag);
0201     if(!f) {
0202       continue;
0203     }
0204     needToAdd = true;
0205 
0206     // if the field can have multiple values, append current values to new value
0207     if(f->hasFlag(Data::Field::AllowMultiple) && !entry->field(f).isEmpty()) {
0208       value.prepend(entry->field(f) + FieldFormat::delimiterString());
0209     }
0210     entry->setField(f, value);
0211 
0212     if(showProgress && j%stepSize == 0) {
0213       emit signalProgress(this, n*100 + 100*j/length);
0214     }
0215   }
0216 
0217   if(needToAdd) {
0218     m_coll->addEntries(entry);
0219   }
0220 }
0221 
0222 Tellico::Data::FieldPtr ADSImporter::fieldByTag(const QString& tag_) {
0223   Data::FieldPtr f;
0224   const QString& fieldTag = (*s_tagMap)[tag_];
0225   if(!fieldTag.isEmpty()) {
0226     f = m_coll->fieldByName(fieldTag);
0227     if(!f) {
0228       myDebug() << "no field found for" << fieldTag;
0229     }
0230   }
0231 
0232   // add non-default fields if not already there
0233   if(tag_== QLatin1String("L1")) {
0234 //    f = new Data::Field(QLatin1String("pdf"), i18n("PDF"), Data::Field::URL);
0235 //    f->setProperty(QLatin1String("ris"), QLatin1String("L1"));
0236 //    f->setCategory(i18n("Miscellaneous"));
0237   }
0238 //  m_coll->addField(f);
0239   return f;
0240 }
0241 
0242 void ADSImporter::slotCancel() {
0243   m_cancelled = true;
0244 }