File indexing completed on 2024-05-12 05:10:07
0001 /*************************************************************************** 0002 Copyright (C) 2011 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "adsimporter.h" 0026 #include "../collections/bibtexcollection.h" 0027 #include "../entry.h" 0028 #include "../field.h" 0029 #include "../fieldformat.h" 0030 #include "../core/filehandler.h" 0031 #include "../tellico_debug.h" 0032 0033 #include <QRegularExpression> 0034 #include <QTextStream> 0035 0036 using Tellico::Import::ADSImporter; 0037 QHash<QString, QString>* ADSImporter::s_tagMap = nullptr; 0038 0039 // static 0040 void ADSImporter::initTagMap() { 0041 if(!s_tagMap) { 0042 s_tagMap = new QHash<QString, QString>(); 0043 s_tagMap->insert(QStringLiteral("A"), QStringLiteral("author")); 0044 s_tagMap->insert(QStringLiteral("J"), QStringLiteral("journal")); 0045 s_tagMap->insert(QStringLiteral("V"), QStringLiteral("volume")); 0046 s_tagMap->insert(QStringLiteral("D"), QStringLiteral("year")); 0047 s_tagMap->insert(QStringLiteral("T"), QStringLiteral("title")); 0048 s_tagMap->insert(QStringLiteral("K"), QStringLiteral("keyword")); 0049 s_tagMap->insert(QStringLiteral("Y"), QStringLiteral("doi")); 0050 s_tagMap->insert(QStringLiteral("L"), QStringLiteral("pages")); 0051 s_tagMap->insert(QStringLiteral("B"), QStringLiteral("abstract")); 0052 s_tagMap->insert(QStringLiteral("U"), QStringLiteral("url")); 0053 } 0054 } 0055 0056 ADSImporter::ADSImporter(const QList<QUrl>& urls_) : Tellico::Import::Importer(urls_), m_coll(nullptr), m_cancelled(false) { 0057 initTagMap(); 0058 } 0059 0060 ADSImporter::ADSImporter(const QString& text_) : Tellico::Import::Importer(text_), m_coll(nullptr), m_cancelled(false) { 0061 initTagMap(); 0062 } 0063 0064 bool ADSImporter::canImport(int type) const { 0065 return type == Data::Collection::Bibtex; 0066 } 0067 0068 Tellico::Data::CollPtr ADSImporter::collection() { 0069 if(m_coll) { 0070 return m_coll; 0071 } 0072 0073 m_coll = new Data::BibtexCollection(true); 0074 emit signalTotalSteps(this, urls().count() * 100); 0075 0076 if(text().isEmpty()) { 0077 int count = 0; 0078 foreach(const QUrl& url, urls()) { 0079 if(m_cancelled) { 0080 break; 0081 } 0082 readURL(url, count); 0083 ++count; 0084 } 0085 } else { 0086 readText(text(), 0); 0087 } 0088 0089 if(m_cancelled) { 0090 m_coll = Data::CollPtr(); 0091 } 0092 return m_coll; 0093 } 0094 0095 void ADSImporter::readURL(const QUrl& url_, int n) { 0096 QString str = FileHandler::readTextFile(url_); 0097 if(str.isEmpty()) { 0098 return; 0099 } 0100 readText(str, n); 0101 } 0102 0103 void ADSImporter::readText(const QString& text_, int n) { 0104 QString text = text_; 0105 QTextStream t(&text); 0106 0107 const uint length = text.length(); 0108 const uint stepSize = qMax(s_stepSize, length/100); 0109 const bool showProgress = options() & ImportProgress; 0110 0111 bool needToAdd = false; 0112 0113 QString sp, ep; 0114 0115 uint j = 0; 0116 Data::EntryPtr entry(new Data::Entry(m_coll)); 0117 // all ADS entries are journal articles 0118 entry->setField(QStringLiteral("entry-type"), QStringLiteral("article")); 0119 0120 // technically, the spec requires a space immediately after the hyphen 0121 // however, at least one website (Springer) outputs RIS with no space after the final "ER -" 0122 // so just strip the white space later 0123 // also be gracious and allow any amount of space before hyphen 0124 static const QRegularExpression rx(QLatin1String("^\\s*%(\\w)\\s+(.*)$")); 0125 QString currLine, nextLine; 0126 for(currLine = t.readLine(); !m_cancelled && !currLine.isNull(); currLine = nextLine, j += currLine.length()) { 0127 nextLine = t.readLine(); 0128 QRegularExpressionMatch m = rx.match(currLine); 0129 QString tag = m.captured(1); 0130 QString value = m.captured(2).trimmed(); 0131 if(tag.isEmpty()) { 0132 continue; 0133 } 0134 // myDebug() << tag << ":" << value; 0135 // if the next line is not empty and does not match start regexp, append to value 0136 while(!nextLine.isEmpty() && !rx.match(nextLine).hasMatch()) { 0137 value += nextLine.trimmed(); 0138 nextLine = t.readLine(); 0139 } 0140 0141 // every entry begins with "R" 0142 if(tag == QLatin1String("R")) { 0143 if(needToAdd) { 0144 m_coll->addEntries(entry); 0145 } 0146 entry = new Data::Entry(m_coll); 0147 entry->setField(QStringLiteral("entry-type"), QStringLiteral("article")); 0148 continue; 0149 } else if(tag == QLatin1String("P")) { 0150 sp = value; 0151 if(!ep.isEmpty()) { 0152 int startPage = sp.toInt(); 0153 int endPage = ep.toInt(); 0154 if(endPage > 0 && endPage < startPage) { 0155 myWarning() << "Assuming end page is really page count"; 0156 ep = QString::number(startPage + endPage); 0157 } 0158 value = sp + QLatin1Char('-') + ep; 0159 tag = QStringLiteral("L"); 0160 sp.clear(); 0161 ep.clear(); 0162 } else { 0163 // nothing else to do 0164 continue; 0165 } 0166 } else if(tag == QLatin1String("L")) { 0167 ep = value; 0168 if(!sp.isEmpty()) { 0169 int startPage = sp.toInt(); 0170 int endPage = ep.toInt(); 0171 if(endPage > 0 && endPage < startPage) { 0172 myWarning() << "Assuming end page is really page count"; 0173 ep = QString::number(startPage + endPage); 0174 } 0175 value = sp + QLatin1Char('-') + ep; 0176 sp.clear(); 0177 ep.clear(); 0178 } else { 0179 continue; 0180 } 0181 } else if(tag == QLatin1String("D")) { // for now, just grab the year 0182 value = value.section(QLatin1Char('/'), 1, 1); 0183 } else if(tag == QLatin1String("K")) { // split the keywords 0184 value = value.split(QLatin1Char(',')).join(FieldFormat::delimiterString()); 0185 } else if(tag == QLatin1String("Y")) { // clean-up DOI 0186 static const QRegularExpression doiRx(QLatin1String("^\\s*DOI[\\s:]*"), QRegularExpression::CaseInsensitiveOption); 0187 value.remove(doiRx); 0188 value = value.section(QLatin1Char(';'), 0, 0); 0189 } else if(tag == QLatin1String("J")) { // clean-up journal 0190 static const QRegularExpression commaRx(QLatin1String("\\s*,\\s*")); 0191 QStringList tokens = value.split(commaRx); 0192 if(!tokens.isEmpty()) { 0193 value = tokens.first(); 0194 } 0195 } 0196 0197 // the lookup scheme is: 0198 // 1. any field has an RIS property that matches the tag name 0199 // 2. default field mapping tag -> field name 0200 Data::FieldPtr f = fieldByTag(tag); 0201 if(!f) { 0202 continue; 0203 } 0204 needToAdd = true; 0205 0206 // if the field can have multiple values, append current values to new value 0207 if(f->hasFlag(Data::Field::AllowMultiple) && !entry->field(f).isEmpty()) { 0208 value.prepend(entry->field(f) + FieldFormat::delimiterString()); 0209 } 0210 entry->setField(f, value); 0211 0212 if(showProgress && j%stepSize == 0) { 0213 emit signalProgress(this, n*100 + 100*j/length); 0214 } 0215 } 0216 0217 if(needToAdd) { 0218 m_coll->addEntries(entry); 0219 } 0220 } 0221 0222 Tellico::Data::FieldPtr ADSImporter::fieldByTag(const QString& tag_) { 0223 Data::FieldPtr f; 0224 const QString& fieldTag = (*s_tagMap)[tag_]; 0225 if(!fieldTag.isEmpty()) { 0226 f = m_coll->fieldByName(fieldTag); 0227 if(!f) { 0228 myDebug() << "no field found for" << fieldTag; 0229 } 0230 } 0231 0232 // add non-default fields if not already there 0233 if(tag_== QLatin1String("L1")) { 0234 // f = new Data::Field(QLatin1String("pdf"), i18n("PDF"), Data::Field::URL); 0235 // f->setProperty(QLatin1String("ris"), QLatin1String("L1")); 0236 // f->setCategory(i18n("Miscellaneous")); 0237 } 0238 // m_coll->addField(f); 0239 return f; 0240 } 0241 0242 void ADSImporter::slotCancel() { 0243 m_cancelled = true; 0244 }