// File indexing completed on 2024-05-12 05:10:13
0001 /*************************************************************************** 0002 Copyright (C) 2004-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. 
* 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "risimporter.h" 0026 #include "../collections/bibtexcollection.h" 0027 #include "../entry.h" 0028 #include "../field.h" 0029 #include "../fieldformat.h" 0030 #include "../core/filehandler.h" 0031 #include "../utils/isbnvalidator.h" 0032 #include "../tellico_debug.h" 0033 0034 #include <KLocalizedString> 0035 0036 #include <QRegularExpression> 0037 #include <QTextStream> 0038 0039 using Tellico::Import::RISImporter; 0040 QHash<QString, QString>* RISImporter::s_tagMap = nullptr; 0041 QHash<QString, QString>* RISImporter::s_typeMap = nullptr; 0042 0043 // static 0044 void RISImporter::initTagMap() { 0045 if(!s_tagMap) { 0046 s_tagMap = new QHash<QString, QString>(); 0047 // BT is special and is handled separately 0048 s_tagMap->insert(QStringLiteral("TY"), QStringLiteral("entry-type")); 0049 s_tagMap->insert(QStringLiteral("ID"), QStringLiteral("bibtex-key")); 0050 s_tagMap->insert(QStringLiteral("T1"), QStringLiteral("title")); 0051 s_tagMap->insert(QStringLiteral("TI"), QStringLiteral("title")); 0052 s_tagMap->insert(QStringLiteral("T2"), QStringLiteral("booktitle")); 0053 s_tagMap->insert(QStringLiteral("A1"), QStringLiteral("author")); 0054 s_tagMap->insert(QStringLiteral("AU"), QStringLiteral("author")); 0055 s_tagMap->insert(QStringLiteral("ED"), QStringLiteral("editor")); 0056 s_tagMap->insert(QStringLiteral("YR"), QStringLiteral("year")); 0057 s_tagMap->insert(QStringLiteral("PY"), QStringLiteral("year")); 0058 s_tagMap->insert(QStringLiteral("Y1"), QStringLiteral("year")); 0059 s_tagMap->insert(QStringLiteral("N1"), QStringLiteral("note")); 0060 s_tagMap->insert(QStringLiteral("AB"), QStringLiteral("abstract")); // should be note? 
0061 s_tagMap->insert(QStringLiteral("N2"), QStringLiteral("abstract")); 0062 s_tagMap->insert(QStringLiteral("KW"), QStringLiteral("keyword")); 0063 s_tagMap->insert(QStringLiteral("JF"), QStringLiteral("journal")); 0064 s_tagMap->insert(QStringLiteral("JO"), QStringLiteral("journal")); 0065 s_tagMap->insert(QStringLiteral("JA"), QStringLiteral("journal")); 0066 s_tagMap->insert(QStringLiteral("VL"), QStringLiteral("volume")); 0067 s_tagMap->insert(QStringLiteral("IS"), QStringLiteral("number")); 0068 s_tagMap->insert(QStringLiteral("PB"), QStringLiteral("publisher")); 0069 s_tagMap->insert(QStringLiteral("SN"), QStringLiteral("isbn")); 0070 s_tagMap->insert(QStringLiteral("AD"), QStringLiteral("address")); 0071 s_tagMap->insert(QStringLiteral("CY"), QStringLiteral("address")); 0072 s_tagMap->insert(QStringLiteral("UR"), QStringLiteral("url")); 0073 s_tagMap->insert(QStringLiteral("L1"), QStringLiteral("pdf")); 0074 s_tagMap->insert(QStringLiteral("T3"), QStringLiteral("series")); 0075 s_tagMap->insert(QStringLiteral("EP"), QStringLiteral("pages")); 0076 s_tagMap->insert(QStringLiteral("DO"), QStringLiteral("doi")); 0077 } 0078 } 0079 0080 // static 0081 void RISImporter::initTypeMap() { 0082 if(!s_typeMap) { 0083 s_typeMap = new QHash<QString, QString>(); 0084 // leave capitalized, except for bibtex types 0085 s_typeMap->insert(QStringLiteral("ABST"), QStringLiteral("Abstract")); 0086 s_typeMap->insert(QStringLiteral("ADVS"), QStringLiteral("Audiovisual material")); 0087 s_typeMap->insert(QStringLiteral("ART"), QStringLiteral("Art Work")); 0088 s_typeMap->insert(QStringLiteral("BILL"), QStringLiteral("Bill/Resolution")); 0089 s_typeMap->insert(QStringLiteral("BOOK"), QStringLiteral("book")); // bibtex 0090 s_typeMap->insert(QStringLiteral("CASE"), QStringLiteral("Case")); 0091 s_typeMap->insert(QStringLiteral("CHAP"), QStringLiteral("inbook")); // == "inbook" ? 
0092 s_typeMap->insert(QStringLiteral("COMP"), QStringLiteral("Computer program")); 0093 s_typeMap->insert(QStringLiteral("CONF"), QStringLiteral("inproceedings")); // == "conference" ? 0094 s_typeMap->insert(QStringLiteral("CTLG"), QStringLiteral("Catalog")); 0095 s_typeMap->insert(QStringLiteral("DATA"), QStringLiteral("Data file")); 0096 s_typeMap->insert(QStringLiteral("ELEC"), QStringLiteral("Electronic Citation")); 0097 s_typeMap->insert(QStringLiteral("GEN"), QStringLiteral("Generic")); 0098 s_typeMap->insert(QStringLiteral("HEAR"), QStringLiteral("Hearing")); 0099 s_typeMap->insert(QStringLiteral("ICOMM"), QStringLiteral("Internet Communication")); 0100 s_typeMap->insert(QStringLiteral("INPR"), QStringLiteral("In Press")); 0101 s_typeMap->insert(QStringLiteral("JFULL"), QStringLiteral("Journal (full)")); // = "periodical" ? 0102 s_typeMap->insert(QStringLiteral("JOUR"), QStringLiteral("article")); // "Journal" 0103 s_typeMap->insert(QStringLiteral("MAP"), QStringLiteral("Map")); 0104 s_typeMap->insert(QStringLiteral("MGZN"), QStringLiteral("article")); // bibtex 0105 s_typeMap->insert(QStringLiteral("MPCT"), QStringLiteral("Motion picture")); 0106 s_typeMap->insert(QStringLiteral("MUSIC"), QStringLiteral("Music score")); 0107 s_typeMap->insert(QStringLiteral("NEWS"), QStringLiteral("Newspaper")); 0108 s_typeMap->insert(QStringLiteral("PAMP"), QStringLiteral("Pamphlet")); // = "booklet" ? 0109 s_typeMap->insert(QStringLiteral("PAT"), QStringLiteral("Patent")); 0110 s_typeMap->insert(QStringLiteral("PCOMM"), QStringLiteral("Personal communication")); 0111 s_typeMap->insert(QStringLiteral("RPRT"), QStringLiteral("Report")); // = "techreport" ? 
0112 s_typeMap->insert(QStringLiteral("SER"), QStringLiteral("Serial (BookMonograph)")); 0113 s_typeMap->insert(QStringLiteral("SLIDE"), QStringLiteral("Slide")); 0114 s_typeMap->insert(QStringLiteral("SOUND"), QStringLiteral("Sound recording")); 0115 s_typeMap->insert(QStringLiteral("STAT"), QStringLiteral("Statute")); 0116 s_typeMap->insert(QStringLiteral("THES"), QStringLiteral("phdthesis")); // "mastersthesis" ? 0117 s_typeMap->insert(QStringLiteral("UNBILL"), QStringLiteral("Unenacted bill/resolution")); 0118 s_typeMap->insert(QStringLiteral("UNPB"), QStringLiteral("unpublished")); // bibtex 0119 s_typeMap->insert(QStringLiteral("VIDEO"), QStringLiteral("Video recording")); 0120 } 0121 } 0122 0123 RISImporter::RISImporter(const QList<QUrl>& urls_) : Tellico::Import::Importer(urls_), m_coll(nullptr), m_cancelled(false) { 0124 initTagMap(); 0125 initTypeMap(); 0126 } 0127 0128 RISImporter::RISImporter(const QString& text_) : Tellico::Import::Importer(text_), m_coll(nullptr), m_cancelled(false) { 0129 initTagMap(); 0130 initTypeMap(); 0131 } 0132 0133 bool RISImporter::canImport(int type) const { 0134 return type == Data::Collection::Bibtex; 0135 } 0136 0137 Tellico::Data::CollPtr RISImporter::collection() { 0138 if(m_coll) { 0139 return m_coll; 0140 } 0141 0142 m_coll = new Data::BibtexCollection(true); 0143 0144 QHash<QString, Data::FieldPtr> risFields; 0145 0146 // need to know if any extended properties in current collection point to RIS 0147 // if so, add to collection 0148 Data::CollPtr currColl = currentCollection(); 0149 if(currColl) { 0150 foreach(Data::FieldPtr field, currColl->fields()) { 0151 // continue if property is empty 0152 QString ris = field->property(QStringLiteral("ris")); 0153 if(ris.isEmpty()) { 0154 continue; 0155 } 0156 // if current collection has one with the same name, set the property 0157 Data::FieldPtr f = m_coll->fieldByName(field->name()); 0158 if(!f) { 0159 f = new Data::Field(*field); 0160 m_coll->addField(f); 0161 } 0162 
f->setProperty(QStringLiteral("ris"), ris); 0163 risFields.insert(ris, f); 0164 } 0165 } 0166 emit signalTotalSteps(this, urls().count() * 100); 0167 0168 if(text().isEmpty()) { 0169 int count = 0; 0170 foreach(const QUrl& url, urls()) { 0171 if(m_cancelled) { 0172 break; 0173 } 0174 readURL(url, count, risFields); 0175 ++count; 0176 } 0177 } else { 0178 readText(text(), 0, risFields); 0179 } 0180 0181 if(m_cancelled) { 0182 m_coll = Data::CollPtr(); 0183 } 0184 return m_coll; 0185 } 0186 0187 void RISImporter::readURL(const QUrl& url_, int n, const QHash<QString, Tellico::Data::FieldPtr>& risFields_) { 0188 QString str = FileHandler::readTextFile(url_); 0189 if(str.isEmpty()) { 0190 return; 0191 } 0192 readText(str, n, risFields_); 0193 } 0194 0195 void RISImporter::readText(const QString& text_, int n, const QHash<QString, Tellico::Data::FieldPtr>& risFields_) { 0196 ISBNValidator isbnval(this); 0197 0198 QString text = text_; 0199 QTextStream t(&text); 0200 0201 const uint length = text.length(); 0202 const uint stepSize = qMax(s_stepSize, length/100); 0203 const bool showProgress = options() & ImportProgress; 0204 0205 bool needToAddFinal = false; 0206 0207 QString sp, ep; 0208 0209 uint j = 0; 0210 Data::EntryPtr entry(new Data::Entry(m_coll)); 0211 // technically, the spec requires a space immediately after the hyphen 0212 // however, at least one website (Springer) outputs RIS with no space after the final "ER -" 0213 // so just strip the white space later 0214 // also be gracious and allow any amount of space before hyphen 0215 static const QRegularExpression rx(QLatin1String("^(\\w\\w)\\s+-(.*)$")); 0216 QString currLine, nextLine; 0217 for(currLine = t.readLine(); !m_cancelled && !t.atEnd(); currLine = nextLine, j += currLine.length()) { 0218 nextLine = t.readLine(); 0219 QRegularExpressionMatch m = rx.match(currLine); 0220 QString tag = m.captured(1); 0221 QString value = m.captured(2).trimmed(); 0222 if(tag.isEmpty()) { 0223 continue; 0224 } 0225 // 
myDebug() << tag << ": " << value; 0226 // if the next line is not empty and does not match start regexp, append to value 0227 while(!nextLine.isEmpty() && !rx.match(nextLine).hasMatch()) { 0228 value += nextLine.trimmed(); 0229 nextLine = t.readLine(); 0230 } 0231 0232 // every entry ends with "ER" 0233 if(tag == QLatin1String("ER")) { 0234 m_coll->addEntries(entry); 0235 entry = new Data::Entry(m_coll); 0236 needToAddFinal = false; 0237 continue; 0238 } else if(tag == QLatin1String("TY") && s_typeMap->contains(value)) { 0239 // for entry-type, switch it to normalized type name 0240 value = (*s_typeMap)[value]; 0241 } else if(tag == QLatin1String("SN")) { 0242 // test for valid isbn, sometimes the issn gets stuck here 0243 int pos = 0; 0244 if(isbnval.validate(value, pos) != ISBNValidator::Acceptable) { 0245 continue; 0246 } 0247 } else if(tag == QLatin1String("SP")) { 0248 sp = value; 0249 if(!ep.isEmpty()) { 0250 int startPage = sp.toInt(); 0251 int endPage = ep.toInt(); 0252 if(endPage > 0 && endPage < startPage) { 0253 myWarning() << "Assuming end page is really page count"; 0254 ep = QString::number(startPage + endPage); 0255 } 0256 value = sp + QLatin1Char('-') + ep; 0257 tag = QStringLiteral("EP"); 0258 sp.clear(); 0259 ep.clear(); 0260 } else { 0261 // nothing else to do 0262 continue; 0263 } 0264 } else if(tag == QLatin1String("EP")) { 0265 ep = value; 0266 if(!sp.isEmpty()) { 0267 int startPage = sp.toInt(); 0268 int endPage = ep.toInt(); 0269 if(endPage > 0 && endPage < startPage) { 0270 myWarning() << "Assuming end page is really page count"; 0271 ep = QString::number(startPage + endPage); 0272 } 0273 value = sp + QLatin1Char('-') + ep; 0274 sp.clear(); 0275 ep.clear(); 0276 } else { 0277 continue; 0278 } 0279 } else if(s_tagMap->value(tag) == QLatin1String("year")) { // for now, just grab the year 0280 value = value.section(QLatin1Char('/'), 0, 0); 0281 } 0282 0283 // the lookup scheme is: 0284 // 1. 
any field has an RIS property that matches the tag name 0285 // 2. default field mapping tag -> field name 0286 Data::FieldPtr f = risFields_[tag]; 0287 if(!f) { 0288 // special case for BT 0289 // primary title for books, secondary for everything else 0290 if(tag == QLatin1String("BT")) { 0291 if(entry->field(QStringLiteral("entry-type")) == QLatin1String("book")) { 0292 f = m_coll->fieldByName(QStringLiteral("title")); 0293 } else { 0294 f = m_coll->fieldByName(QStringLiteral("booktitle")); 0295 } 0296 } else { 0297 f = fieldByTag(tag); 0298 } 0299 } 0300 if(!f) { 0301 continue; 0302 } 0303 needToAddFinal = true; 0304 0305 // harmless for non-choice fields 0306 // for entry-type, want it in lower case 0307 f->addAllowed(value); 0308 // if the field can have multiple values, append current values to new value 0309 if(f->hasFlag(Data::Field::AllowMultiple) && !entry->field(f).isEmpty()) { 0310 value.prepend(entry->field(f) + FieldFormat::delimiterString()); 0311 } 0312 entry->setField(f, value); 0313 0314 if(showProgress && j%stepSize == 0) { 0315 emit signalProgress(this, n*100 + 100*j/length); 0316 } 0317 } 0318 0319 if(needToAddFinal) { 0320 m_coll->addEntries(entry); 0321 } 0322 } 0323 0324 Tellico::Data::FieldPtr RISImporter::fieldByTag(const QString& tag_) { 0325 Data::FieldPtr f; 0326 const QString& fieldTag = (*s_tagMap)[tag_]; 0327 if(!fieldTag.isEmpty()) { 0328 f = m_coll->fieldByName(fieldTag); 0329 if(f) { 0330 f->setProperty(QStringLiteral("ris"), tag_); 0331 return f; 0332 } 0333 } 0334 0335 // add non-default fields if not already there 0336 if(tag_== QLatin1String("L1")) { 0337 f = new Data::Field(QStringLiteral("pdf"), i18n("PDF"), Data::Field::URL); 0338 f->setProperty(QStringLiteral("ris"), QStringLiteral("L1")); 0339 f->setCategory(i18n("Miscellaneous")); 0340 } 0341 m_coll->addField(f); 0342 return f; 0343 } 0344 0345 void RISImporter::slotCancel() { 0346 m_cancelled = true; 0347 } 0348 0349 bool RISImporter::maybeRIS(const QUrl& url_) { 0350 
QString text = FileHandler::readTextFile(url_, true /*quiet*/); 0351 if(text.isEmpty()) { 0352 return false; 0353 } 0354 0355 // bare bones check, strip white space at beginning 0356 // and then first text line must be valid RIS 0357 QTextStream t(&text); 0358 0359 static const QRegularExpression rx(QLatin1String("^(\\w\\w)\\s+-(.*)$")); 0360 QString currLine; 0361 for(currLine = t.readLine(); !t.atEnd(); currLine = t.readLine()) { 0362 if(currLine.trimmed().isEmpty()) { 0363 continue; 0364 } 0365 break; 0366 } 0367 return rx.match(currLine).hasMatch(); 0368 }