// File indexing completed on 2024-05-12 16:46:33
0001 /*************************************************************************** 0002 Copyright (C) 2004-2009 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. 
* 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "risimporter.h" 0026 #include "../collections/bibtexcollection.h" 0027 #include "../entry.h" 0028 #include "../field.h" 0029 #include "../fieldformat.h" 0030 #include "../core/filehandler.h" 0031 #include "../utils/isbnvalidator.h" 0032 #include "../tellico_debug.h" 0033 0034 #include <KLocalizedString> 0035 0036 #include <QRegularExpression> 0037 #include <QTextStream> 0038 0039 using Tellico::Import::RISImporter; 0040 QHash<QString, QString>* RISImporter::s_tagMap = nullptr; 0041 QHash<QString, QString>* RISImporter::s_typeMap = nullptr; 0042 0043 // static 0044 void RISImporter::initTagMap() { 0045 if(!s_tagMap) { 0046 s_tagMap = new QHash<QString, QString>(); 0047 // BT is special and is handled separately 0048 s_tagMap->insert(QStringLiteral("TY"), QStringLiteral("entry-type")); 0049 s_tagMap->insert(QStringLiteral("ID"), QStringLiteral("bibtex-key")); 0050 s_tagMap->insert(QStringLiteral("T1"), QStringLiteral("title")); 0051 s_tagMap->insert(QStringLiteral("TI"), QStringLiteral("title")); 0052 s_tagMap->insert(QStringLiteral("T2"), QStringLiteral("booktitle")); 0053 s_tagMap->insert(QStringLiteral("A1"), QStringLiteral("author")); 0054 s_tagMap->insert(QStringLiteral("AU"), QStringLiteral("author")); 0055 s_tagMap->insert(QStringLiteral("ED"), QStringLiteral("editor")); 0056 s_tagMap->insert(QStringLiteral("YR"), QStringLiteral("year")); 0057 s_tagMap->insert(QStringLiteral("PY"), QStringLiteral("year")); 0058 s_tagMap->insert(QStringLiteral("N1"), QStringLiteral("note")); 0059 s_tagMap->insert(QStringLiteral("AB"), QStringLiteral("abstract")); // should be note? 
0060 s_tagMap->insert(QStringLiteral("N2"), QStringLiteral("abstract")); 0061 s_tagMap->insert(QStringLiteral("KW"), QStringLiteral("keyword")); 0062 s_tagMap->insert(QStringLiteral("JF"), QStringLiteral("journal")); 0063 s_tagMap->insert(QStringLiteral("JO"), QStringLiteral("journal")); 0064 s_tagMap->insert(QStringLiteral("JA"), QStringLiteral("journal")); 0065 s_tagMap->insert(QStringLiteral("VL"), QStringLiteral("volume")); 0066 s_tagMap->insert(QStringLiteral("IS"), QStringLiteral("number")); 0067 s_tagMap->insert(QStringLiteral("PB"), QStringLiteral("publisher")); 0068 s_tagMap->insert(QStringLiteral("SN"), QStringLiteral("isbn")); 0069 s_tagMap->insert(QStringLiteral("AD"), QStringLiteral("address")); 0070 s_tagMap->insert(QStringLiteral("CY"), QStringLiteral("address")); 0071 s_tagMap->insert(QStringLiteral("UR"), QStringLiteral("url")); 0072 s_tagMap->insert(QStringLiteral("L1"), QStringLiteral("pdf")); 0073 s_tagMap->insert(QStringLiteral("T3"), QStringLiteral("series")); 0074 s_tagMap->insert(QStringLiteral("EP"), QStringLiteral("pages")); 0075 } 0076 } 0077 0078 // static 0079 void RISImporter::initTypeMap() { 0080 if(!s_typeMap) { 0081 s_typeMap = new QHash<QString, QString>(); 0082 // leave capitalized, except for bibtex types 0083 s_typeMap->insert(QStringLiteral("ABST"), QStringLiteral("Abstract")); 0084 s_typeMap->insert(QStringLiteral("ADVS"), QStringLiteral("Audiovisual material")); 0085 s_typeMap->insert(QStringLiteral("ART"), QStringLiteral("Art Work")); 0086 s_typeMap->insert(QStringLiteral("BILL"), QStringLiteral("Bill/Resolution")); 0087 s_typeMap->insert(QStringLiteral("BOOK"), QStringLiteral("book")); // bibtex 0088 s_typeMap->insert(QStringLiteral("CASE"), QStringLiteral("Case")); 0089 s_typeMap->insert(QStringLiteral("CHAP"), QStringLiteral("inbook")); // == "inbook" ? 
0090 s_typeMap->insert(QStringLiteral("COMP"), QStringLiteral("Computer program")); 0091 s_typeMap->insert(QStringLiteral("CONF"), QStringLiteral("inproceedings")); // == "conference" ? 0092 s_typeMap->insert(QStringLiteral("CTLG"), QStringLiteral("Catalog")); 0093 s_typeMap->insert(QStringLiteral("DATA"), QStringLiteral("Data file")); 0094 s_typeMap->insert(QStringLiteral("ELEC"), QStringLiteral("Electronic Citation")); 0095 s_typeMap->insert(QStringLiteral("GEN"), QStringLiteral("Generic")); 0096 s_typeMap->insert(QStringLiteral("HEAR"), QStringLiteral("Hearing")); 0097 s_typeMap->insert(QStringLiteral("ICOMM"), QStringLiteral("Internet Communication")); 0098 s_typeMap->insert(QStringLiteral("INPR"), QStringLiteral("In Press")); 0099 s_typeMap->insert(QStringLiteral("JFULL"), QStringLiteral("Journal (full)")); // = "periodical" ? 0100 s_typeMap->insert(QStringLiteral("JOUR"), QStringLiteral("article")); // "Journal" 0101 s_typeMap->insert(QStringLiteral("MAP"), QStringLiteral("Map")); 0102 s_typeMap->insert(QStringLiteral("MGZN"), QStringLiteral("article")); // bibtex 0103 s_typeMap->insert(QStringLiteral("MPCT"), QStringLiteral("Motion picture")); 0104 s_typeMap->insert(QStringLiteral("MUSIC"), QStringLiteral("Music score")); 0105 s_typeMap->insert(QStringLiteral("NEWS"), QStringLiteral("Newspaper")); 0106 s_typeMap->insert(QStringLiteral("PAMP"), QStringLiteral("Pamphlet")); // = "booklet" ? 0107 s_typeMap->insert(QStringLiteral("PAT"), QStringLiteral("Patent")); 0108 s_typeMap->insert(QStringLiteral("PCOMM"), QStringLiteral("Personal communication")); 0109 s_typeMap->insert(QStringLiteral("RPRT"), QStringLiteral("Report")); // = "techreport" ? 
0110 s_typeMap->insert(QStringLiteral("SER"), QStringLiteral("Serial (BookMonograph)")); 0111 s_typeMap->insert(QStringLiteral("SLIDE"), QStringLiteral("Slide")); 0112 s_typeMap->insert(QStringLiteral("SOUND"), QStringLiteral("Sound recording")); 0113 s_typeMap->insert(QStringLiteral("STAT"), QStringLiteral("Statute")); 0114 s_typeMap->insert(QStringLiteral("THES"), QStringLiteral("phdthesis")); // "mastersthesis" ? 0115 s_typeMap->insert(QStringLiteral("UNBILL"), QStringLiteral("Unenacted bill/resolution")); 0116 s_typeMap->insert(QStringLiteral("UNPB"), QStringLiteral("unpublished")); // bibtex 0117 s_typeMap->insert(QStringLiteral("VIDEO"), QStringLiteral("Video recording")); 0118 } 0119 } 0120 0121 RISImporter::RISImporter(const QList<QUrl>& urls_) : Tellico::Import::Importer(urls_), m_coll(nullptr), m_cancelled(false) { 0122 initTagMap(); 0123 initTypeMap(); 0124 } 0125 0126 RISImporter::RISImporter(const QString& text_) : Tellico::Import::Importer(text_), m_coll(nullptr), m_cancelled(false) { 0127 initTagMap(); 0128 initTypeMap(); 0129 } 0130 0131 bool RISImporter::canImport(int type) const { 0132 return type == Data::Collection::Bibtex; 0133 } 0134 0135 Tellico::Data::CollPtr RISImporter::collection() { 0136 if(m_coll) { 0137 return m_coll; 0138 } 0139 0140 m_coll = new Data::BibtexCollection(true); 0141 0142 QHash<QString, Data::FieldPtr> risFields; 0143 0144 // need to know if any extended properties in current collection point to RIS 0145 // if so, add to collection 0146 Data::CollPtr currColl = currentCollection(); 0147 if(currColl) { 0148 foreach(Data::FieldPtr field, currColl->fields()) { 0149 // continue if property is empty 0150 QString ris = field->property(QStringLiteral("ris")); 0151 if(ris.isEmpty()) { 0152 continue; 0153 } 0154 // if current collection has one with the same name, set the property 0155 Data::FieldPtr f = m_coll->fieldByName(field->name()); 0156 if(!f) { 0157 f = new Data::Field(*field); 0158 m_coll->addField(f); 0159 } 0160 
f->setProperty(QStringLiteral("ris"), ris); 0161 risFields.insert(ris, f); 0162 } 0163 } 0164 emit signalTotalSteps(this, urls().count() * 100); 0165 0166 if(text().isEmpty()) { 0167 int count = 0; 0168 foreach(const QUrl& url, urls()) { 0169 if(m_cancelled) { 0170 break; 0171 } 0172 readURL(url, count, risFields); 0173 ++count; 0174 } 0175 } else { 0176 readText(text(), 0, risFields); 0177 } 0178 0179 if(m_cancelled) { 0180 m_coll = Data::CollPtr(); 0181 } 0182 return m_coll; 0183 } 0184 0185 void RISImporter::readURL(const QUrl& url_, int n, const QHash<QString, Tellico::Data::FieldPtr>& risFields_) { 0186 QString str = FileHandler::readTextFile(url_); 0187 if(str.isEmpty()) { 0188 return; 0189 } 0190 readText(str, n, risFields_); 0191 } 0192 0193 void RISImporter::readText(const QString& text_, int n, const QHash<QString, Tellico::Data::FieldPtr>& risFields_) { 0194 ISBNValidator isbnval(this); 0195 0196 QString text = text_; 0197 QTextStream t(&text); 0198 0199 const uint length = text.length(); 0200 const uint stepSize = qMax(s_stepSize, length/100); 0201 const bool showProgress = options() & ImportProgress; 0202 0203 bool needToAddFinal = false; 0204 0205 QString sp, ep; 0206 0207 uint j = 0; 0208 Data::EntryPtr entry(new Data::Entry(m_coll)); 0209 // technically, the spec requires a space immediately after the hyphen 0210 // however, at least one website (Springer) outputs RIS with no space after the final "ER -" 0211 // so just strip the white space later 0212 // also be gracious and allow any amount of space before hyphen 0213 const QRegularExpression rx(QLatin1String("^(\\w\\w)\\s+-(.*)$")); 0214 QString currLine, nextLine; 0215 for(currLine = t.readLine(); !m_cancelled && !t.atEnd(); currLine = nextLine, j += currLine.length()) { 0216 nextLine = t.readLine(); 0217 QRegularExpressionMatch m = rx.match(currLine); 0218 QString tag = m.captured(1); 0219 QString value = m.captured(2).trimmed(); 0220 if(tag.isEmpty()) { 0221 continue; 0222 } 0223 // myDebug() 
<< tag << ": " << value; 0224 // if the next line is not empty and does not match start regexp, append to value 0225 while(!nextLine.isEmpty() && !rx.match(nextLine).hasMatch()) { 0226 value += nextLine.trimmed(); 0227 nextLine = t.readLine(); 0228 } 0229 0230 // every entry ends with "ER" 0231 if(tag == QLatin1String("ER")) { 0232 m_coll->addEntries(entry); 0233 entry = new Data::Entry(m_coll); 0234 needToAddFinal = false; 0235 continue; 0236 } else if(tag == QLatin1String("TY") && s_typeMap->contains(value)) { 0237 // for entry-type, switch it to normalized type name 0238 value = (*s_typeMap)[value]; 0239 } else if(tag == QLatin1String("SN")) { 0240 // test for valid isbn, sometimes the issn gets stuck here 0241 int pos = 0; 0242 if(isbnval.validate(value, pos) != ISBNValidator::Acceptable) { 0243 continue; 0244 } 0245 } else if(tag == QLatin1String("SP")) { 0246 sp = value; 0247 if(!ep.isEmpty()) { 0248 int startPage = sp.toInt(); 0249 int endPage = ep.toInt(); 0250 if(endPage > 0 && endPage < startPage) { 0251 myWarning() << "Assuming end page is really page count"; 0252 ep = QString::number(startPage + endPage); 0253 } 0254 value = sp + QLatin1Char('-') + ep; 0255 tag = QStringLiteral("EP"); 0256 sp.clear(); 0257 ep.clear(); 0258 } else { 0259 // nothing else to do 0260 continue; 0261 } 0262 } else if(tag == QLatin1String("EP")) { 0263 ep = value; 0264 if(!sp.isEmpty()) { 0265 int startPage = sp.toInt(); 0266 int endPage = ep.toInt(); 0267 if(endPage > 0 && endPage < startPage) { 0268 myWarning() << "Assuming end page is really page count"; 0269 ep = QString::number(startPage + endPage); 0270 } 0271 value = sp + QLatin1Char('-') + ep; 0272 sp.clear(); 0273 ep.clear(); 0274 } else { 0275 continue; 0276 } 0277 } else if(tag == QLatin1String("YR") || tag == QLatin1String("PY")) { // for now, just grab the year 0278 value = value.section(QLatin1Char('/'), 0, 0); 0279 } 0280 0281 // the lookup scheme is: 0282 // 1. 
any field has an RIS property that matches the tag name 0283 // 2. default field mapping tag -> field name 0284 Data::FieldPtr f = risFields_[tag]; 0285 if(!f) { 0286 // special case for BT 0287 // primary title for books, secondary for everything else 0288 if(tag == QLatin1String("BT")) { 0289 if(entry->field(QStringLiteral("entry-type")) == QLatin1String("book")) { 0290 f = m_coll->fieldByName(QStringLiteral("title")); 0291 } else { 0292 f = m_coll->fieldByName(QStringLiteral("booktitle")); 0293 } 0294 } else { 0295 f = fieldByTag(tag); 0296 } 0297 } 0298 if(!f) { 0299 continue; 0300 } 0301 needToAddFinal = true; 0302 0303 // harmless for non-choice fields 0304 // for entry-type, want it in lower case 0305 f->addAllowed(value); 0306 // if the field can have multiple values, append current values to new value 0307 if(f->hasFlag(Data::Field::AllowMultiple) && !entry->field(f).isEmpty()) { 0308 value.prepend(entry->field(f) + FieldFormat::delimiterString()); 0309 } 0310 entry->setField(f, value); 0311 0312 if(showProgress && j%stepSize == 0) { 0313 emit signalProgress(this, n*100 + 100*j/length); 0314 } 0315 } 0316 0317 if(needToAddFinal) { 0318 m_coll->addEntries(entry); 0319 } 0320 } 0321 0322 Tellico::Data::FieldPtr RISImporter::fieldByTag(const QString& tag_) { 0323 Data::FieldPtr f; 0324 const QString& fieldTag = (*s_tagMap)[tag_]; 0325 if(!fieldTag.isEmpty()) { 0326 f = m_coll->fieldByName(fieldTag); 0327 if(f) { 0328 f->setProperty(QStringLiteral("ris"), tag_); 0329 return f; 0330 } 0331 } 0332 0333 // add non-default fields if not already there 0334 if(tag_== QLatin1String("L1")) { 0335 f = new Data::Field(QStringLiteral("pdf"), i18n("PDF"), Data::Field::URL); 0336 f->setProperty(QStringLiteral("ris"), QStringLiteral("L1")); 0337 f->setCategory(i18n("Miscellaneous")); 0338 } 0339 m_coll->addField(f); 0340 return f; 0341 } 0342 0343 void RISImporter::slotCancel() { 0344 m_cancelled = true; 0345 } 0346 0347 bool RISImporter::maybeRIS(const QUrl& url_) { 0348 
QString text = FileHandler::readTextFile(url_, true /*quiet*/); 0349 if(text.isEmpty()) { 0350 return false; 0351 } 0352 0353 // bare bones check, strip white space at beginning 0354 // and then first text line must be valid RIS 0355 QTextStream t(&text); 0356 0357 QRegularExpression rx(QLatin1String("^(\\w\\w)\\s+-(.*)$")); 0358 QString currLine; 0359 for(currLine = t.readLine(); !t.atEnd(); currLine = t.readLine()) { 0360 if(currLine.trimmed().isEmpty()) { 0361 continue; 0362 } 0363 break; 0364 } 0365 return rx.match(currLine).hasMatch(); 0366 }