File indexing completed on 2024-05-12 05:10:13

0001 /***************************************************************************
0002     Copyright (C) 2004-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "risimporter.h"
0026 #include "../collections/bibtexcollection.h"
0027 #include "../entry.h"
0028 #include "../field.h"
0029 #include "../fieldformat.h"
0030 #include "../core/filehandler.h"
0031 #include "../utils/isbnvalidator.h"
0032 #include "../tellico_debug.h"
0033 
0034 #include <KLocalizedString>
0035 
0036 #include <QRegularExpression>
0037 #include <QTextStream>
0038 
0039 using Tellico::Import::RISImporter;
// Shared lookup tables, created on demand by initTagMap() and initTypeMap().
// s_tagMap:  RIS tag (e.g. "AU") -> name of the collection field it fills
// s_typeMap: RIS reference type (value of the TY tag) -> entry-type value
// Nothing in this file deletes them once allocated.
QHash<QString, QString>* RISImporter::s_tagMap = nullptr;
QHash<QString, QString>* RISImporter::s_typeMap = nullptr;
0042 
0043 // static
0044 void RISImporter::initTagMap() {
0045   if(!s_tagMap) {
0046     s_tagMap = new QHash<QString, QString>();
0047     // BT is special and is handled separately
0048     s_tagMap->insert(QStringLiteral("TY"), QStringLiteral("entry-type"));
0049     s_tagMap->insert(QStringLiteral("ID"), QStringLiteral("bibtex-key"));
0050     s_tagMap->insert(QStringLiteral("T1"), QStringLiteral("title"));
0051     s_tagMap->insert(QStringLiteral("TI"), QStringLiteral("title"));
0052     s_tagMap->insert(QStringLiteral("T2"), QStringLiteral("booktitle"));
0053     s_tagMap->insert(QStringLiteral("A1"), QStringLiteral("author"));
0054     s_tagMap->insert(QStringLiteral("AU"), QStringLiteral("author"));
0055     s_tagMap->insert(QStringLiteral("ED"), QStringLiteral("editor"));
0056     s_tagMap->insert(QStringLiteral("YR"), QStringLiteral("year"));
0057     s_tagMap->insert(QStringLiteral("PY"), QStringLiteral("year"));
0058     s_tagMap->insert(QStringLiteral("Y1"), QStringLiteral("year"));
0059     s_tagMap->insert(QStringLiteral("N1"), QStringLiteral("note"));
0060     s_tagMap->insert(QStringLiteral("AB"), QStringLiteral("abstract")); // should be note?
0061     s_tagMap->insert(QStringLiteral("N2"), QStringLiteral("abstract"));
0062     s_tagMap->insert(QStringLiteral("KW"), QStringLiteral("keyword"));
0063     s_tagMap->insert(QStringLiteral("JF"), QStringLiteral("journal"));
0064     s_tagMap->insert(QStringLiteral("JO"), QStringLiteral("journal"));
0065     s_tagMap->insert(QStringLiteral("JA"), QStringLiteral("journal"));
0066     s_tagMap->insert(QStringLiteral("VL"), QStringLiteral("volume"));
0067     s_tagMap->insert(QStringLiteral("IS"), QStringLiteral("number"));
0068     s_tagMap->insert(QStringLiteral("PB"), QStringLiteral("publisher"));
0069     s_tagMap->insert(QStringLiteral("SN"), QStringLiteral("isbn"));
0070     s_tagMap->insert(QStringLiteral("AD"), QStringLiteral("address"));
0071     s_tagMap->insert(QStringLiteral("CY"), QStringLiteral("address"));
0072     s_tagMap->insert(QStringLiteral("UR"), QStringLiteral("url"));
0073     s_tagMap->insert(QStringLiteral("L1"), QStringLiteral("pdf"));
0074     s_tagMap->insert(QStringLiteral("T3"), QStringLiteral("series"));
0075     s_tagMap->insert(QStringLiteral("EP"), QStringLiteral("pages"));
0076     s_tagMap->insert(QStringLiteral("DO"), QStringLiteral("doi"));
0077   }
0078 }
0079 
0080 // static
0081 void RISImporter::initTypeMap() {
0082   if(!s_typeMap) {
0083     s_typeMap = new QHash<QString, QString>();
0084     // leave capitalized, except for bibtex types
0085     s_typeMap->insert(QStringLiteral("ABST"),   QStringLiteral("Abstract"));
0086     s_typeMap->insert(QStringLiteral("ADVS"),   QStringLiteral("Audiovisual material"));
0087     s_typeMap->insert(QStringLiteral("ART"),    QStringLiteral("Art Work"));
0088     s_typeMap->insert(QStringLiteral("BILL"),   QStringLiteral("Bill/Resolution"));
0089     s_typeMap->insert(QStringLiteral("BOOK"),   QStringLiteral("book")); // bibtex
0090     s_typeMap->insert(QStringLiteral("CASE"),   QStringLiteral("Case"));
0091     s_typeMap->insert(QStringLiteral("CHAP"),   QStringLiteral("inbook")); // == "inbook" ?
0092     s_typeMap->insert(QStringLiteral("COMP"),   QStringLiteral("Computer program"));
0093     s_typeMap->insert(QStringLiteral("CONF"),   QStringLiteral("inproceedings")); // == "conference" ?
0094     s_typeMap->insert(QStringLiteral("CTLG"),   QStringLiteral("Catalog"));
0095     s_typeMap->insert(QStringLiteral("DATA"),   QStringLiteral("Data file"));
0096     s_typeMap->insert(QStringLiteral("ELEC"),   QStringLiteral("Electronic Citation"));
0097     s_typeMap->insert(QStringLiteral("GEN"),    QStringLiteral("Generic"));
0098     s_typeMap->insert(QStringLiteral("HEAR"),   QStringLiteral("Hearing"));
0099     s_typeMap->insert(QStringLiteral("ICOMM"),  QStringLiteral("Internet Communication"));
0100     s_typeMap->insert(QStringLiteral("INPR"),   QStringLiteral("In Press"));
0101     s_typeMap->insert(QStringLiteral("JFULL"),  QStringLiteral("Journal (full)")); // = "periodical" ?
0102     s_typeMap->insert(QStringLiteral("JOUR"),   QStringLiteral("article")); // "Journal"
0103     s_typeMap->insert(QStringLiteral("MAP"),    QStringLiteral("Map"));
0104     s_typeMap->insert(QStringLiteral("MGZN"),   QStringLiteral("article")); // bibtex
0105     s_typeMap->insert(QStringLiteral("MPCT"),   QStringLiteral("Motion picture"));
0106     s_typeMap->insert(QStringLiteral("MUSIC"),  QStringLiteral("Music score"));
0107     s_typeMap->insert(QStringLiteral("NEWS"),   QStringLiteral("Newspaper"));
0108     s_typeMap->insert(QStringLiteral("PAMP"),   QStringLiteral("Pamphlet")); // = "booklet" ?
0109     s_typeMap->insert(QStringLiteral("PAT"),    QStringLiteral("Patent"));
0110     s_typeMap->insert(QStringLiteral("PCOMM"),  QStringLiteral("Personal communication"));
0111     s_typeMap->insert(QStringLiteral("RPRT"),   QStringLiteral("Report")); // = "techreport" ?
0112     s_typeMap->insert(QStringLiteral("SER"),    QStringLiteral("Serial (BookMonograph)"));
0113     s_typeMap->insert(QStringLiteral("SLIDE"),  QStringLiteral("Slide"));
0114     s_typeMap->insert(QStringLiteral("SOUND"),  QStringLiteral("Sound recording"));
0115     s_typeMap->insert(QStringLiteral("STAT"),   QStringLiteral("Statute"));
0116     s_typeMap->insert(QStringLiteral("THES"),   QStringLiteral("phdthesis")); // "mastersthesis" ?
0117     s_typeMap->insert(QStringLiteral("UNBILL"), QStringLiteral("Unenacted bill/resolution"));
0118     s_typeMap->insert(QStringLiteral("UNPB"),   QStringLiteral("unpublished")); // bibtex
0119     s_typeMap->insert(QStringLiteral("VIDEO"),  QStringLiteral("Video recording"));
0120   }
0121 }
0122 
0123 RISImporter::RISImporter(const QList<QUrl>& urls_) : Tellico::Import::Importer(urls_), m_coll(nullptr), m_cancelled(false) {
0124   initTagMap();
0125   initTypeMap();
0126 }
0127 
0128 RISImporter::RISImporter(const QString& text_) : Tellico::Import::Importer(text_), m_coll(nullptr), m_cancelled(false) {
0129   initTagMap();
0130   initTypeMap();
0131 }
0132 
0133 bool RISImporter::canImport(int type) const {
0134   return type == Data::Collection::Bibtex;
0135 }
0136 
0137 Tellico::Data::CollPtr RISImporter::collection() {
0138   if(m_coll) {
0139     return m_coll;
0140   }
0141 
0142   m_coll = new Data::BibtexCollection(true);
0143 
0144   QHash<QString, Data::FieldPtr> risFields;
0145 
0146   // need to know if any extended properties in current collection point to RIS
0147   // if so, add to collection
0148   Data::CollPtr currColl = currentCollection();
0149   if(currColl) {
0150     foreach(Data::FieldPtr field, currColl->fields()) {
0151       // continue if property is empty
0152       QString ris = field->property(QStringLiteral("ris"));
0153       if(ris.isEmpty()) {
0154         continue;
0155       }
0156       // if current collection has one with the same name, set the property
0157       Data::FieldPtr f = m_coll->fieldByName(field->name());
0158       if(!f) {
0159         f = new Data::Field(*field);
0160         m_coll->addField(f);
0161       }
0162       f->setProperty(QStringLiteral("ris"), ris);
0163       risFields.insert(ris, f);
0164     }
0165   }
0166   emit signalTotalSteps(this, urls().count() * 100);
0167 
0168   if(text().isEmpty()) {
0169     int count = 0;
0170     foreach(const QUrl& url, urls()) {
0171       if(m_cancelled)  {
0172         break;
0173       }
0174       readURL(url, count, risFields);
0175       ++count;
0176     }
0177   } else {
0178     readText(text(), 0, risFields);
0179   }
0180 
0181   if(m_cancelled) {
0182     m_coll = Data::CollPtr();
0183   }
0184   return m_coll;
0185 }
0186 
0187 void RISImporter::readURL(const QUrl& url_, int n, const QHash<QString, Tellico::Data::FieldPtr>& risFields_) {
0188   QString str = FileHandler::readTextFile(url_);
0189   if(str.isEmpty()) {
0190     return;
0191   }
0192   readText(str, n, risFields_);
0193 }
0194 
0195 void RISImporter::readText(const QString& text_, int n, const QHash<QString, Tellico::Data::FieldPtr>& risFields_) {
0196   ISBNValidator isbnval(this);
0197 
0198   QString text = text_;
0199   QTextStream t(&text);
0200 
0201   const uint length = text.length();
0202   const uint stepSize = qMax(s_stepSize, length/100);
0203   const bool showProgress = options() & ImportProgress;
0204 
0205   bool needToAddFinal = false;
0206 
0207   QString sp, ep;
0208 
0209   uint j = 0;
0210   Data::EntryPtr entry(new Data::Entry(m_coll));
0211   // technically, the spec requires a space immediately after the hyphen
0212   // however, at least one website (Springer) outputs RIS with no space after the final "ER -"
0213   // so just strip the white space later
0214   // also be gracious and allow any amount of space before hyphen
0215   static const QRegularExpression rx(QLatin1String("^(\\w\\w)\\s+-(.*)$"));
0216   QString currLine, nextLine;
0217   for(currLine = t.readLine(); !m_cancelled && !t.atEnd(); currLine = nextLine, j += currLine.length()) {
0218     nextLine = t.readLine();
0219     QRegularExpressionMatch m = rx.match(currLine);
0220     QString tag = m.captured(1);
0221     QString value = m.captured(2).trimmed();
0222     if(tag.isEmpty()) {
0223       continue;
0224     }
0225 //    myDebug() << tag << ": " << value;
0226     // if the next line is not empty and does not match start regexp, append to value
0227     while(!nextLine.isEmpty() && !rx.match(nextLine).hasMatch()) {
0228       value += nextLine.trimmed();
0229       nextLine = t.readLine();
0230     }
0231 
0232     // every entry ends with "ER"
0233     if(tag == QLatin1String("ER")) {
0234       m_coll->addEntries(entry);
0235       entry = new Data::Entry(m_coll);
0236       needToAddFinal = false;
0237       continue;
0238     } else if(tag == QLatin1String("TY") && s_typeMap->contains(value)) {
0239       // for entry-type, switch it to normalized type name
0240       value = (*s_typeMap)[value];
0241     } else if(tag == QLatin1String("SN")) {
0242       // test for valid isbn, sometimes the issn gets stuck here
0243       int pos = 0;
0244       if(isbnval.validate(value, pos) != ISBNValidator::Acceptable) {
0245         continue;
0246       }
0247     } else if(tag == QLatin1String("SP")) {
0248       sp = value;
0249       if(!ep.isEmpty()) {
0250         int startPage = sp.toInt();
0251         int endPage = ep.toInt();
0252         if(endPage > 0 && endPage < startPage) {
0253           myWarning() << "Assuming end page is really page count";
0254           ep = QString::number(startPage + endPage);
0255         }
0256         value = sp + QLatin1Char('-') + ep;
0257         tag = QStringLiteral("EP");
0258         sp.clear();
0259         ep.clear();
0260       } else {
0261         // nothing else to do
0262         continue;
0263       }
0264     } else if(tag == QLatin1String("EP")) {
0265       ep = value;
0266       if(!sp.isEmpty()) {
0267         int startPage = sp.toInt();
0268         int endPage = ep.toInt();
0269         if(endPage > 0 && endPage < startPage) {
0270           myWarning() << "Assuming end page is really page count";
0271           ep = QString::number(startPage + endPage);
0272         }
0273         value = sp + QLatin1Char('-') + ep;
0274         sp.clear();
0275         ep.clear();
0276       } else {
0277         continue;
0278       }
0279     } else if(s_tagMap->value(tag) == QLatin1String("year")) {  // for now, just grab the year
0280       value = value.section(QLatin1Char('/'), 0, 0);
0281     }
0282 
0283     // the lookup scheme is:
0284     // 1. any field has an RIS property that matches the tag name
0285     // 2. default field mapping tag -> field name
0286     Data::FieldPtr f = risFields_[tag];
0287     if(!f) {
0288       // special case for BT
0289       // primary title for books, secondary for everything else
0290       if(tag == QLatin1String("BT")) {
0291         if(entry->field(QStringLiteral("entry-type")) == QLatin1String("book")) {
0292           f = m_coll->fieldByName(QStringLiteral("title"));
0293         } else {
0294           f = m_coll->fieldByName(QStringLiteral("booktitle"));
0295         }
0296       } else {
0297         f = fieldByTag(tag);
0298       }
0299     }
0300     if(!f) {
0301       continue;
0302     }
0303     needToAddFinal = true;
0304 
0305     // harmless for non-choice fields
0306     // for entry-type, want it in lower case
0307     f->addAllowed(value);
0308     // if the field can have multiple values, append current values to new value
0309     if(f->hasFlag(Data::Field::AllowMultiple) && !entry->field(f).isEmpty()) {
0310       value.prepend(entry->field(f) + FieldFormat::delimiterString());
0311     }
0312     entry->setField(f, value);
0313 
0314     if(showProgress && j%stepSize == 0) {
0315       emit signalProgress(this, n*100 + 100*j/length);
0316     }
0317   }
0318 
0319   if(needToAddFinal) {
0320     m_coll->addEntries(entry);
0321   }
0322 }
0323 
0324 Tellico::Data::FieldPtr RISImporter::fieldByTag(const QString& tag_) {
0325   Data::FieldPtr f;
0326   const QString& fieldTag = (*s_tagMap)[tag_];
0327   if(!fieldTag.isEmpty()) {
0328     f = m_coll->fieldByName(fieldTag);
0329     if(f) {
0330       f->setProperty(QStringLiteral("ris"), tag_);
0331       return f;
0332     }
0333   }
0334 
0335   // add non-default fields if not already there
0336   if(tag_== QLatin1String("L1")) {
0337     f = new Data::Field(QStringLiteral("pdf"), i18n("PDF"), Data::Field::URL);
0338     f->setProperty(QStringLiteral("ris"), QStringLiteral("L1"));
0339     f->setCategory(i18n("Miscellaneous"));
0340   }
0341   m_coll->addField(f);
0342   return f;
0343 }
0344 
// Requests that the import stop. collection() checks m_cancelled before each
// URL and readText() checks it on every loop iteration; a cancelled
// collection() call returns a null collection.
void RISImporter::slotCancel() {
  m_cancelled = true;
}
0348 
0349 bool RISImporter::maybeRIS(const QUrl& url_) {
0350   QString text = FileHandler::readTextFile(url_, true /*quiet*/);
0351   if(text.isEmpty()) {
0352     return false;
0353   }
0354 
0355   // bare bones check, strip white space at beginning
0356   // and then first text line must be valid RIS
0357   QTextStream t(&text);
0358 
0359   static const QRegularExpression rx(QLatin1String("^(\\w\\w)\\s+-(.*)$"));
0360   QString currLine;
0361   for(currLine = t.readLine(); !t.atEnd(); currLine = t.readLine()) {
0362     if(currLine.trimmed().isEmpty()) {
0363       continue;
0364     }
0365     break;
0366   }
0367   return rx.match(currLine).hasMatch();
0368 }