File indexing completed on 2024-05-12 16:46:33

0001 /***************************************************************************
0002     Copyright (C) 2004-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "risimporter.h"
0026 #include "../collections/bibtexcollection.h"
0027 #include "../entry.h"
0028 #include "../field.h"
0029 #include "../fieldformat.h"
0030 #include "../core/filehandler.h"
0031 #include "../utils/isbnvalidator.h"
0032 #include "../tellico_debug.h"
0033 
0034 #include <KLocalizedString>
0035 
0036 #include <QRegularExpression>
0037 #include <QTextStream>
0038 
0039 using Tellico::Import::RISImporter;
0040 QHash<QString, QString>* RISImporter::s_tagMap = nullptr;
0041 QHash<QString, QString>* RISImporter::s_typeMap = nullptr;
0042 
0043 // static
0044 void RISImporter::initTagMap() {
0045   if(!s_tagMap) {
0046     s_tagMap = new QHash<QString, QString>();
0047     // BT is special and is handled separately
0048     s_tagMap->insert(QStringLiteral("TY"), QStringLiteral("entry-type"));
0049     s_tagMap->insert(QStringLiteral("ID"), QStringLiteral("bibtex-key"));
0050     s_tagMap->insert(QStringLiteral("T1"), QStringLiteral("title"));
0051     s_tagMap->insert(QStringLiteral("TI"), QStringLiteral("title"));
0052     s_tagMap->insert(QStringLiteral("T2"), QStringLiteral("booktitle"));
0053     s_tagMap->insert(QStringLiteral("A1"), QStringLiteral("author"));
0054     s_tagMap->insert(QStringLiteral("AU"), QStringLiteral("author"));
0055     s_tagMap->insert(QStringLiteral("ED"), QStringLiteral("editor"));
0056     s_tagMap->insert(QStringLiteral("YR"), QStringLiteral("year"));
0057     s_tagMap->insert(QStringLiteral("PY"), QStringLiteral("year"));
0058     s_tagMap->insert(QStringLiteral("N1"), QStringLiteral("note"));
0059     s_tagMap->insert(QStringLiteral("AB"), QStringLiteral("abstract")); // should be note?
0060     s_tagMap->insert(QStringLiteral("N2"), QStringLiteral("abstract"));
0061     s_tagMap->insert(QStringLiteral("KW"), QStringLiteral("keyword"));
0062     s_tagMap->insert(QStringLiteral("JF"), QStringLiteral("journal"));
0063     s_tagMap->insert(QStringLiteral("JO"), QStringLiteral("journal"));
0064     s_tagMap->insert(QStringLiteral("JA"), QStringLiteral("journal"));
0065     s_tagMap->insert(QStringLiteral("VL"), QStringLiteral("volume"));
0066     s_tagMap->insert(QStringLiteral("IS"), QStringLiteral("number"));
0067     s_tagMap->insert(QStringLiteral("PB"), QStringLiteral("publisher"));
0068     s_tagMap->insert(QStringLiteral("SN"), QStringLiteral("isbn"));
0069     s_tagMap->insert(QStringLiteral("AD"), QStringLiteral("address"));
0070     s_tagMap->insert(QStringLiteral("CY"), QStringLiteral("address"));
0071     s_tagMap->insert(QStringLiteral("UR"), QStringLiteral("url"));
0072     s_tagMap->insert(QStringLiteral("L1"), QStringLiteral("pdf"));
0073     s_tagMap->insert(QStringLiteral("T3"), QStringLiteral("series"));
0074     s_tagMap->insert(QStringLiteral("EP"), QStringLiteral("pages"));
0075   }
0076 }
0077 
0078 // static
0079 void RISImporter::initTypeMap() {
0080   if(!s_typeMap) {
0081     s_typeMap = new QHash<QString, QString>();
0082     // leave capitalized, except for bibtex types
0083     s_typeMap->insert(QStringLiteral("ABST"),   QStringLiteral("Abstract"));
0084     s_typeMap->insert(QStringLiteral("ADVS"),   QStringLiteral("Audiovisual material"));
0085     s_typeMap->insert(QStringLiteral("ART"),    QStringLiteral("Art Work"));
0086     s_typeMap->insert(QStringLiteral("BILL"),   QStringLiteral("Bill/Resolution"));
0087     s_typeMap->insert(QStringLiteral("BOOK"),   QStringLiteral("book")); // bibtex
0088     s_typeMap->insert(QStringLiteral("CASE"),   QStringLiteral("Case"));
0089     s_typeMap->insert(QStringLiteral("CHAP"),   QStringLiteral("inbook")); // == "inbook" ?
0090     s_typeMap->insert(QStringLiteral("COMP"),   QStringLiteral("Computer program"));
0091     s_typeMap->insert(QStringLiteral("CONF"),   QStringLiteral("inproceedings")); // == "conference" ?
0092     s_typeMap->insert(QStringLiteral("CTLG"),   QStringLiteral("Catalog"));
0093     s_typeMap->insert(QStringLiteral("DATA"),   QStringLiteral("Data file"));
0094     s_typeMap->insert(QStringLiteral("ELEC"),   QStringLiteral("Electronic Citation"));
0095     s_typeMap->insert(QStringLiteral("GEN"),    QStringLiteral("Generic"));
0096     s_typeMap->insert(QStringLiteral("HEAR"),   QStringLiteral("Hearing"));
0097     s_typeMap->insert(QStringLiteral("ICOMM"),  QStringLiteral("Internet Communication"));
0098     s_typeMap->insert(QStringLiteral("INPR"),   QStringLiteral("In Press"));
0099     s_typeMap->insert(QStringLiteral("JFULL"),  QStringLiteral("Journal (full)")); // = "periodical" ?
0100     s_typeMap->insert(QStringLiteral("JOUR"),   QStringLiteral("article")); // "Journal"
0101     s_typeMap->insert(QStringLiteral("MAP"),    QStringLiteral("Map"));
0102     s_typeMap->insert(QStringLiteral("MGZN"),   QStringLiteral("article")); // bibtex
0103     s_typeMap->insert(QStringLiteral("MPCT"),   QStringLiteral("Motion picture"));
0104     s_typeMap->insert(QStringLiteral("MUSIC"),  QStringLiteral("Music score"));
0105     s_typeMap->insert(QStringLiteral("NEWS"),   QStringLiteral("Newspaper"));
0106     s_typeMap->insert(QStringLiteral("PAMP"),   QStringLiteral("Pamphlet")); // = "booklet" ?
0107     s_typeMap->insert(QStringLiteral("PAT"),    QStringLiteral("Patent"));
0108     s_typeMap->insert(QStringLiteral("PCOMM"),  QStringLiteral("Personal communication"));
0109     s_typeMap->insert(QStringLiteral("RPRT"),   QStringLiteral("Report")); // = "techreport" ?
0110     s_typeMap->insert(QStringLiteral("SER"),    QStringLiteral("Serial (BookMonograph)"));
0111     s_typeMap->insert(QStringLiteral("SLIDE"),  QStringLiteral("Slide"));
0112     s_typeMap->insert(QStringLiteral("SOUND"),  QStringLiteral("Sound recording"));
0113     s_typeMap->insert(QStringLiteral("STAT"),   QStringLiteral("Statute"));
0114     s_typeMap->insert(QStringLiteral("THES"),   QStringLiteral("phdthesis")); // "mastersthesis" ?
0115     s_typeMap->insert(QStringLiteral("UNBILL"), QStringLiteral("Unenacted bill/resolution"));
0116     s_typeMap->insert(QStringLiteral("UNPB"),   QStringLiteral("unpublished")); // bibtex
0117     s_typeMap->insert(QStringLiteral("VIDEO"),  QStringLiteral("Video recording"));
0118   }
0119 }
0120 
0121 RISImporter::RISImporter(const QList<QUrl>& urls_) : Tellico::Import::Importer(urls_), m_coll(nullptr), m_cancelled(false) {
0122   initTagMap();
0123   initTypeMap();
0124 }
0125 
0126 RISImporter::RISImporter(const QString& text_) : Tellico::Import::Importer(text_), m_coll(nullptr), m_cancelled(false) {
0127   initTagMap();
0128   initTypeMap();
0129 }
0130 
0131 bool RISImporter::canImport(int type) const {
0132   return type == Data::Collection::Bibtex;
0133 }
0134 
0135 Tellico::Data::CollPtr RISImporter::collection() {
0136   if(m_coll) {
0137     return m_coll;
0138   }
0139 
0140   m_coll = new Data::BibtexCollection(true);
0141 
0142   QHash<QString, Data::FieldPtr> risFields;
0143 
0144   // need to know if any extended properties in current collection point to RIS
0145   // if so, add to collection
0146   Data::CollPtr currColl = currentCollection();
0147   if(currColl) {
0148     foreach(Data::FieldPtr field, currColl->fields()) {
0149       // continue if property is empty
0150       QString ris = field->property(QStringLiteral("ris"));
0151       if(ris.isEmpty()) {
0152         continue;
0153       }
0154       // if current collection has one with the same name, set the property
0155       Data::FieldPtr f = m_coll->fieldByName(field->name());
0156       if(!f) {
0157         f = new Data::Field(*field);
0158         m_coll->addField(f);
0159       }
0160       f->setProperty(QStringLiteral("ris"), ris);
0161       risFields.insert(ris, f);
0162     }
0163   }
0164   emit signalTotalSteps(this, urls().count() * 100);
0165 
0166   if(text().isEmpty()) {
0167     int count = 0;
0168     foreach(const QUrl& url, urls()) {
0169       if(m_cancelled)  {
0170         break;
0171       }
0172       readURL(url, count, risFields);
0173       ++count;
0174     }
0175   } else {
0176     readText(text(), 0, risFields);
0177   }
0178 
0179   if(m_cancelled) {
0180     m_coll = Data::CollPtr();
0181   }
0182   return m_coll;
0183 }
0184 
0185 void RISImporter::readURL(const QUrl& url_, int n, const QHash<QString, Tellico::Data::FieldPtr>& risFields_) {
0186   QString str = FileHandler::readTextFile(url_);
0187   if(str.isEmpty()) {
0188     return;
0189   }
0190   readText(str, n, risFields_);
0191 }
0192 
0193 void RISImporter::readText(const QString& text_, int n, const QHash<QString, Tellico::Data::FieldPtr>& risFields_) {
0194   ISBNValidator isbnval(this);
0195 
0196   QString text = text_;
0197   QTextStream t(&text);
0198 
0199   const uint length = text.length();
0200   const uint stepSize = qMax(s_stepSize, length/100);
0201   const bool showProgress = options() & ImportProgress;
0202 
0203   bool needToAddFinal = false;
0204 
0205   QString sp, ep;
0206 
0207   uint j = 0;
0208   Data::EntryPtr entry(new Data::Entry(m_coll));
0209   // technically, the spec requires a space immediately after the hyphen
0210   // however, at least one website (Springer) outputs RIS with no space after the final "ER -"
0211   // so just strip the white space later
0212   // also be gracious and allow any amount of space before hyphen
0213   const QRegularExpression rx(QLatin1String("^(\\w\\w)\\s+-(.*)$"));
0214   QString currLine, nextLine;
0215   for(currLine = t.readLine(); !m_cancelled && !t.atEnd(); currLine = nextLine, j += currLine.length()) {
0216     nextLine = t.readLine();
0217     QRegularExpressionMatch m = rx.match(currLine);
0218     QString tag = m.captured(1);
0219     QString value = m.captured(2).trimmed();
0220     if(tag.isEmpty()) {
0221       continue;
0222     }
0223 //    myDebug() << tag << ": " << value;
0224     // if the next line is not empty and does not match start regexp, append to value
0225     while(!nextLine.isEmpty() && !rx.match(nextLine).hasMatch()) {
0226       value += nextLine.trimmed();
0227       nextLine = t.readLine();
0228     }
0229 
0230     // every entry ends with "ER"
0231     if(tag == QLatin1String("ER")) {
0232       m_coll->addEntries(entry);
0233       entry = new Data::Entry(m_coll);
0234       needToAddFinal = false;
0235       continue;
0236     } else if(tag == QLatin1String("TY") && s_typeMap->contains(value)) {
0237       // for entry-type, switch it to normalized type name
0238       value = (*s_typeMap)[value];
0239     } else if(tag == QLatin1String("SN")) {
0240       // test for valid isbn, sometimes the issn gets stuck here
0241       int pos = 0;
0242       if(isbnval.validate(value, pos) != ISBNValidator::Acceptable) {
0243         continue;
0244       }
0245     } else if(tag == QLatin1String("SP")) {
0246       sp = value;
0247       if(!ep.isEmpty()) {
0248         int startPage = sp.toInt();
0249         int endPage = ep.toInt();
0250         if(endPage > 0 && endPage < startPage) {
0251           myWarning() << "Assuming end page is really page count";
0252           ep = QString::number(startPage + endPage);
0253         }
0254         value = sp + QLatin1Char('-') + ep;
0255         tag = QStringLiteral("EP");
0256         sp.clear();
0257         ep.clear();
0258       } else {
0259         // nothing else to do
0260         continue;
0261       }
0262     } else if(tag == QLatin1String("EP")) {
0263       ep = value;
0264       if(!sp.isEmpty()) {
0265         int startPage = sp.toInt();
0266         int endPage = ep.toInt();
0267         if(endPage > 0 && endPage < startPage) {
0268           myWarning() << "Assuming end page is really page count";
0269           ep = QString::number(startPage + endPage);
0270         }
0271         value = sp + QLatin1Char('-') + ep;
0272         sp.clear();
0273         ep.clear();
0274       } else {
0275         continue;
0276       }
0277     } else if(tag == QLatin1String("YR") || tag == QLatin1String("PY")) {  // for now, just grab the year
0278       value = value.section(QLatin1Char('/'), 0, 0);
0279     }
0280 
0281     // the lookup scheme is:
0282     // 1. any field has an RIS property that matches the tag name
0283     // 2. default field mapping tag -> field name
0284     Data::FieldPtr f = risFields_[tag];
0285     if(!f) {
0286       // special case for BT
0287       // primary title for books, secondary for everything else
0288       if(tag == QLatin1String("BT")) {
0289         if(entry->field(QStringLiteral("entry-type")) == QLatin1String("book")) {
0290           f = m_coll->fieldByName(QStringLiteral("title"));
0291         } else {
0292           f = m_coll->fieldByName(QStringLiteral("booktitle"));
0293         }
0294       } else {
0295         f = fieldByTag(tag);
0296       }
0297     }
0298     if(!f) {
0299       continue;
0300     }
0301     needToAddFinal = true;
0302 
0303     // harmless for non-choice fields
0304     // for entry-type, want it in lower case
0305     f->addAllowed(value);
0306     // if the field can have multiple values, append current values to new value
0307     if(f->hasFlag(Data::Field::AllowMultiple) && !entry->field(f).isEmpty()) {
0308       value.prepend(entry->field(f) + FieldFormat::delimiterString());
0309     }
0310     entry->setField(f, value);
0311 
0312     if(showProgress && j%stepSize == 0) {
0313       emit signalProgress(this, n*100 + 100*j/length);
0314     }
0315   }
0316 
0317   if(needToAddFinal) {
0318     m_coll->addEntries(entry);
0319   }
0320 }
0321 
0322 Tellico::Data::FieldPtr RISImporter::fieldByTag(const QString& tag_) {
0323   Data::FieldPtr f;
0324   const QString& fieldTag = (*s_tagMap)[tag_];
0325   if(!fieldTag.isEmpty()) {
0326     f = m_coll->fieldByName(fieldTag);
0327     if(f) {
0328       f->setProperty(QStringLiteral("ris"), tag_);
0329       return f;
0330     }
0331   }
0332 
0333   // add non-default fields if not already there
0334   if(tag_== QLatin1String("L1")) {
0335     f = new Data::Field(QStringLiteral("pdf"), i18n("PDF"), Data::Field::URL);
0336     f->setProperty(QStringLiteral("ris"), QStringLiteral("L1"));
0337     f->setCategory(i18n("Miscellaneous"));
0338   }
0339   m_coll->addField(f);
0340   return f;
0341 }
0342 
0343 void RISImporter::slotCancel() {
0344   m_cancelled = true;
0345 }
0346 
0347 bool RISImporter::maybeRIS(const QUrl& url_) {
0348   QString text = FileHandler::readTextFile(url_, true /*quiet*/);
0349   if(text.isEmpty()) {
0350     return false;
0351   }
0352 
0353   // bare bones check, strip white space at beginning
0354   // and then first text line must be valid RIS
0355   QTextStream t(&text);
0356 
0357   QRegularExpression rx(QLatin1String("^(\\w\\w)\\s+-(.*)$"));
0358   QString currLine;
0359   for(currLine = t.readLine(); !t.atEnd(); currLine = t.readLine()) {
0360     if(currLine.trimmed().isEmpty()) {
0361       continue;
0362     }
0363     break;
0364   }
0365   return rx.match(currLine).hasMatch();
0366 }