File indexing completed on 2024-05-19 05:05:37

0001 /***************************************************************************
0002  *   SPDX-License-Identifier: GPL-2.0-or-later
0003  *                                                                         *
0004  *   SPDX-FileCopyrightText: 2004-2023 Thomas Fischer <fischer@unix-ag.uni-kl.de>
0005  *                                                                         *
0006  *   This program is free software; you can redistribute it and/or modify  *
0007  *   it under the terms of the GNU General Public License as published by  *
0008  *   the Free Software Foundation; either version 2 of the License, or     *
0009  *   (at your option) any later version.                                   *
0010  *                                                                         *
0011  *   This program is distributed in the hope that it will be useful,       *
0012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0014  *   GNU General Public License for more details.                          *
0015  *                                                                         *
0016  *   You should have received a copy of the GNU General Public License     *
0017  *   along with this program; if not, see <https://www.gnu.org/licenses/>. *
0018  ***************************************************************************/
0019 
0020 #include "fileimporterris.h"
0021 
0022 #include <QIODevice>
0023 #include <QVector>
0024 #include <QTextStream>
0025 #include <QRegularExpression>
0026 #include <QCoreApplication>
0027 #include <QStringList>
0028 
0029 #include <Preferences>
0030 #include <KBibTeX>
0031 #include <Entry>
0032 #include <Value>
0033 #include "fileexporter.h"
0034 #include "fileimporter_p.h"
0035 #include "logging_io.h"
0036 
/// Append 'newvalue' to the Value that 'entry' stores under 'fieldname'
/// and write the extended Value back into the entry.
/// Note: the expansion declares a local variable called 'value', so the
/// 'newvalue' expression must not refer to a variable of that name.
#define appendValue(entry, fieldname, newvalue) \
    { \
        Value value = (entry)->value((fieldname)); \
        value.append((newvalue)); \
        (entry)->insert((fieldname), value); \
    }

/// Run removeDuplicateValueItems(..) on the Value that 'entry' stores under
/// 'fieldname' and write the result back; entries where this Value is empty
/// are left untouched.
#define removeDuplicates(entry, fieldname) \
    { \
        Value value = (entry)->value((fieldname)); \
        if (!(value).isEmpty()) \
            removeDuplicateValueItems((value)); \
        if (!(value).isEmpty()) \
            (entry)->insert((fieldname), value); \
    }
0039 
0040 class FileImporterRIS::FileImporterRISPrivate
0041 {
0042 public:
0043     FileImporterRIS *parent;
0044     int referenceCounter;
0045     bool cancelFlag;
0046     bool protectCasing;
0047 
0048     typedef struct {
0049         QString key;
0050         QString value;
0051     }
0052     RISitem;
0053     typedef QVector<RISitem> RISitemList;
0054 
0055     FileImporterRISPrivate(FileImporterRIS *_parent)
0056             : parent(_parent), referenceCounter(0), cancelFlag(false), protectCasing(false) {
0057         /// nothing
0058     }
0059 
0060     RISitemList readElement(QTextStream &textStream) {
0061         RISitemList result;
0062         QString line = textStream.readLine();
0063         while (!line.startsWith(QStringLiteral("TY  - ")) && !textStream.atEnd())
0064             line = textStream.readLine();
0065         if (textStream.atEnd())
0066             return result;
0067 
0068         QString key, value;
0069         while (!line.startsWith(QStringLiteral("ER  -")) && !textStream.atEnd()) {
0070             if (line.mid(2, 3) == QStringLiteral("  -")) {
0071                 if (!value.isEmpty()) {
0072                     RISitem item;
0073                     item.key = key;
0074                     item.value = value;
0075                     result.append(item);
0076                 }
0077 
0078                 key = line.left(2);
0079                 value = line.mid(6).simplified();
0080             } else {
0081                 line = line.simplified();
0082                 if (line.length() > 1) {
0083                     /// multi-line field are joined to one long line
0084                     value += QLatin1Char(' ') + line;
0085                 }
0086             }
0087 
0088             line = textStream.readLine();
0089         }
0090         if (!line.startsWith(QStringLiteral("ER  -")) && textStream.atEnd()) {
0091             qCWarning(LOG_KBIBTEX_IO) << "Expected that entry that starts with 'TY' ends with 'ER' but instead met end of file";
0092             /// Instead of an 'emit' ...
0093 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0094             QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QStringLiteral("Expected that entry that starts with 'TY' ends with 'ER' but instead met end of file")));
0095 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0096             QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QStringLiteral("Expected that entry that starts with 'TY' ends with 'ER' but instead met end of file")));
0097 #endif
0098         }
0099         if (!value.isEmpty()) {
0100             RISitem item;
0101             item.key = key;
0102             item.value = value;
0103             result.append(item);
0104         }
0105 
0106         return result;
0107     }
0108 
0109     inline QString optionallyProtectCasing(const QString &text) const {
0110         if (protectCasing)
0111             return QLatin1Char('{') + text + QLatin1Char('}');
0112         else
0113             return text;
0114     }
0115 
0116     Element *nextElement(QTextStream &textStream) {
0117         RISitemList list = readElement(textStream);
0118         if (list.empty())
0119             return nullptr;
0120 
0121         QString entryType = Entry::etMisc;
0122         Entry *entry = new Entry(entryType, QString(QStringLiteral("RIS_%1")).arg(referenceCounter++));
0123         QString journalName, startPage, endPage, date;
0124         int fieldCounter = 0;
0125 
0126         for (RISitemList::iterator it = list.begin(); it != list.end(); ++it) {
0127             if ((*it).key == QStringLiteral("TY")) {
0128                 if ((*it).value.startsWith(QStringLiteral("BOOK")) || (*it).value.startsWith(QStringLiteral("SER")))
0129                     entryType = Entry::etBook;
0130                 else if ((*it).value.startsWith(QStringLiteral("CHAP")))
0131                     entryType = Entry::etInBook;
0132                 else if ((*it).value.startsWith(QStringLiteral("CONF")))
0133                     entryType = Entry::etInProceedings;
0134                 else if ((*it).value.startsWith(QStringLiteral("JFULL")) || (*it).value.startsWith(QStringLiteral("JOUR")) || (*it).value.startsWith(QStringLiteral("MGZN")))
0135                     entryType = Entry::etArticle;
0136                 else if ((*it).value.startsWith(QStringLiteral("RPRT")))
0137                     entryType = Entry::etTechReport;
0138                 else if ((*it).value.startsWith(QStringLiteral("THES")))
0139                     entryType = Entry::etPhDThesis; // FIXME what about etMastersThesis?
0140                 else if ((*it).value.startsWith(QStringLiteral("UNPB")))
0141                     entryType = Entry::etUnpublished;
0142                 entry->setType(entryType);
0143             } else if ((*it).key == QStringLiteral("AU") || (*it).key == QStringLiteral("A1")) {
0144                 Person *person = splitName((*it).value);
0145                 if (person != nullptr)
0146                     appendValue(entry, Entry::ftAuthor, QSharedPointer<Person>(person));
0147             } else if ((*it).key == QStringLiteral("ED") || (*it).key == QStringLiteral("A2")) {
0148                 Person *person = splitName((*it).value);
0149                 if (person != nullptr)
0150                     appendValue(entry, Entry::ftEditor, QSharedPointer<Person>(person));
0151             } else if ((*it).key == QStringLiteral("ID")) {
0152                 entry->setId((*it).value);
0153             } else if ((*it).key == QStringLiteral("Y1") || (*it).key == QStringLiteral("PY")) {
0154                 date = (*it).value;
0155             } else if ((*it).key == QStringLiteral("Y2")) {
0156                 if (date.isEmpty())
0157                     date = (*it).value;
0158             } else if ((*it).key == QStringLiteral("AB") || (*it).key == QStringLiteral("N2")) {
0159                 appendValue(entry, Entry::ftAbstract, QSharedPointer<PlainText>(new PlainText((*it).value)));
0160             } else if ((*it).key == QStringLiteral("N1")) {
0161                 appendValue(entry, Entry::ftNote, QSharedPointer<PlainText>(new PlainText((*it).value)));
0162             } else if ((*it).key == QStringLiteral("KW")) {
0163                 QString text = (*it).value;
0164                 const QRegularExpression splitRegExp(text.contains(QStringLiteral(";")) ? QStringLiteral("\\s*[;\\n]\\s*") : (text.contains(QStringLiteral(",")) ? QStringLiteral("\\s*[,\\n]\\s*") : QStringLiteral("\\n")));
0165 #if QT_VERSION >= 0x050e00
0166                 QStringList newKeywords = text.split(splitRegExp, Qt::SkipEmptyParts);
0167 #else // QT_VERSION < 0x050e00
0168                 QStringList newKeywords = text.split(splitRegExp, QString::SkipEmptyParts);
0169 #endif // QT_VERSION >= 0x050e00
0170                 for (QStringList::Iterator it = newKeywords.begin(); it != newKeywords.end(); ++it)
0171                     appendValue(entry, Entry::ftKeywords, QSharedPointer<Keyword>(new Keyword(*it)));
0172             } else if ((*it).key == QStringLiteral("TI") || (*it).key == QStringLiteral("T1")) {
0173                 appendValue(entry, Entry::ftTitle, QSharedPointer<PlainText>(new PlainText(optionallyProtectCasing((*it).value))));
0174             } else if ((*it).key == QStringLiteral("T3")) {
0175                 appendValue(entry, Entry::ftSeries, QSharedPointer<PlainText>(new PlainText((*it).value)));
0176             } else if ((*it).key == QStringLiteral("JO") || (*it).key == QStringLiteral("J1") || (*it).key == QStringLiteral("J2")) {
0177                 if (journalName.isEmpty())
0178                     journalName = (*it).value;
0179             } else if ((*it).key == QStringLiteral("JF") || (*it).key == QStringLiteral("JA")) {
0180                 journalName = (*it).value;
0181             } else if ((*it).key == QStringLiteral("VL")) {
0182                 appendValue(entry, Entry::ftVolume, QSharedPointer<PlainText>(new PlainText((*it).value)));
0183             } else if ((*it).key == QStringLiteral("CP")) {
0184                 appendValue(entry, Entry::ftChapter, QSharedPointer<PlainText>(new PlainText((*it).value)));
0185             } else if ((*it).key == QStringLiteral("IS")) {
0186                 appendValue(entry, Entry::ftNumber, QSharedPointer<PlainText>(new PlainText((*it).value)));
0187             } else if ((*it).key == QStringLiteral("DO") || (*it).key == QStringLiteral("M3")) {
0188                 const QRegularExpressionMatch doiRegExpMatch = KBibTeX::doiRegExp.match((*it).value);
0189                 if (doiRegExpMatch.hasMatch())
0190                     appendValue(entry, Entry::ftDOI, QSharedPointer<VerbatimText>(new VerbatimText(doiRegExpMatch.captured(QStringLiteral("doi")))));
0191             } else if ((*it).key == QStringLiteral("PB")) {
0192                 appendValue(entry, Entry::ftPublisher, QSharedPointer<PlainText>(new PlainText((*it).value)));
0193             } else if ((*it).key == QStringLiteral("IN")) {
0194                 appendValue(entry, Entry::ftSchool, QSharedPointer<PlainText>(new PlainText((*it).value)));
0195             } else if ((*it).key == QStringLiteral("SN")) {
0196                 const QString fieldName = entryType == Entry::etBook || entryType == Entry::etInBook ? Entry::ftISBN : Entry::ftISSN;
0197                 appendValue(entry, fieldName, QSharedPointer<PlainText>(new PlainText((*it).value)));
0198             } else if ((*it).key == QStringLiteral("CY")) {
0199                 appendValue(entry, Entry::ftLocation, QSharedPointer<PlainText>(new PlainText((*it).value)));
0200             }  else if ((*it).key == QStringLiteral("AD")) {
0201                 appendValue(entry, Entry::ftAddress, QSharedPointer<PlainText>(new PlainText((*it).value)));
0202             } else if ((*it).key == QStringLiteral("L1") || (*it).key == QStringLiteral("L2") || (*it).key == QStringLiteral("L3") || (*it).key == QStringLiteral("UR")) {
0203                 QString fieldValue = (*it).value;
0204                 fieldValue.replace(QStringLiteral("<Go to ISI>://"), QStringLiteral("isi://"));
0205                 const QRegularExpressionMatch doiRegExpMatch = KBibTeX::doiRegExp.match(fieldValue);
0206                 const QRegularExpressionMatch urlRegExpMatch = KBibTeX::urlRegExp.match(fieldValue);
0207                 const QString fieldName = doiRegExpMatch.hasMatch() ? Entry::ftDOI : (KBibTeX::urlRegExp.match((*it).value).hasMatch() ? Entry::ftUrl : (Preferences::instance().bibliographySystem() == Preferences::BibliographySystem::BibTeX ? Entry::ftLocalFile : Entry::ftFile));
0208                 fieldValue = doiRegExpMatch.hasMatch() ? doiRegExpMatch.captured(QStringLiteral("doi")) : (urlRegExpMatch.hasMatch() ? urlRegExpMatch.captured() : fieldValue);
0209                 if (fieldValue.startsWith(QStringLiteral("file:///"))) fieldValue = fieldValue.mid(7);
0210                 appendValue(entry, fieldName, QSharedPointer<VerbatimText>(new VerbatimText(fieldValue)));
0211             } else if ((*it).key == QStringLiteral("SP")) {
0212                 startPage = (*it).value;
0213             } else if ((*it).key == QStringLiteral("EP")) {
0214                 endPage = (*it).value;
0215             } else {
0216                 const QString fieldName = QString(QStringLiteral("RISfield_%1_%2")).arg(fieldCounter++).arg((*it).key.left(2));
0217                 appendValue(entry, fieldName, QSharedPointer<PlainText>(new PlainText((*it).value)));
0218             }
0219         }
0220 
0221         if (!journalName.isEmpty()) {
0222             const QString fieldName = entryType == Entry::etInBook || entryType == Entry::etInProceedings ? Entry::ftBookTitle : Entry::ftJournal;
0223             Value value = entry->value(fieldName);
0224             value.append(QSharedPointer<PlainText>(new PlainText(optionallyProtectCasing(journalName))));
0225             entry->insert(fieldName, value);
0226         }
0227 
0228         if (!startPage.isEmpty() || !endPage.isEmpty()) {
0229             QString page;
0230             if (startPage.isEmpty())
0231                 page = endPage;
0232             else if (endPage.isEmpty())
0233                 page = startPage;
0234             else
0235                 page = startPage + QChar(0x2013) + endPage;
0236 
0237             Value value;
0238             value.append(QSharedPointer<PlainText>(new PlainText(page)));
0239             entry->insert(Entry::ftPages, value);
0240         }
0241 
0242 #if QT_VERSION >= 0x050e00
0243         QStringList dateFragments = date.split(QStringLiteral("/"), Qt::SkipEmptyParts);
0244 #else // QT_VERSION < 0x050e00
0245         QStringList dateFragments = date.split(QStringLiteral("/"), QString::SkipEmptyParts);
0246 #endif // QT_VERSION >= 0x050e00
0247         if (dateFragments.count() > 0) {
0248             bool ok;
0249             int year = dateFragments[0].toInt(&ok);
0250             if (ok && year > 1000 && year < 3000) {
0251                 Value value = entry->value(Entry::ftYear);
0252                 value.append(QSharedPointer<PlainText>(new PlainText(QString::number(year))));
0253                 entry->insert(Entry::ftYear, value);
0254             } else {
0255                 qCWarning(LOG_KBIBTEX_IO) << "Invalid year: " << dateFragments[0];
0256                 /// Instead of an 'emit' ...
0257 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0258                 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Invalid year: '%1'")).arg(dateFragments[0])));
0259 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0260                 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Invalid year: '%1'")).arg(dateFragments[0])));
0261 #endif
0262             }
0263         }
0264         if (dateFragments.count() > 1) {
0265             bool ok;
0266             int month = FileExporter::monthStringToNumber(dateFragments[1], &ok);
0267             if (ok && month >= 1 && month <= 12) {
0268                 Value value = entry->value(Entry::ftMonth);
0269                 value.append(QSharedPointer<MacroKey>(new MacroKey(KBibTeX::MonthsTriple[month - 1])));
0270                 entry->insert(Entry::ftMonth, value);
0271             } else {
0272                 qCWarning(LOG_KBIBTEX_IO) << "Invalid month: " << dateFragments[1];
0273                 /// Instead of an 'emit' ...
0274 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0)
0275                 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Invalid month: '%1'")).arg(dateFragments[1])));
0276 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0)
0277                 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Invalid month: '%1'")).arg(dateFragments[1])));
0278 #endif
0279             }
0280         }
0281 
0282         removeDuplicates(entry, Entry::ftDOI);
0283         removeDuplicates(entry, Entry::ftUrl);
0284 
0285         return entry;
0286     }
0287 
0288     void removeDuplicateValueItems(Value &value) {
0289         if (value.count() < 2) return; /// Values with one or no ValueItem cannot have duplicates
0290 
0291         QSet<QString> uniqueStrings;
0292         for (Value::Iterator it = value.begin(); it != value.end();) {
0293             const QString itemString = PlainTextValue::text(*it);
0294             if (uniqueStrings.contains(itemString))
0295                 it = value.erase(it);
0296             else {
0297                 uniqueStrings.insert(itemString);
0298                 ++it;
0299             }
0300         }
0301     }
0302 };
0303 
0304 FileImporterRIS::FileImporterRIS(QObject *parent)
0305         : FileImporter(parent), d(new FileImporterRISPrivate(this))
0306 {
0307 // nothing
0308 }
0309 
0310 
0311 FileImporterRIS::~FileImporterRIS()
0312 {
0313     delete d;
0314 }
0315 
0316 File *FileImporterRIS::load(QIODevice *iodevice)
0317 {
0318     check_if_iodevice_invalid(iodevice);
0319 
0320     d->cancelFlag = false;
0321     d->referenceCounter = 0;
0322     QTextStream textStream(iodevice);
0323 
0324     File *result = new File();
0325     while (!d->cancelFlag && !textStream.atEnd()) {
0326         Q_EMIT progress(textStream.pos(), iodevice->size());
0327         QCoreApplication::instance()->processEvents();
0328         Element *element = d->nextElement(textStream);
0329         if (element != nullptr)
0330             result->append(QSharedPointer<Element>(element));
0331         QCoreApplication::instance()->processEvents();
0332     }
0333     Q_EMIT progress(100, 100);
0334 
0335     if (d->cancelFlag) {
0336         delete result;
0337         result = nullptr;
0338     }
0339 
0340     iodevice->close();
0341 
0342     if (result != nullptr)
0343         result->setProperty(File::ProtectCasing, static_cast<int>(d->protectCasing ? Qt::Checked : Qt::Unchecked));
0344 
0345     return result;
0346 }
0347 
0348 bool FileImporterRIS::guessCanDecode(const QString &text)
0349 {
0350     return text.indexOf(QStringLiteral("TY  - ")) >= 0;
0351 }
0352 
0353 void FileImporterRIS::setProtectCasing(bool protectCasing)
0354 {
0355     d->protectCasing = protectCasing;
0356 }
0357 
0358 void FileImporterRIS::cancel()
0359 {
0360     d->cancelFlag = true;
0361 }