File indexing completed on 2024-05-12 05:10:09

0001 /***************************************************************************
0002     Copyright (C) 2012 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include "ciwimporter.h"
0026 #include "../collections/bibtexcollection.h"
0027 #include "../entry.h"
0028 #include "../field.h"
0029 #include "../fieldformat.h"
0030 #include "../core/filehandler.h"
0031 #include "../utils/isbnvalidator.h"
0032 #include "../tellico_debug.h"
0033 
0034 #include <QRegularExpression>
0035 #include <QTextStream>
0036 #include <QApplication>
0037 
// Pull the importer class into scope so its members can be defined below
// without repeating the Tellico::Import:: qualification.
0038 using Tellico::Import::CIWImporter;
// Shared lookup table from two-letter CIW tags to field names. It starts out
// null and is allocated and filled by initTagMap() the first time that runs.
0039 QHash<QString, QString>* CIWImporter::s_tagMap = nullptr;
0040 
0041 // static
0042 void CIWImporter::initTagMap() {
0043   if(!s_tagMap) {
0044     s_tagMap = new QHash<QString, QString>();
0045     // BT is special and is handled separately
0046     s_tagMap->insert(QStringLiteral("PT"), QStringLiteral("entry-type"));
0047     s_tagMap->insert(QStringLiteral("TI"), QStringLiteral("title"));
0048     s_tagMap->insert(QStringLiteral("BT"), QStringLiteral("booktitle"));
0049     s_tagMap->insert(QStringLiteral("AU"), QStringLiteral("author"));
0050     s_tagMap->insert(QStringLiteral("AF"), QStringLiteral("author"));
0051     s_tagMap->insert(QStringLiteral("BE"), QStringLiteral("editor"));
0052     s_tagMap->insert(QStringLiteral("PY"), QStringLiteral("year"));
0053     s_tagMap->insert(QStringLiteral("AB"), QStringLiteral("abstract"));
0054     s_tagMap->insert(QStringLiteral("DE"), QStringLiteral("keyword"));
0055     s_tagMap->insert(QStringLiteral("SO"), QStringLiteral("journal"));
0056     s_tagMap->insert(QStringLiteral("SE"), QStringLiteral("journal"));
0057     s_tagMap->insert(QStringLiteral("VL"), QStringLiteral("volume"));
0058     s_tagMap->insert(QStringLiteral("IS"), QStringLiteral("number"));
0059     s_tagMap->insert(QStringLiteral("PU"), QStringLiteral("publisher"));
0060     s_tagMap->insert(QStringLiteral("BN"), QStringLiteral("isbn"));
0061     s_tagMap->insert(QStringLiteral("PA"), QStringLiteral("address"));
0062     s_tagMap->insert(QStringLiteral("DI"), QStringLiteral("doi"));
0063     s_tagMap->insert(QStringLiteral("EP"), QStringLiteral("pages"));
0064   }
0065 }
0066 
0067 CIWImporter::CIWImporter(const QList<QUrl>& urls_) : Tellico::Import::Importer(urls_), m_coll(nullptr), m_cancelled(false) {
0068   initTagMap();
0069 }
0070 
0071 CIWImporter::CIWImporter(const QString& text_) : Tellico::Import::Importer(text_), m_coll(nullptr), m_cancelled(false) {
0072   initTagMap();
0073 }
0074 
0075 bool CIWImporter::canImport(int type) const {
0076   return type == Data::Collection::Bibtex;
0077 }
0078 
0079 Tellico::Data::CollPtr CIWImporter::collection() {
0080   if(m_coll) {
0081     return m_coll;
0082   }
0083 
0084   m_coll = new Data::BibtexCollection(true);
0085 
0086   emit signalTotalSteps(this, urls().count() * 100);
0087 
0088   if(text().isEmpty()) {
0089     int count = 0;
0090     foreach(const QUrl& url, urls()) {
0091       if(m_cancelled)  {
0092         break;
0093       }
0094       readURL(url, count);
0095       ++count;
0096     }
0097   } else {
0098     readText(text(), 0);
0099   }
0100 
0101   if(m_cancelled) {
0102     m_coll = Data::CollPtr();
0103   }
0104   return m_coll;
0105 }
0106 
0107 void CIWImporter::readURL(const QUrl& url_, int n) {
0108   QString str = FileHandler::readTextFile(url_);
0109   if(str.isEmpty()) {
0110     return;
0111   }
0112   readText(str, n);
0113 }
0114 
0115 void CIWImporter::readText(const QString& text_, int n) {
0116   ISBNValidator isbnval(this);
0117 
0118   QString text = text_;
0119   QTextStream t(&text);
0120 
0121   const uint length = text.length();
0122   const uint stepSize = qMax(s_stepSize, length/100);
0123   const bool showProgress = options() & ImportProgress;
0124 
0125   bool needToAddFinal = false;
0126   bool usebooktitle = false;
0127 
0128   QString sp, ep;
0129 
0130   uint j = 0;
0131   Data::EntryPtr entry(new Data::Entry(m_coll));
0132   // no idea what the "formal" format is, take it as two characters, followed by a space and then value
0133   // the entry ends with just ER
0134   static const QRegularExpression rx(QLatin1String("^(\\w\\w) ?(.*)$"));
0135   QString currLine, nextLine;
0136   for(currLine = t.readLine(); !m_cancelled && !t.atEnd(); currLine = nextLine, j += currLine.length()) {
0137     nextLine = t.readLine();
0138     QRegularExpressionMatch m = rx.match(currLine);
0139     QString tag = m.captured(1);
0140     QString value = m.captured(2).trimmed();
0141     if(tag.isEmpty()) {
0142       continue;
0143     }
0144 //    myDebug() << tag << ": " << value;
0145     // if the next line is not empty and does not match start regexp, append to value
0146     while(!nextLine.isEmpty() && !rx.match(nextLine).hasMatch()) {
0147       // authors and editors get the value separator
0148       if(tag == QLatin1String("AU") || tag == QLatin1String("AF") || tag == QLatin1String("BE")) {
0149         value += FieldFormat::delimiterString();
0150       } else {
0151         value += QLatin1String(" ");
0152       }
0153       value += nextLine.trimmed();
0154       nextLine = t.readLine();
0155     }
0156 
0157     // every entry ends with "ER"
0158     if(tag == QLatin1String("ER")) {
0159       m_coll->addEntries(entry);
0160       entry = new Data::Entry(m_coll);
0161       needToAddFinal = false;
0162       continue;
0163     } else if(tag == QLatin1String("PT")) {
0164       // but the S means that SO is the book title instead of journal name
0165       if(value == QLatin1String("S")) {
0166         usebooktitle = true;
0167       }
0168       // assume everything is article
0169       value = QStringLiteral("article");
0170     } else if(tag == QLatin1String("BN")) {
0171       // test for valid isbn
0172       int pos = 0;
0173       if(isbnval.validate(value, pos) != ISBNValidator::Acceptable) {
0174         continue;
0175       }
0176     } else if(tag == QLatin1String("SO")) {
0177       if(usebooktitle) {
0178         tag = QStringLiteral("BT");
0179       }
0180     } else if(tag == QLatin1String("BP")) {
0181       sp = value;
0182       if(!ep.isEmpty()) {
0183         int startPage = sp.toInt();
0184         int endPage = ep.toInt();
0185         if(endPage > 0 && endPage < startPage) {
0186           myWarning() << "Assuming end page is really page count";
0187           ep = QString::number(startPage + endPage);
0188         }
0189         value = sp + QLatin1Char('-') + ep;
0190         tag = QStringLiteral("EP");
0191         sp.clear();
0192         ep.clear();
0193       } else {
0194         // nothing else to do
0195         continue;
0196       }
0197     } else if(tag == QLatin1String("EP")) {
0198       ep = value;
0199       if(!sp.isEmpty()) {
0200         int startPage = sp.toInt();
0201         int endPage = ep.toInt();
0202         if(endPage > 0 && endPage < startPage) {
0203           myWarning() << "Assuming end page is really page count";
0204           ep = QString::number(startPage + endPage);
0205         }
0206         value = sp + QLatin1Char('-') + ep;
0207         sp.clear();
0208         ep.clear();
0209       } else {
0210         continue;
0211       }
0212     }
0213 
0214     Data::FieldPtr f = fieldByTag(tag);
0215     if(!f) {
0216       continue;
0217     }
0218     needToAddFinal = true;
0219 
0220     // harmless for non-choice fields
0221     // for entry-type, want it in lower case
0222     f->addAllowed(value);
0223     entry->setField(f, value);
0224 
0225     if(showProgress && j%stepSize == 0) {
0226       emit signalProgress(this, n*100 + 100*j/length);
0227       qApp->processEvents();
0228     }
0229   }
0230 
0231   if(needToAddFinal) {
0232     m_coll->addEntries(entry);
0233   }
0234 }
0235 
0236 Tellico::Data::FieldPtr CIWImporter::fieldByTag(const QString& tag_) {
0237   const QString& fieldTag = (*s_tagMap)[tag_];
0238   if(fieldTag.isEmpty()) {
0239     return Data::FieldPtr();
0240   }
0241   return m_coll->fieldByName(fieldTag);
0242 }
0243 
0244 void CIWImporter::slotCancel() {
0245   m_cancelled = true;
0246 }
0247 
0248 bool CIWImporter::maybeCIW(const QUrl& url_) {
0249   QString text = FileHandler::readTextFile(url_, true /*quiet*/);
0250   if(text.isEmpty()) {
0251     return false;
0252   }
0253 
0254   // bare bones check, strip white space at beginning
0255   // and then first text line must be valid CIW, i.e. two letters followed by a space
0256   QTextStream t(&text);
0257 
0258   static const QRegularExpression rx(QLatin1String("^(\\w\\w) \\w(.*)$"));
0259   QString currLine;
0260   for(currLine = t.readLine(); !t.atEnd(); currLine = t.readLine()) {
0261     if(currLine.trimmed().isEmpty()) {
0262       continue;
0263     }
0264     break;
0265   }
0266   return rx.match(currLine).hasMatch();
0267 }