File indexing completed on 2024-05-12 05:10:09
0001 /*************************************************************************** 0002 Copyright (C) 2012 Robby Stephenson <robby@periapsis.org> 0003 ***************************************************************************/ 0004 0005 /*************************************************************************** 0006 * * 0007 * This program is free software; you can redistribute it and/or * 0008 * modify it under the terms of the GNU General Public License as * 0009 * published by the Free Software Foundation; either version 2 of * 0010 * the License or (at your option) version 3 or any later version * 0011 * accepted by the membership of KDE e.V. (or its successor approved * 0012 * by the membership of KDE e.V.), which shall act as a proxy * 0013 * defined in Section 14 of version 3 of the license. * 0014 * * 0015 * This program is distributed in the hope that it will be useful, * 0016 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0018 * GNU General Public License for more details. * 0019 * * 0020 * You should have received a copy of the GNU General Public License * 0021 * along with this program. If not, see <http://www.gnu.org/licenses/>. * 0022 * * 0023 ***************************************************************************/ 0024 0025 #include "ciwimporter.h" 0026 #include "../collections/bibtexcollection.h" 0027 #include "../entry.h" 0028 #include "../field.h" 0029 #include "../fieldformat.h" 0030 #include "../core/filehandler.h" 0031 #include "../utils/isbnvalidator.h" 0032 #include "../tellico_debug.h" 0033 0034 #include <QRegularExpression> 0035 #include <QTextStream> 0036 #include <QApplication> 0037 0038 using Tellico::Import::CIWImporter; 0039 QHash<QString, QString>* CIWImporter::s_tagMap = nullptr; 0040 0041 // static 0042 void CIWImporter::initTagMap() { 0043 if(!s_tagMap) { 0044 s_tagMap = new QHash<QString, QString>(); 0045 // BT is special and is handled separately 0046 s_tagMap->insert(QStringLiteral("PT"), QStringLiteral("entry-type")); 0047 s_tagMap->insert(QStringLiteral("TI"), QStringLiteral("title")); 0048 s_tagMap->insert(QStringLiteral("BT"), QStringLiteral("booktitle")); 0049 s_tagMap->insert(QStringLiteral("AU"), QStringLiteral("author")); 0050 s_tagMap->insert(QStringLiteral("AF"), QStringLiteral("author")); 0051 s_tagMap->insert(QStringLiteral("BE"), QStringLiteral("editor")); 0052 s_tagMap->insert(QStringLiteral("PY"), QStringLiteral("year")); 0053 s_tagMap->insert(QStringLiteral("AB"), QStringLiteral("abstract")); 0054 s_tagMap->insert(QStringLiteral("DE"), QStringLiteral("keyword")); 0055 s_tagMap->insert(QStringLiteral("SO"), QStringLiteral("journal")); 0056 s_tagMap->insert(QStringLiteral("SE"), QStringLiteral("journal")); 0057 s_tagMap->insert(QStringLiteral("VL"), QStringLiteral("volume")); 0058 s_tagMap->insert(QStringLiteral("IS"), QStringLiteral("number")); 0059 s_tagMap->insert(QStringLiteral("PU"), QStringLiteral("publisher")); 0060 s_tagMap->insert(QStringLiteral("BN"), QStringLiteral("isbn")); 0061 s_tagMap->insert(QStringLiteral("PA"), QStringLiteral("address")); 0062 s_tagMap->insert(QStringLiteral("DI"), QStringLiteral("doi")); 0063 s_tagMap->insert(QStringLiteral("EP"), QStringLiteral("pages")); 0064 } 0065 } 0066 0067 CIWImporter::CIWImporter(const QList<QUrl>& urls_) : Tellico::Import::Importer(urls_), m_coll(nullptr), m_cancelled(false) { 0068 initTagMap(); 0069 } 0070 0071 CIWImporter::CIWImporter(const QString& text_) : Tellico::Import::Importer(text_), m_coll(nullptr), m_cancelled(false) { 0072 initTagMap(); 0073 } 0074 0075 bool CIWImporter::canImport(int type) const { 0076 return type == Data::Collection::Bibtex; 0077 } 0078 0079 Tellico::Data::CollPtr CIWImporter::collection() { 0080 if(m_coll) { 0081 return m_coll; 0082 } 0083 0084 m_coll = new Data::BibtexCollection(true); 0085 0086 emit signalTotalSteps(this, urls().count() * 100); 0087 0088 if(text().isEmpty()) { 0089 int count = 0; 0090 foreach(const QUrl& url, urls()) { 0091 if(m_cancelled) { 0092 break; 0093 } 0094 readURL(url, count); 0095 ++count; 0096 } 0097 } else { 0098 readText(text(), 0); 0099 } 0100 0101 if(m_cancelled) { 0102 m_coll = Data::CollPtr(); 0103 } 0104 return m_coll; 0105 } 0106 0107 void CIWImporter::readURL(const QUrl& url_, int n) { 0108 QString str = FileHandler::readTextFile(url_); 0109 if(str.isEmpty()) { 0110 return; 0111 } 0112 readText(str, n); 0113 } 0114 0115 void CIWImporter::readText(const QString& text_, int n) { 0116 ISBNValidator isbnval(this); 0117 0118 QString text = text_; 0119 QTextStream t(&text); 0120 0121 const uint length = text.length(); 0122 const uint stepSize = qMax(s_stepSize, length/100); 0123 const bool showProgress = options() & ImportProgress; 0124 0125 bool needToAddFinal = false; 0126 bool usebooktitle = false; 0127 0128 QString sp, ep; 0129 0130 uint j = 0; 0131 Data::EntryPtr entry(new Data::Entry(m_coll)); 0132 // no idea what the "formal" format is, take it as two characters, followed by a space and then value 0133 // the entry ends with just ER 0134 static const QRegularExpression rx(QLatin1String("^(\\w\\w) ?(.*)$")); 0135 QString currLine, nextLine; 0136 for(currLine = t.readLine(); !m_cancelled && !t.atEnd(); currLine = nextLine, j += currLine.length()) { 0137 nextLine = t.readLine(); 0138 QRegularExpressionMatch m = rx.match(currLine); 0139 QString tag = m.captured(1); 0140 QString value = m.captured(2).trimmed(); 0141 if(tag.isEmpty()) { 0142 continue; 0143 } 0144 // myDebug() << tag << ": " << value; 0145 // if the next line is not empty and does not match start regexp, append to value 0146 while(!nextLine.isEmpty() && !rx.match(nextLine).hasMatch()) { 0147 // authors and editors get the value separator 0148 if(tag == QLatin1String("AU") || tag == QLatin1String("AF") || tag == QLatin1String("BE")) { 0149 value += FieldFormat::delimiterString(); 0150 } else { 0151 value += QLatin1String(" "); 0152 } 0153 value += nextLine.trimmed(); 0154 nextLine = t.readLine(); 0155 } 0156 0157 // every entry ends with "ER" 0158 if(tag == QLatin1String("ER")) { 0159 m_coll->addEntries(entry); 0160 entry = new Data::Entry(m_coll); 0161 needToAddFinal = false; 0162 continue; 0163 } else if(tag == QLatin1String("PT")) { 0164 // but the S means that SO is the book title instead of journal name 0165 if(value == QLatin1String("S")) { 0166 usebooktitle = true; 0167 } 0168 // assume everything is article 0169 value = QStringLiteral("article"); 0170 } else if(tag == QLatin1String("BN")) { 0171 // test for valid isbn 0172 int pos = 0; 0173 if(isbnval.validate(value, pos) != ISBNValidator::Acceptable) { 0174 continue; 0175 } 0176 } else if(tag == QLatin1String("SO")) { 0177 if(usebooktitle) { 0178 tag = QStringLiteral("BT"); 0179 } 0180 } else if(tag == QLatin1String("BP")) { 0181 sp = value; 0182 if(!ep.isEmpty()) { 0183 int startPage = sp.toInt(); 0184 int endPage = ep.toInt(); 0185 if(endPage > 0 && endPage < startPage) { 0186 myWarning() << "Assuming end page is really page count"; 0187 ep = QString::number(startPage + endPage); 0188 } 0189 value = sp + QLatin1Char('-') + ep; 0190 tag = QStringLiteral("EP"); 0191 sp.clear(); 0192 ep.clear(); 0193 } else { 0194 // nothing else to do 0195 continue; 0196 } 0197 } else if(tag == QLatin1String("EP")) { 0198 ep = value; 0199 if(!sp.isEmpty()) { 0200 int startPage = sp.toInt(); 0201 int endPage = ep.toInt(); 0202 if(endPage > 0 && endPage < startPage) { 0203 myWarning() << "Assuming end page is really page count"; 0204 ep = QString::number(startPage + endPage); 0205 } 0206 value = sp + QLatin1Char('-') + ep; 0207 sp.clear(); 0208 ep.clear(); 0209 } else { 0210 continue; 0211 } 0212 } 0213 0214 Data::FieldPtr f = fieldByTag(tag); 0215 if(!f) { 0216 continue; 0217 } 0218 needToAddFinal = true; 0219 0220 // harmless for non-choice fields 0221 // for entry-type, want it in lower case 0222 f->addAllowed(value); 0223 entry->setField(f, value); 0224 0225 if(showProgress && j%stepSize == 0) { 0226 emit signalProgress(this, n*100 + 100*j/length); 0227 qApp->processEvents(); 0228 } 0229 } 0230 0231 if(needToAddFinal) { 0232 m_coll->addEntries(entry); 0233 } 0234 } 0235 0236 Tellico::Data::FieldPtr CIWImporter::fieldByTag(const QString& tag_) { 0237 const QString& fieldTag = (*s_tagMap)[tag_]; 0238 if(fieldTag.isEmpty()) { 0239 return Data::FieldPtr(); 0240 } 0241 return m_coll->fieldByName(fieldTag); 0242 } 0243 0244 void CIWImporter::slotCancel() { 0245 m_cancelled = true; 0246 } 0247 0248 bool CIWImporter::maybeCIW(const QUrl& url_) { 0249 QString text = FileHandler::readTextFile(url_, true /*quiet*/); 0250 if(text.isEmpty()) { 0251 return false; 0252 } 0253 0254 // bare bones check, strip white space at beginning 0255 // and then first text line must be valid CIW, i.e. two letters followed by a space 0256 QTextStream t(&text); 0257 0258 static const QRegularExpression rx(QLatin1String("^(\\w\\w) \\w(.*)$")); 0259 QString currLine; 0260 for(currLine = t.readLine(); !t.atEnd(); currLine = t.readLine()) { 0261 if(currLine.trimmed().isEmpty()) { 0262 continue; 0263 } 0264 break; 0265 } 0266 return rx.match(currLine).hasMatch(); 0267 }