File indexing completed on 2024-05-12 05:10:08

0001 /***************************************************************************
0002     Copyright (C) 2003-2009 Robby Stephenson <robby@periapsis.org>
0003  ***************************************************************************/
0004 
0005 /***************************************************************************
0006  *                                                                         *
0007  *   This program is free software; you can redistribute it and/or         *
0008  *   modify it under the terms of the GNU General Public License as        *
0009  *   published by the Free Software Foundation; either version 2 of        *
0010  *   the License or (at your option) version 3 or any later version        *
0011  *   accepted by the membership of KDE e.V. (or its successor approved     *
0012  *   by the membership of KDE e.V.), which shall act as a proxy            *
0013  *   defined in Section 14 of version 3 of the license.                    *
0014  *                                                                         *
0015  *   This program is distributed in the hope that it will be useful,       *
0016  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
0017  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
0018  *   GNU General Public License for more details.                          *
0019  *                                                                         *
0020  *   You should have received a copy of the GNU General Public License     *
0021  *   along with this program.  If not, see <http://www.gnu.org/licenses/>. *
0022  *                                                                         *
0023  ***************************************************************************/
0024 
0025 #include <config.h>
0026 #include "bibteximporter.h"
0027 #include "../utils/bibtexhandler.h"
0028 #include "../collections/bibtexcollection.h"
0029 #include "../entry.h"
0030 #include "../fieldformat.h"
0031 #include "../core/filehandler.h"
0032 #include "../tellico_debug.h"
0033 
0034 #include <KLocalizedString>
0035 #include <KSharedConfig>
0036 #include <KConfigGroup>
0037 
0038 #include <QRegularExpression>
0039 #include <QGroupBox>
0040 #include <QRadioButton>
0041 #include <QTextCodec>
0042 #include <QVBoxLayout>
0043 #include <QButtonGroup>
0044 #include <QFile>
0045 #include <QApplication>
0046 
0047 using namespace Tellico;
0048 using Tellico::Import::BibtexImporter;
0049 
0050 #ifndef ENABLE_BTPARSE
0051 void bt_cleanup() {}
0052 void bt_initialize() {}
0053 #endif
0054 
0055 int BibtexImporter::s_initCount = 0;
0056 
0057 BibtexImporter::BibtexImporter(const QList<QUrl>& urls_) : Importer(urls_)
0058     , m_widget(nullptr), m_readUTF8(nullptr), m_readLocale(nullptr), m_cancelled(false) {
0059   init();
0060 }
0061 
0062 BibtexImporter::BibtexImporter(const QString& text_) : Importer(text_)
0063     , m_widget(nullptr), m_readUTF8(nullptr), m_readLocale(nullptr), m_cancelled(false) {
0064   init();
0065 }
0066 
0067 BibtexImporter::~BibtexImporter() {
0068   --s_initCount;
0069   if(s_initCount == 0) {
0070     bt_cleanup();
0071   }
0072   if(m_readUTF8) {
0073     KConfigGroup config(KSharedConfig::openConfig(), "Import Options");
0074     config.writeEntry("Bibtex UTF8", m_readUTF8->isChecked());
0075   }
0076 }
0077 
0078 void BibtexImporter::init() {
0079   if(s_initCount == 0) {
0080     bt_initialize();
0081   }
0082   ++s_initCount;
0083 }
0084 
0085 bool BibtexImporter::canImport(int type) const {
0086   return type == Data::Collection::Bibtex;
0087 }
0088 
0089 Tellico::Data::CollPtr BibtexImporter::collection() {
0090   if(m_coll) {
0091     return m_coll;
0092   }
0093 
0094   emit signalTotalSteps(this, urls().count() * 100);
0095 
0096   bool useUTF8 = m_widget && m_readUTF8->isChecked();
0097 
0098   m_coll = new Data::BibtexCollection(true);
0099 
0100   int count = 0;
0101   // might be importing text only
0102   if(!text().isEmpty()) {
0103     QString text = this->text();
0104     Data::CollPtr coll = readCollection(text, count);
0105     if(!coll || coll->entryCount() == 0) {
0106       setStatusMessage(i18n("No valid bibtex entries were found"));
0107     } else {
0108       appendCollection(coll);
0109     }
0110   }
0111 
0112   foreach(const QUrl& url, urls()) {
0113     if(m_cancelled) {
0114       return Data::CollPtr();
0115     }
0116     if(!url.isValid()) {
0117       continue;
0118     }
0119     QString text = FileHandler::readTextFile(url, false, useUTF8);
0120     if(text.isEmpty()) {
0121       continue;
0122     }
0123     Data::CollPtr coll = readCollection(text, count);
0124     if(!coll || coll->entryCount() == 0) {
0125       setStatusMessage(i18n("No valid bibtex entries were found in file - %1", this->url().fileName()));
0126       continue;
0127     }
0128     appendCollection(coll);
0129   }
0130 
0131   if(m_cancelled) {
0132     return Data::CollPtr();
0133   }
0134 
0135   return m_coll;
0136 }
0137 
0138 Tellico::Data::CollPtr BibtexImporter::readCollection(const QString& text, int urlCount) {
0139 #ifdef ENABLE_BTPARSE
0140   if(text.isEmpty()) {
0141     myDebug() << "no text";
0142     return Data::CollPtr();
0143   }
0144   Data::CollPtr ptr(new Data::BibtexCollection(true));
0145   Data::BibtexCollection* c = static_cast<Data::BibtexCollection*>(ptr.data());
0146 
0147   parseText(text); // populates m_nodes
0148   if(m_cancelled) {
0149     return Data::CollPtr();
0150   }
0151 
0152   if(m_nodes.isEmpty()) {
0153     return Data::CollPtr();
0154   }
0155 
0156   QString str;
0157   const uint count = m_nodes.count();
0158   const uint stepSize = qMax(s_stepSize, count/100);
0159   const bool showProgress = options() & ImportProgress;
0160 
0161   Data::CollPtr currentColl = currentCollection();
0162   if(!currentColl || currentColl->type() != Data::Collection::Bibtex) {
0163     currentColl = ptr;
0164   }
0165 
0166   uint j = 0;
0167   for(int i = 0; !m_cancelled && i < m_nodes.count(); ++i, ++j) {
0168     AST* node = m_nodes[i];
0169     // if we're parsing a macro string, comment or preamble, skip it for now
0170     if(bt_entry_metatype(node) == BTE_PREAMBLE) {
0171       char* preamble = bt_get_text(node);
0172       if(preamble) {
0173         c->setPreamble(QString::fromUtf8(preamble));
0174       }
0175       continue;
0176     }
0177 
0178     if(bt_entry_metatype(node) == BTE_MACRODEF) {
0179       char* macro;
0180       (void) bt_next_field(node, nullptr, &macro);
0181       // FIXME: replace macros within macro definitions!
0182       // lookup lowercase macro in map
0183       c->addMacro(m_macros[QString::fromUtf8(macro)], QString::fromUtf8(bt_macro_text(macro, nullptr, 0)));
0184       continue;
0185     }
0186 
0187     if(bt_entry_metatype(node) == BTE_COMMENT) {
0188       continue;
0189     }
0190 
0191     // now we're parsing a regular entry
0192     Data::EntryPtr entry(new Data::Entry(ptr));
0193 
0194     str = QString::fromUtf8(bt_entry_type(node));
0195 //    myDebug() << "entry type: " << str;
0196     // text is automatically put into lower-case by btparse
0197     Data::BibtexCollection::setFieldValue(entry, QStringLiteral("entry-type"), str, currentColl);
0198 
0199     str = QString::fromUtf8(bt_entry_key(node));
0200 //    myDebug() << "entry key: " << str;
0201     Data::BibtexCollection::setFieldValue(entry, QStringLiteral("key"), str, currentColl);
0202 
0203     static const QRegularExpression andRx(QLatin1String("\\sand\\s"));
0204     char* name;
0205     AST* field = nullptr;
0206     while((field = bt_next_field(node, field, &name))) {
0207 //      myDebug() << "\tfound: " << name;
0208 //      str = QLatin1String(bt_get_text(field));
0209       str.clear();
0210       AST* value = nullptr;
0211       bt_nodetype type;
0212       char* svalue;
0213       bool end_macro = false;
0214       while((value = bt_next_value(field, value, &type, &svalue))) {
0215         switch(type) {
0216           case BTAST_STRING:
0217           case BTAST_NUMBER:
0218             str += BibtexHandler::importText(svalue).simplified();
0219             end_macro = false;
0220             break;
0221           case BTAST_MACRO:
0222             str += QString::fromUtf8(svalue) + QLatin1Char('#');
0223             end_macro = true;
0224             break;
0225           default:
0226             break;
0227         }
0228       }
0229       if(end_macro) {
0230         // remove last character '#'
0231         str.truncate(str.length() - 1);
0232       }
0233       QString fieldName = QString::fromUtf8(name);
0234       if(fieldName == QLatin1String("author") || fieldName == QLatin1String("editor")) {
0235         str.replace(andRx,FieldFormat::delimiterString());
0236       }
0237       // there's a 'key' field different from the citation key
0238       // https://nwalsh.com/tex/texhelp/bibtx-37.html
0239       // TODO account for this later
0240       if(fieldName == QLatin1String("key")) {
0241         myLog() << "skipping bibtex 'key' field for" << str;
0242       } else {
0243         Data::BibtexCollection::setFieldValue(entry, fieldName, str, currentColl);
0244       }
0245     }
0246 
0247     ptr->addEntries(entry);
0248 
0249     if(showProgress && j%stepSize == 0) {
0250       emit signalProgress(this, urlCount*100 + 100*j/count);
0251       qApp->processEvents();
0252     }
0253   }
0254 
0255   if(m_cancelled) {
0256     ptr = nullptr;
0257   }
0258 
0259   // clean-up
0260   foreach(AST* node, m_nodes) {
0261     bt_free_ast(node);
0262   }
0263 
0264   return ptr;
0265 #else
0266   return Data::CollPtr();
0267 #endif // ENABLE_BTPARSE
0268 }
0269 
0270 void BibtexImporter::parseText(const QString& text) {
0271 #ifdef ENABLE_BTPARSE
0272   m_nodes.clear();
0273   m_macros.clear();
0274 
0275   ushort bt_options = 0; // ushort is defined in btparse.h
0276   boolean ok; // boolean is defined in btparse.h as an int
0277 
0278   // for regular nodes (entries), do NOT convert numbers to strings, do NOT expand macros
0279   bt_set_stringopts(BTE_REGULAR, 0);
0280   bt_set_stringopts(BTE_MACRODEF, 0);
0281 //  bt_set_stringopts(BTE_PREAMBLE, BTO_CONVERT | BTO_EXPAND);
0282 
0283   QString entry;
0284   static const QRegularExpression rx(QLatin1String("[{}]"));
0285   static const QRegularExpression macroName(QLatin1String("@string\\s*\\{\\s*(.*?)="), QRegularExpression::CaseInsensitiveOption);
0286 
0287   int line = 1;
0288   bool needsCleanup = false;
0289   int brace = 0;
0290   int startpos = 0;
0291   QRegularExpressionMatch m = rx.match(text);
0292   int pos = m.capturedStart();
0293   while(pos > 0 && !m_cancelled) {
0294     if(text[pos] == QLatin1Char('{')) {
0295       ++brace;
0296     } else if(text[pos] == QLatin1Char('}') && brace > 0) {
0297       --brace;
0298     }
0299     if(brace == 0) {
0300       entry = text.mid(startpos, pos-startpos+1);
0301       // All the downstream text processing on the AST node will assume utf-8
0302       QByteArray entryText = entry.toUtf8();
0303       QByteArray filename = QFile::encodeName(url().fileName());
0304       AST* node = bt_parse_entry_s(entryText.data(),
0305                                    filename.data(),
0306                                    line, bt_options, &ok);
0307       if(ok && node) {
0308         QRegularExpressionMatch macroMatch = macroName.match(entry);
0309         if(bt_entry_metatype(node) == BTE_MACRODEF && macroMatch.hasMatch()) {
0310           char* macro;
0311           (void) bt_next_field(node, nullptr, &macro);
0312           m_macros.insert(QString::fromUtf8(macro), macroMatch.captured(1).trimmed());
0313         }
0314         m_nodes.append(node);
0315         needsCleanup = true;
0316       }
0317       startpos = pos+1;
0318       line += entry.count(QLatin1Char('\n'));
0319     }
0320     m = rx.match(text, pos+1);
0321     pos = m.capturedStart();
0322   }
0323   if(needsCleanup) {
0324     // clean up some structures
0325     bt_parse_entry_s(nullptr, nullptr, 1, 0, nullptr);
0326   }
0327 #endif // ENABLE_BTPARSE
0328 }
0329 
0330 void BibtexImporter::slotCancel() {
0331   m_cancelled = true;
0332 }
0333 
0334 QWidget* BibtexImporter::widget(QWidget* parent_) {
0335   if(m_widget) {
0336     return m_widget;
0337   }
0338 
0339   m_widget = new QWidget(parent_);
0340   QVBoxLayout* l = new QVBoxLayout(m_widget);
0341 
0342   QGroupBox* gbox = new QGroupBox(i18n("Bibtex Options"), m_widget);
0343   QVBoxLayout* vlay = new QVBoxLayout(gbox);
0344 
0345   m_readUTF8 = new QRadioButton(i18n("Use Unicode (UTF-8) encoding"), gbox);
0346   m_readUTF8->setWhatsThis(i18n("Read the imported file in Unicode (UTF-8)."));
0347   QString localStr = i18n("Use user locale (%1) encoding",
0348                           QLatin1String(QTextCodec::codecForLocale()->name()));
0349   m_readLocale = new QRadioButton(localStr, gbox);
0350   m_readLocale->setChecked(true);
0351   m_readLocale->setWhatsThis(i18n("Read the imported file in the local encoding."));
0352 
0353   vlay->addWidget(m_readUTF8);
0354   vlay->addWidget(m_readLocale);
0355 
0356   QButtonGroup* bg = new QButtonGroup(gbox);
0357   bg->addButton(m_readUTF8);
0358   bg->addButton(m_readLocale);
0359 
0360   KConfigGroup config(KSharedConfig::openConfig(), "Import Options");
0361   bool useUTF8 = config.readEntry("Bibtex UTF8", false);
0362   if(useUTF8) {
0363     m_readUTF8->setChecked(true);
0364   } else {
0365     m_readLocale->setChecked(true);
0366   }
0367 
0368   l->addWidget(gbox);
0369   l->addStretch(1);
0370   return m_widget;
0371 }
0372 
0373 bool BibtexImporter::maybeBibtex(const QUrl& url_) {
0374   QString text = FileHandler::readTextFile(url_, true /*quiet*/);
0375   if(text.isEmpty()) {
0376     return false;
0377   }
0378   return maybeBibtex(text, url_);
0379 }
0380 
0381 bool BibtexImporter::maybeBibtex(const QString& text, const QUrl& url_) {
0382   bool foundOne = false;
0383 #ifdef ENABLE_BTPARSE
0384   bt_initialize();
0385   static const QRegularExpression rx(QLatin1String("[{}]"));
0386 
0387   ushort bt_options = 0; // ushort is defined in btparse.h
0388   boolean ok; // boolean is defined in btparse.h as an int
0389   int brace = 0;
0390   int startpos = 0;
0391   QRegularExpressionMatch m = rx.match(text);
0392   int pos = m.capturedStart();
0393   while(pos > 0) {
0394     if(text[pos] == QLatin1Char('{')) {
0395       ++brace;
0396     } else if(text[pos] == QLatin1Char('}') && brace > 0) {
0397       --brace;
0398     }
0399     if(brace == 0) {
0400       QString entry = text.mid(startpos, pos-startpos+1).trimmed();
0401       // All the downstream text processing on the AST node will assume utf-8
0402       AST* node = bt_parse_entry_s(const_cast<char*>(entry.toUtf8().data()),
0403                                    const_cast<char*>(url_.fileName().toLocal8Bit().data()),
0404                                    0, bt_options, &ok);
0405       if(ok && node) {
0406         foundOne = true;
0407         break;
0408       }
0409       startpos = pos+1;
0410     }
0411     m = rx.match(text, pos+1);
0412     pos = m.capturedStart();
0413   }
0414   if(foundOne) {
0415     // clean up some structures
0416     bt_parse_entry_s(nullptr, nullptr, 1, 0, nullptr);
0417   }
0418   bt_cleanup();
0419 #endif // ENABLE_BTPARSE
0420   return foundOne;
0421 }
0422 
0423 void BibtexImporter::appendCollection(Data::CollPtr coll_) {
0424   Data::BibtexCollection* mainColl = static_cast<Data::BibtexCollection*>(m_coll.data());
0425   Data::BibtexCollection* newColl = static_cast<Data::BibtexCollection*>(coll_.data());
0426 
0427   foreach(Data::FieldPtr field, coll_->fields()) {
0428     m_coll->mergeField(field);
0429   }
0430 
0431   mainColl->addEntries(newColl->entries());
0432   // append the preamble and macro lists
0433   if(!newColl->preamble().isEmpty()) {
0434     QString pre = mainColl->preamble();
0435     if(!pre.isEmpty()) {
0436       pre += QLatin1Char('\n');
0437     }
0438     mainColl->setPreamble(pre + newColl->preamble());
0439   }
0440   StringMap macros = mainColl->macroList();
0441 #if (QT_VERSION < QT_VERSION_CHECK(5, 15, 0))
0442   macros.unite(newColl->macroList());
0443 #else
0444   macros.insert(newColl->macroList());
0445 #endif
0446   mainColl->setMacroList(macros);
0447 }