File indexing completed on 2024-05-19 05:05:36
0001 /*************************************************************************** 0002 * SPDX-License-Identifier: GPL-2.0-or-later 0003 * * 0004 * SPDX-FileCopyrightText: 2004-2023 Thomas Fischer <fischer@unix-ag.uni-kl.de> 0005 * * 0006 * This program is free software; you can redistribute it and/or modify * 0007 * it under the terms of the GNU General Public License as published by * 0008 * the Free Software Foundation; either version 2 of the License, or * 0009 * (at your option) any later version. * 0010 * * 0011 * This program is distributed in the hope that it will be useful, * 0012 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0014 * GNU General Public License for more details. * 0015 * * 0016 * You should have received a copy of the GNU General Public License * 0017 * along with this program; if not, see <https://www.gnu.org/licenses/>. * 0018 ***************************************************************************/ 0019 0020 #include "fileimporterbibtex.h" 0021 0022 #include <QTextCodec> 0023 #include <QIODevice> 0024 #include <QRegularExpression> 0025 #include <QCoreApplication> 0026 #include <QStringList> 0027 0028 #include <BibTeXEntries> 0029 #include <BibTeXFields> 0030 #include <Preferences> 0031 #include <File> 0032 #include <Comment> 0033 #include <Macro> 0034 #include <Preamble> 0035 #include <Entry> 0036 #include <Element> 0037 #include <Value> 0038 #include "encoder.h" 0039 #include "encoderlatex.h" 0040 #include "fileimporter_p.h" 0041 #include "logging_io.h" 0042 0043 #define qint64toint(a) (static_cast<int>(qMax(0LL,qMin(0x7fffffffLL,(a))))) 0044 0045 class FileImporterBibTeX::Private 0046 { 0047 private: 0048 FileImporterBibTeX *parent; 0049 0050 public: 0051 static const QStringList keysForPersonDetection; 0052 0053 /// Set via @see setCommentHandling 0054 CommentHandling commentHandling; 0055 0056 enum class Token { 0057 At = 1, BracketOpen = 2, BracketClose = 3, AlphaNumText = 4, Comma = 5, Assign = 6, Doublecross = 7, EndOfFile = 0xffff, Unknown = -1 0058 }; 0059 0060 enum class CommaContainment { None, Contains }; 0061 0062 typedef struct Statistics { 0063 /// Used to determine if file prefers quotation marks over 0064 /// curly brackets or the other way around 0065 int countCurlyBrackets; 0066 0067 int countQuotationMarks, countFirstNameFirst, countLastNameFirst; 0068 QHash<QString, int> countCommentContext; 0069 int countProtectedTitle, countUnprotectedTitle; 0070 int countSortedByIdentifier, countNotSortedByIdentifier; 0071 QString mostRecentListSeparator; 0072 0073 Statistics() 0074 : countCurlyBrackets(0), countQuotationMarks(0), countFirstNameFirst(0), 0075 countLastNameFirst(0), countProtectedTitle(0), countUnprotectedTitle(0), 0076 countSortedByIdentifier(0), countNotSortedByIdentifier(0) 0077 { 0078 /// nothing 0079 } 0080 } Statistics; 0081 0082 typedef struct State { 0083 QTextStream *textStream; 0084 /// Low-level character operations 0085 QChar prevChar, nextChar; 0086 /// Current line 0087 int lineNo; 0088 QString prevLine, currentLine; 0089 QSet<QString> knownElementIds; 0090 0091 State(QTextStream *_textStream) 0092 : textStream(_textStream), lineNo(1) 0093 { 0094 /// nothing 0095 } 0096 } State; 0097 0098 Private(FileImporterBibTeX *p) 0099 : parent(p), commentHandling(CommentHandling::Ignore) 0100 { 0101 // TODO 0102 } 0103 0104 bool readChar(State &state) 0105 { 0106 /// Memorize previous char 0107 state.prevChar = state.nextChar; 0108 0109 if (state.textStream->atEnd()) { 0110 /// At end of data stream 0111 state.nextChar = QChar::Null; 0112 return false; 0113 } 0114 0115 /// Read next char 0116 *state.textStream >> state.nextChar; 0117 0118 /// Test for new line 0119 if (state.nextChar == QLatin1Char('\n')) { 0120 /// Update variables tracking line numbers and line content 0121 ++state.lineNo; 0122 state.prevLine = state.currentLine; 0123 state.currentLine.clear(); 0124 } else { 0125 /// Add read char to current line 0126 state.currentLine.append(state.nextChar); 0127 } 0128 0129 return true; 0130 } 0131 0132 bool skipWhiteChar(State &state) 0133 { 0134 bool result = true; 0135 while ((state.nextChar.isSpace() || state.nextChar == QLatin1Char('\t') || state.nextChar == QLatin1Char('\n') || state.nextChar == QLatin1Char('\r')) && result) result = readChar(state); 0136 return result; 0137 } 0138 0139 bool skipNewline(State &state) 0140 { 0141 if (state.nextChar == QLatin1Char('\r')) { 0142 const bool result = readChar(state); 0143 if (result && state.nextChar == QLatin1Char('\n')) 0144 // Windows linebreak: CR LF 0145 return readChar(state); 0146 } else if (state.nextChar == QLatin1Char('\n')) { 0147 // Linux/Unix linebreak: LF 0148 return readChar(state); 0149 } 0150 return false; 0151 } 0152 0153 Token nextToken(State &state) 0154 { 0155 if (!skipWhiteChar(state)) { 0156 /// Some error occurred while reading from data stream 0157 return Token::EndOfFile; 0158 } 0159 0160 Token result = Token::Unknown; 0161 0162 switch (state.nextChar.toLatin1()) { 0163 case '@': 0164 result = Token::At; 0165 break; 0166 case '{': 0167 case '(': 0168 result = Token::BracketOpen; 0169 break; 0170 case '}': 0171 case ')': 0172 result = Token::BracketClose; 0173 break; 0174 case ',': 0175 result = Token::Comma; 0176 break; 0177 case '=': 0178 result = Token::Assign; 0179 break; 0180 case '#': 0181 result = Token::Doublecross; 0182 break; 0183 default: 0184 if (state.textStream->atEnd()) 0185 result = Token::EndOfFile; 0186 } 0187 0188 if (state.nextChar != QLatin1Char('%')) { 0189 /// Unclean solution, but necessary for comments 0190 /// that have a percent sign as a prefix 0191 readChar(state); 0192 } 0193 return result; 0194 } 0195 0196 // FIXME duplicate 0197 static void parsePersonList(const QString &text, Value &value, CommaContainment *comma, const int line_number, QObject *parent) 0198 { 0199 static const QString tokenAnd = QStringLiteral("and"); 0200 static const QString tokenOthers = QStringLiteral("others"); 0201 static QStringList tokens; 0202 contextSensitiveSplit(text, tokens); 0203 0204 if (tokens.count() > 0) { 0205 if (tokens[0] == tokenAnd) { 0206 qCInfo(LOG_KBIBTEX_IO) << "Person list starts with" << tokenAnd << "near line" << line_number; 0207 if (parent != nullptr) 0208 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0209 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Person list starts with 'and' near line %1")).arg(line_number))); 0210 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0211 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Person list starts with 'and' near line %1")).arg(line_number))); 0212 #endif 0213 } else if (tokens.count() > 1 && tokens[tokens.count() - 1] == tokenAnd) { 0214 qCInfo(LOG_KBIBTEX_IO) << "Person list ends with" << tokenAnd << "near line" << line_number; 0215 if (parent != nullptr) 0216 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0217 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Person list ends with 'and' near line %1")).arg(line_number))); 0218 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0219 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Person list ends with 'and' near line %1")).arg(line_number))); 0220 #endif 0221 } 0222 if (tokens[0] == tokenOthers) { 0223 qCInfo(LOG_KBIBTEX_IO) << "Person list starts with" << tokenOthers << "near line" << line_number; 0224 if (parent != nullptr) 0225 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0226 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Person list starts with 'others' near line %1")).arg(line_number))); 0227 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0228 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Person list starts with 'others' near line %1")).arg(line_number))); 0229 #endif 0230 } else if (tokens[tokens.count() - 1] == tokenOthers && (tokens.count() < 3 || tokens[tokens.count() - 2] != tokenAnd)) { 0231 qCInfo(LOG_KBIBTEX_IO) << "Person list ends with" << tokenOthers << "but is not preceded with name and" << tokenAnd << "near line" << line_number; 0232 if (parent != nullptr) 0233 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0234 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Person list ends with 'others' but is not preceded with name and 'and' near line %1")).arg(line_number))); 0235 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0236 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Person list ends with 'others' but is not preceded with name and 'and' near line %1")).arg(line_number))); 0237 #endif 0238 } 0239 } 0240 0241 int nameStart = 0; 0242 QString prevToken; 0243 for (int i = 0; i < tokens.count(); ++i) { 0244 if (tokens[i] == tokenAnd) { 0245 if (prevToken == tokenAnd) { 0246 qCInfo(LOG_KBIBTEX_IO) << "Two subsequent" << tokenAnd << "found in person list near line" << line_number; 0247 if (parent != nullptr) 0248 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0249 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Two subsequent 'and' found in person list near line %1")).arg(line_number))); 0250 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0251 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Two subsequent 'and' found in person list near line %1")).arg(line_number))); 0252 #endif 0253 } else if (nameStart < i) { 0254 const QSharedPointer<Person> person = personFromTokenList(tokens.mid(nameStart, i - nameStart), comma, line_number, parent); 0255 if (!person.isNull()) 0256 value.append(person); 0257 else { 0258 qCInfo(LOG_KBIBTEX_IO) << "Text" << tokens.mid(nameStart, i - nameStart).join(QLatin1Char(' ')) << "does not form a name near line" << line_number; 0259 if (parent != nullptr) 0260 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0261 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Text '%1' does not form a name near line %2")).arg(tokens.mid(nameStart, i - nameStart).join(QLatin1Char(' '))).arg(line_number))); 0262 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0263 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Text '%1' does not form a name near line %2")).arg(tokens.mid(nameStart, i - nameStart).join(QLatin1Char(' '))).arg(line_number))); 0264 #endif 0265 } 0266 } else { 0267 qCInfo(LOG_KBIBTEX_IO) << "Found" << tokenAnd << "but no name before it near line" << line_number; 0268 if (parent != nullptr) 0269 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0270 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Found 'and' but no name before it near line %1")).arg(line_number))); 0271 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0272 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Found 'and' but no name before it near line %1")).arg(line_number))); 0273 #endif 0274 } 0275 nameStart = i + 1; 0276 } else if (tokens[i] == tokenOthers) { 0277 if (i < tokens.count() - 1) { 0278 qCInfo(LOG_KBIBTEX_IO) << "Special word" << tokenOthers << "found before last position in person name near line" << line_number; 0279 if (parent != nullptr) 0280 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0281 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Special word 'others' found before last position in person name near line %1")).arg(line_number))); 0282 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0283 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Special word 'others' found before last position in person name near line %1")).arg(line_number))); 0284 #endif 0285 } else 0286 value.append(QSharedPointer<PlainText>(new PlainText(QStringLiteral("others")))); 0287 nameStart = tokens.count() + 1; 0288 } 0289 prevToken = tokens[i]; 0290 } 0291 0292 if (nameStart < tokens.count()) { 0293 const QSharedPointer<Person> person = personFromTokenList(tokens.mid(nameStart), comma, line_number, parent); 0294 if (!person.isNull()) 0295 value.append(person); 0296 else { 0297 qCInfo(LOG_KBIBTEX_IO) << "Text" << tokens.mid(nameStart).join(QLatin1Char(' ')) << "does not form a name near line" << line_number; 0298 if (parent != nullptr) 0299 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0300 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Text '%1' does not form a name near line %2")).arg(tokens.mid(nameStart).join(QLatin1Char(' '))).arg(line_number))); 0301 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0302 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Text '%1' does not form a name near line %2")).arg(tokens.mid(nameStart).join(QLatin1Char(' '))).arg(line_number))); 0303 #endif 0304 } 0305 } 0306 } 0307 0308 Token readValue(Value &value, const QString &key, Statistics &statistics, State &state) 0309 { 0310 Token token = Token::Unknown; 0311 const QString iKey = key.toLower(); 0312 static const QSet<QString> verbatimKeys {Entry::ftColor.toLower(), Entry::ftCrossRef.toLower(), Entry::ftXData.toLower()}; 0313 0314 do { 0315 bool isStringKey = false; 0316 const QString rawText = readString(isStringKey, statistics, state); 0317 if (rawText.isNull()) 0318 return Token::EndOfFile; 0319 QString text = EncoderLaTeX::instance().decode(rawText); 0320 /// For all entries except for abstracts and a few more 'verbatim-y' fields ... 0321 if (iKey != Entry::ftAbstract && !(iKey.startsWith(Entry::ftUrl) && !iKey.startsWith(Entry::ftUrlDate)) && !iKey.startsWith(Entry::ftLocalFile) && !iKey.startsWith(Entry::ftFile)) { 0322 /// ... remove redundant spaces including newlines 0323 text = bibtexAwareSimplify(text); 0324 } 0325 /// Abstracts will keep their formatting (regarding line breaks) 0326 /// as requested by Thomas Jensch via mail (20 October 2010) 0327 0328 /// Maintain statistics on if (book) titles are protected 0329 /// by surrounding curly brackets 0330 if (!text.isEmpty() && (iKey == Entry::ftTitle || iKey == Entry::ftBookTitle)) { 0331 if (text[0] == QLatin1Char('{') && text[text.length() - 1] == QLatin1Char('}')) 0332 ++statistics.countProtectedTitle; 0333 else 0334 ++statistics.countUnprotectedTitle; 0335 } 0336 0337 if (keysForPersonDetection.contains(iKey)) { 0338 if (isStringKey) 0339 value.append(QSharedPointer<MacroKey>(new MacroKey(text))); 0340 else { 0341 CommaContainment comma = CommaContainment::Contains; 0342 parsePersonList(text, value, &comma, state.lineNo, parent); 0343 0344 /// Update statistics on name formatting 0345 if (comma == CommaContainment::Contains) 0346 ++statistics.countLastNameFirst; 0347 else 0348 ++statistics.countFirstNameFirst; 0349 } 0350 } else if (iKey == Entry::ftPages) { 0351 static const QRegularExpression rangeInAscii(QStringLiteral("\\s*--?\\s*")); 0352 text.replace(rangeInAscii, QChar(0x2013)); 0353 if (isStringKey) 0354 value.append(QSharedPointer<MacroKey>(new MacroKey(text))); 0355 else 0356 value.append(QSharedPointer<PlainText>(new PlainText(text))); 0357 } else if ((iKey.startsWith(Entry::ftUrl) && !iKey.startsWith(Entry::ftUrlDate)) || iKey.startsWith(Entry::ftLocalFile) || iKey.startsWith(Entry::ftFile) || iKey == QStringLiteral("ee") || iKey == QStringLiteral("biburl")) { 0358 if (isStringKey) 0359 value.append(QSharedPointer<MacroKey>(new MacroKey(text))); 0360 else { 0361 /// Assumption: in fields like Url or LocalFile, file names are separated by ; 0362 static const QRegularExpression semicolonSpace = QRegularExpression(QStringLiteral("[;]\\s*")); 0363 #if QT_VERSION >= 0x050e00 0364 const QStringList fileList = rawText.split(semicolonSpace, Qt::SkipEmptyParts); 0365 #else // QT_VERSION < 0x050e00 0366 const QStringList fileList = rawText.split(semicolonSpace, QString::SkipEmptyParts); 0367 #endif // QT_VERSION >= 0x050e00 0368 for (QString filename : fileList) { 0369 QString comment; 0370 bool hasComment = false; ///< need to have extra flag for comment, as even an empty comment counts as comment 0371 if (iKey == Entry::ftFile) { 0372 /// Check 'file' field for a JabRef-specific formatting, extract filename 0373 /// Example of JabRef-specific value: 0374 /// Some optional text:path/to/file\_name.pdf:PDF 0375 /// Regular expression will try to extract filename, then decode some LaTeX-isms 0376 /// to get path/to/file_name.pdf for in above example 0377 static const QRegularExpression jabrefFileRegExp(QStringLiteral("^([^:]*):(.*?):([A-Z]+|pdf)$")); 0378 const QRegularExpressionMatch jabrefFileRegExpMatch = jabrefFileRegExp.match(filename); 0379 if (jabrefFileRegExpMatch.hasMatch()) { 0380 hasComment = true; 0381 comment = EncoderLaTeX::instance().decode(jabrefFileRegExpMatch.captured(1)); 0382 filename = EncoderLaTeX::instance().decode(jabrefFileRegExpMatch.captured(2)); 0383 0384 /// Furthermore, if the file came from Windows, drive letters may have been written as follows: 0385 /// C$\backslash$:/Users/joedoe/filename.pdf 0386 static const QRegularExpression windowsDriveBackslashRegExp(QStringLiteral("^([A-Z])\\$\\\\backslash\\$(:.*)$")); 0387 const QRegularExpressionMatch windowsDriveBackslashRegExpMatch = windowsDriveBackslashRegExp.match(filename); 0388 if (windowsDriveBackslashRegExpMatch.hasMatch()) { 0389 filename = windowsDriveBackslashRegExpMatch.captured(1) + windowsDriveBackslashRegExpMatch.captured(2); 0390 } else if (filename.startsWith(QStringLiteral("home/"))) { 0391 /// If filename is a relative path but by name looks almost like it should be an absolute path 0392 /// (starting with some suspicious strings), prepend a slash 0393 filename.prepend(QLatin1Char('/')); 0394 } 0395 } 0396 } 0397 0398 VerbatimText *verbatimText = new VerbatimText(filename); 0399 if (hasComment) 0400 verbatimText->setComment(comment); 0401 value.append(QSharedPointer<VerbatimText>(verbatimText)); 0402 } 0403 } 0404 } else if (iKey == Entry::ftMonth) { 0405 if (isStringKey) { 0406 static const QRegularExpression monthThreeChars(QStringLiteral("^[a-z]{3}"), QRegularExpression::CaseInsensitiveOption); 0407 if (monthThreeChars.match(text).hasMatch()) 0408 text = text.left(3).toLower(); 0409 value.append(QSharedPointer<MacroKey>(new MacroKey(text))); 0410 } else 0411 value.append(QSharedPointer<PlainText>(new PlainText(text))); 0412 } else if (iKey.startsWith(Entry::ftDOI)) { 0413 if (isStringKey) 0414 value.append(QSharedPointer<MacroKey>(new MacroKey(text))); 0415 else { 0416 /// Take care of "; " which separates multiple DOIs, but which may baffle the regexp 0417 QString preprocessedText = rawText; 0418 preprocessedText.replace(QStringLiteral("; "), QStringLiteral(" ")); 0419 /// Extract everything that looks like a DOI using a regular expression, 0420 /// ignore everything else 0421 QRegularExpressionMatchIterator doiRegExpMatchIt = KBibTeX::doiRegExp.globalMatch(preprocessedText); 0422 while (doiRegExpMatchIt.hasNext()) { 0423 const QRegularExpressionMatch doiRegExpMatch = doiRegExpMatchIt.next(); 0424 value.append(QSharedPointer<VerbatimText>(new VerbatimText(doiRegExpMatch.captured(QStringLiteral("doi"))))); 0425 } 0426 } 0427 } else if (iKey == Entry::ftKeywords) { 0428 if (isStringKey) 0429 value.append(QSharedPointer<MacroKey>(new MacroKey(text))); 0430 else { 0431 char splitChar; 0432 const QList<QSharedPointer<Keyword> > keywords = splitKeywords(text, &splitChar); 0433 for (const auto &keyword : keywords) 0434 value.append(keyword); 0435 /// Memorize (some) split characters for later use 0436 /// (e.g. when writing file again) 0437 if (splitChar == ';') 0438 statistics.mostRecentListSeparator = QStringLiteral("; "); 0439 else if (splitChar == ',') 0440 statistics.mostRecentListSeparator = QStringLiteral(", "); 0441 0442 } 0443 } else if (verbatimKeys.contains(iKey)) { 0444 if (isStringKey) 0445 value.append(QSharedPointer<MacroKey>(new MacroKey(text))); 0446 else 0447 value.append(QSharedPointer<VerbatimText>(new VerbatimText(rawText))); 0448 } else { 0449 if (isStringKey) 0450 value.append(QSharedPointer<MacroKey>(new MacroKey(text))); 0451 else 0452 value.append(QSharedPointer<PlainText>(new PlainText(text))); 0453 } 0454 0455 token = nextToken(state); 0456 } while (token == Token::Doublecross); 0457 0458 return token; 0459 } 0460 0461 QString readBracketString(State &state) 0462 { 0463 static const QChar backslash = QLatin1Char('\\'); 0464 QString result(0, QChar()); ///< Construct an empty but non-null string 0465 const QChar openingBracket = state.nextChar; 0466 const QChar closingBracket = openingBracket == QLatin1Char('{') ? QLatin1Char('}') : (openingBracket == QLatin1Char('(') ? QLatin1Char(')') : QChar()); 0467 Q_ASSERT_X(!closingBracket.isNull(), "QString FileImporterBibTeX::readBracketString()", "openingBracket==state.nextChar is neither '{' nor '('"); 0468 int counter = 1; 0469 0470 if (!readChar(state)) { 0471 /// Some error occurred while reading from data stream 0472 return QString(); ///< return null QString 0473 } 0474 0475 while (!state.nextChar.isNull()) { 0476 if (state.nextChar == openingBracket && state.prevChar != backslash) 0477 ++counter; 0478 else if (state.nextChar == closingBracket && state.prevChar != backslash) 0479 --counter; 0480 0481 if (counter == 0) { 0482 break; 0483 } else 0484 result.append(state.nextChar); 0485 0486 if (!readChar(state)) { 0487 /// Some error occurred while reading from data stream 0488 return QString(); ///< return null QString 0489 } 0490 } 0491 0492 if (!readChar(state)) { 0493 /// Some error occurred while reading from data stream 0494 return QString(); ///< return null QString 0495 } 0496 return result; 0497 } 0498 0499 QString readSimpleString(State &state, const QString &until = QString(), const bool readNestedCurlyBrackets = false) 0500 { 0501 static const QString extraAlphaNumChars = QString(QStringLiteral("?'`-_:.+/$\\\"&")); 0502 0503 QString result; ///< 'result' is Null on purpose: simple strings cannot be empty in contrast to e.g. quoted strings 0504 0505 if (!skipWhiteChar(state)) { 0506 /// Some error occurred while reading from data stream 0507 return QString(); ///< return null QString 0508 } 0509 0510 QChar prevChar = QChar(0x00); 0511 while (!state.nextChar.isNull()) { 0512 if (readNestedCurlyBrackets && state.nextChar == QLatin1Char('{') && prevChar != QLatin1Char('\\')) { 0513 int depth = 1; 0514 while (depth > 0) { 0515 result.append(state.nextChar); 0516 prevChar = state.nextChar; 0517 if (!readChar(state)) return result; 0518 if (state.nextChar == QLatin1Char('{') && prevChar != QLatin1Char('\\')) ++depth; 0519 else if (state.nextChar == QLatin1Char('}') && prevChar != QLatin1Char('\\')) --depth; 0520 } 0521 result.append(state.nextChar); 0522 prevChar = state.nextChar; 0523 if (!readChar(state)) return result; 0524 } 0525 0526 const ushort nextCharUnicode = state.nextChar.unicode(); 0527 if (!until.isEmpty()) { 0528 /// Variable "until" has user-defined value 0529 if (state.nextChar == QLatin1Char('\n') || state.nextChar == QLatin1Char('\r') || until.contains(state.nextChar)) { 0530 /// Force break on line-breaks or if one of the "until" chars has been read 0531 break; 0532 } else { 0533 /// Append read character to final result 0534 result.append(state.nextChar); 0535 } 0536 } else if ((nextCharUnicode >= static_cast<ushort>('a') && nextCharUnicode <= static_cast<ushort>('z')) || (nextCharUnicode >= static_cast<ushort>('A') && nextCharUnicode <= static_cast<ushort>('Z')) || (nextCharUnicode >= static_cast<ushort>('0') && nextCharUnicode <= static_cast<ushort>('9')) || extraAlphaNumChars.contains(state.nextChar)) { 0537 /// Accept default set of alpha-numeric characters 0538 result.append(state.nextChar); 0539 } else 0540 break; 0541 prevChar = state.nextChar; 0542 if (!readChar(state)) break; 0543 } 0544 return result; 0545 } 0546 0547 QString readQuotedString(State &state) 0548 { 0549 QString result(0, QChar()); ///< Construct an empty but non-null string 0550 0551 Q_ASSERT_X(state.nextChar == QLatin1Char('"'), "QString FileImporterBibTeX::readQuotedString()", "state.nextChar is not '\"'"); 0552 0553 if (!readChar(state)) { 0554 /// Some error occurred while reading from data stream 0555 return QString(); ///< return null QString 0556 } 0557 0558 while (!state.nextChar.isNull()) { 0559 if (state.nextChar == QLatin1Char('"') && state.prevChar != QLatin1Char('\\') && state.prevChar != QLatin1Char('{')) 0560 break; 0561 else 0562 result.append(state.nextChar); 0563 0564 if (!readChar(state)) { 0565 /// Some error occurred while reading from data stream 0566 return QString(); ///< return null QString 0567 } 0568 } 0569 0570 if (!readChar(state)) { 0571 /// Some error occurred while reading from data stream 0572 return QString(); ///< return null QString 0573 } 0574 0575 /// Remove protection around quotation marks 0576 result.replace(QStringLiteral("{\"}"), QStringLiteral("\"")); 0577 0578 return result; 0579 } 0580 0581 QString readString(bool &isStringKey, Statistics &statistics, State &state) 0582 { 0583 /// Most often it is not a string key 0584 isStringKey = false; 0585 0586 if (!skipWhiteChar(state)) { 0587 /// Some error occurred while reading from data stream 0588 return QString(); ///< return null QString 0589 } 0590 0591 switch (state.nextChar.toLatin1()) { 0592 case '{': 0593 case '(': { 0594 ++statistics.countCurlyBrackets; 0595 const QString result = readBracketString(state); 0596 return result; 0597 } 0598 case '"': { 0599 ++statistics.countQuotationMarks; 0600 const QString result = readQuotedString(state); 0601 return result; 0602 } 0603 default: 0604 isStringKey = true; 0605 const QString result = readSimpleString(state); 0606 return result; 0607 } 0608 } 0609 0610 bool readCharUntil(const QString &until, State &state) 0611 { 0612 Q_ASSERT_X(!until.isEmpty(), "bool FileImporterBibTeX::readCharUntil(const QString &until)", "\"until\" is empty or invalid"); 0613 bool result = true; 0614 while (!until.contains(state.nextChar) && (result = readChar(state))); 0615 return result; 0616 } 0617 0618 QString readLine(State &state) 0619 { 0620 QString result; 0621 while (state.nextChar != QLatin1Char('\n') && state.nextChar != QLatin1Char('\r') && readChar(state)) 0622 result.append(state.nextChar); 0623 return result; 0624 } 0625 0626 Macro *readMacroElement(Statistics &statistics, State &state) 0627 { 0628 Token token = nextToken(state); 0629 while (token != Token::BracketOpen) { 0630 if (token == Token::EndOfFile) { 0631 #if QT_VERSION >= 0x050e00 0632 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing macro near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Opening curly brace '{' expected"; 0633 #else // QT_VERSION < 0x050e00 0634 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing macro near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Opening curly brace '{' expected"; 0635 #endif // QT_VERSION >= 0x050e00 0636 /// Instead of an 'emit' ... 0637 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0638 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing macro near line %1: Opening curly brace '{' expected")).arg(state.lineNo))); 0639 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0640 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing macro near line %1: Opening curly brace '{' expected")).arg(state.lineNo))); 0641 #endif 0642 return nullptr; 0643 } 0644 token = nextToken(state); 0645 } 0646 0647 QString key = readSimpleString(state); 0648 0649 if (key.isEmpty()) { 0650 /// Cope with empty keys, 0651 /// duplicates are handled further below 0652 key = QStringLiteral("EmptyId"); 0653 } else if (!Encoder::containsOnlyAscii(key)) { 0654 /// Try to avoid non-ascii characters in ids 0655 const QString newKey = Encoder::instance().convertToPlainAscii(key); 0656 qCWarning(LOG_KBIBTEX_IO) << "Macro key" << key << "near line" << state.lineNo << "contains non-ASCII characters, converted to" << newKey; 0657 /// Instead of an 'emit' ... 0658 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0659 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Macro key '%1' near line %2 contains non-ASCII characters, converted to '%3'")).arg(key).arg(state.lineNo).arg(newKey))); 0660 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0661 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Macro key '%1' near line %2 contains non-ASCII characters, converted to '%3'")).arg(key).arg(state.lineNo).arg(newKey))); 0662 #endif 0663 key = newKey; 0664 } 0665 0666 /// Check for duplicate entry ids, avoid collisions 0667 if (state.knownElementIds.contains(key)) { 0668 static const QString newIdPattern = QStringLiteral("%1-%2"); 0669 int idx = 2; 0670 QString newKey = newIdPattern.arg(key).arg(idx); 0671 while (state.knownElementIds.contains(newKey)) 0672 newKey = newIdPattern.arg(key).arg(++idx); 0673 qCDebug(LOG_KBIBTEX_IO) << "Duplicate macro key" << key << ", using replacement key" << newKey; 0674 /// Instead of an 'emit' ... 0675 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0676 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Duplicate macro key '%1', using replacement key '%2'")).arg(key, newKey))); 0677 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0678 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Duplicate macro key '%1', using replacement key '%2'")).arg(key, newKey))); 0679 #endif 0680 key = newKey; 0681 } 0682 state.knownElementIds.insert(key); 0683 0684 if (nextToken(state) != Token::Assign) { 0685 #if QT_VERSION >= 0x050e00 0686 qCCritical(LOG_KBIBTEX_IO) << "Error in parsing macro" << key << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Assign symbol '=' expected"; 0687 #else // QT_VERSION < 0x050e00 0688 qCCritical(LOG_KBIBTEX_IO) << "Error in parsing macro" << key << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Assign symbol '=' expected"; 0689 #endif // QT_VERSION >= 0x050e00 0690 /// Instead of an 'emit' ... 0691 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0692 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing macro '%1' near line %2: Assign symbol '=' expected")).arg(key).arg(state.lineNo))); 0693 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0694 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing macro '%1' near line %2: Assign symbol '=' expected")).arg(key).arg(state.lineNo))); 0695 #endif 0696 return nullptr; 0697 } 0698 0699 Macro *macro = new Macro(key); 0700 do { 0701 bool isStringKey = false; 0702 QString text = readString(isStringKey, statistics, state); 0703 if (text.isNull()) { 0704 #if QT_VERSION >= 0x050e00 0705 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing macro" << key << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Could not read macro's text"; 0706 #else // QT_VERSION < 0x050e00 0707 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing macro" << key << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Could not read macro's text"; 0708 #endif // QT_VERSION >= 0x050e00 0709 /// Instead of an 'emit' ... 0710 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0711 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing macro '%1' near line %2: Could not read macro's text")).arg(key).arg(state.lineNo))); 0712 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0713 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing macro '%1' near line %2: Could not read macro's text")).arg(key).arg(state.lineNo))); 0714 #endif 0715 delete macro; 0716 return nullptr; 0717 } 0718 text = EncoderLaTeX::instance().decode(bibtexAwareSimplify(text)); 0719 if (isStringKey) 0720 macro->value().append(QSharedPointer<MacroKey>(new MacroKey(text))); 0721 else 0722 macro->value().append(QSharedPointer<PlainText>(new PlainText(text))); 0723 0724 token = nextToken(state); 0725 } while (token == Token::Doublecross); 0726 0727 return macro; 0728 } 0729 0730 Comment *readCommentElement(State &state) 0731 { 0732 if (!readCharUntil(QStringLiteral("{("), state)) 0733 return nullptr; 0734 return new Comment(EncoderLaTeX::instance().decode(readBracketString(state)), Preferences::CommentContext::Command); 0735 } 0736 0737 Comment *readPlainCommentElement(const QString &initialRead, State &state) 0738 { 0739 const QString firstLine {rstrip(EncoderLaTeX::instance().decode(initialRead + readLine(state)))}; 0740 if (firstLine.length() > 0 && firstLine[0] == QLatin1Char('%')) { 0741 QStringList lines{{firstLine}}; 0742 // Read all lines that start with '%', compute common prefix, and remove prefix from all lines 0743 // Stop when encountering a line that starts without '%' 0744 while (skipNewline(state) && state.nextChar == QLatin1Char('%')) { 0745 const QString nextLine {rstrip(EncoderLaTeX::instance().decode(QStringLiteral("%") + readLine(state)))}; 0746 lines.append(nextLine); 0747 } 0748 0749 int commonPrefixLen {0}; 0750 for (; commonPrefixLen < firstLine.length(); ++commonPrefixLen) 0751 if (firstLine[commonPrefixLen] != QLatin1Char(' ') && firstLine[commonPrefixLen] != QLatin1Char('%')) 0752 break; 0753 int longestLinLength = firstLine.length(); 0754 bool first = true; 0755 for (const QString &line : lines) { 0756 if (first) { 0757 first = false; 0758 continue; 0759 } 0760 commonPrefixLen = qMin(commonPrefixLen, line.length()); 0761 longestLinLength = qMax(longestLinLength, line.length()); 0762 for (int i = 0; i < commonPrefixLen; ++i) 0763 if (line[i] != firstLine[i]) { 0764 commonPrefixLen = i; 0765 break; 0766 } 0767 } 0768 const QString prefix {firstLine.left(commonPrefixLen)}; 0769 QString text; 0770 text.reserve(longestLinLength * lines.length()); 0771 for (const QString &line : lines) 0772 text.append(line.mid(commonPrefixLen)).append(QStringLiteral("\n")); 0773 0774 return new Comment(rstrip(text), Preferences::CommentContext::Prefix, prefix); 0775 } else if (firstLine.length() > 0) { 0776 QStringList lines{{firstLine}}; 0777 // Read all lines until a line is either empty or starts with '@' 0778 while (skipNewline(state) && state.nextChar != QLatin1Char('\n') && state.nextChar != QLatin1Char('\r') && state.nextChar != QLatin1Char('@')) { 0779 const QChar firstLineChar {state.nextChar}; 0780 const QString nextLine {rstrip(EncoderLaTeX::instance().decode(QString(firstLineChar) + readLine(state)))}; 0781 lines.append(nextLine); 0782 } 0783 return new Comment(lines.join(QStringLiteral("\n")), Preferences::CommentContext::Verbatim); 0784 } else { 0785 // Maybe a line with only spaces? 0786 return nullptr; 0787 } 0788 } 0789 0790 QString tokenidToString(Token token) 0791 { 0792 switch (token) { 0793 case Token::At: return QString(QStringLiteral("At")); 0794 case Token::BracketClose: return QString(QStringLiteral("BracketClose")); 0795 case Token::BracketOpen: return QString(QStringLiteral("BracketOpen")); 0796 case Token::AlphaNumText: return QString(QStringLiteral("AlphaNumText")); 0797 case Token::Assign: return QString(QStringLiteral("Assign")); 0798 case Token::Comma: return QString(QStringLiteral("Comma")); 0799 case Token::Doublecross: return QString(QStringLiteral("Doublecross")); 0800 case Token::EndOfFile: return QString(QStringLiteral("EOF")); 0801 case Token::Unknown: return QString(QStringLiteral("Unknown")); 0802 default: { 0803 qCWarning(LOG_KBIBTEX_IO) << "Encountered an unsupported Token:" << static_cast<int>(token); 0804 return QString(QStringLiteral("Unknown")); 0805 } 0806 } 0807 } 0808 0809 Preamble *readPreambleElement(Statistics &statistics, State &state) 0810 { 0811 Token token = nextToken(state); 0812 while (token != Token::BracketOpen) { 0813 if (token == Token::EndOfFile) { 0814 #if QT_VERSION >= 0x050e00 0815 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing preamble near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Opening curly brace '{' expected"; 0816 #else // QT_VERSION < 0x050e00 0817 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing preamble near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Opening curly brace '{' expected"; 0818 #endif // QT_VERSION >= 0x050e00 0819 /// Instead of an 'emit' ... 0820 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0821 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing preamble near line %1: Opening curly brace '{' expected")).arg(state.lineNo))); 0822 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0823 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing preamble near line %1: Opening curly brace '{' expected")).arg(state.lineNo))); 0824 #endif 0825 return nullptr; 0826 } 0827 token = nextToken(state); 0828 } 0829 0830 Preamble *preamble = new Preamble(); 0831 do { 0832 bool isStringKey = false; 0833 QString text = readString(isStringKey, statistics, state); 0834 if (text.isNull()) { 0835 #if QT_VERSION >= 0x050e00 0836 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing preamble near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Could not read preamble's text"; 0837 #else // QT_VERSION < 0x050e00 0838 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing preamble near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Could not read preamble's text"; 0839 #endif // QT_VERSION >= 0x050e00 0840 /// Instead of an 'emit' ... 0841 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0842 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing preamble near line %1: Could not read preamble's text")).arg(state.lineNo))); 0843 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0844 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing preamble near line %1: Could not read preamble's text")).arg(state.lineNo))); 0845 #endif 0846 delete preamble; 0847 return nullptr; 0848 } 0849 /// Remember: strings from preamble do not get encoded, 0850 /// may contain raw LaTeX commands and code 0851 text = bibtexAwareSimplify(text); 0852 if (isStringKey) 0853 preamble->value().append(QSharedPointer<MacroKey>(new MacroKey(text))); 0854 else 0855 preamble->value().append(QSharedPointer<PlainText>(new PlainText(text))); 0856 0857 token = nextToken(state); 0858 } while (token == Token::Doublecross); 0859 0860 return preamble; 0861 } 0862 0863 Entry *readEntryElement(const QString &typeString, Statistics &statistics, State &state) 0864 { 0865 const KBibTeX::Casing keywordCasing = Preferences::instance().bibTeXKeywordCasing(); 0866 0867 Token token = nextToken(state); 0868 while (token != Token::BracketOpen) { 0869 if (token == Token::EndOfFile) { 0870 #if QT_VERSION >= 0x050e00 0871 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Opening curly brace '{' expected"; 0872 #else // QT_VERSION < 0x050e00 0873 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Opening curly brace '{' expected"; 0874 #endif // QT_VERSION >= 0x050e00 0875 /// Instead of an 'emit' ... 0876 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0877 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry near line %1: Opening curly brace '{' expected")).arg(state.lineNo))); 0878 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0879 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry near line %1: Opening curly brace '{' expected")).arg(state.lineNo))); 0880 #endif 0881 return nullptr; 0882 } 0883 token = nextToken(state); 0884 } 0885 0886 QString id = readSimpleString(state, QStringLiteral(",}"), true).trimmed(); 0887 if (id.isEmpty()) { 0888 if (state.nextChar == QLatin1Char(',') || state.nextChar == QLatin1Char('}')) { 0889 /// Cope with empty ids, 0890 /// duplicates are handled further below 0891 id = QStringLiteral("EmptyId"); 0892 } 0893 else { 0894 #if QT_VERSION >= 0x050e00 0895 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry near line" << state.lineNo << ":" << state.prevLine << Qt::endl << state.currentLine << "): Could not read entry id"; 0896 #else // QT_VERSION < 0x050e00 0897 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry near line" << state.lineNo << ":" << state.prevLine << endl << state.currentLine << "): Could not read entry id"; 0898 #endif // QT_VERSION >= 0x050e00 0899 /// Instead of an 'emit' ... 0900 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0901 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing preambentryle near line %1: Could not read entry id")).arg(state.lineNo))); 0902 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0903 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing preambentryle near line %1: Could not read entry id")).arg(state.lineNo))); 0904 #endif 0905 return nullptr; 0906 } 0907 } else { 0908 if (id.contains(QStringLiteral("\\")) || id.contains(QStringLiteral("{"))) { 0909 const QString newId = EncoderLaTeX::instance().decode(id); 0910 qCWarning(LOG_KBIBTEX_IO) << "Entry id" << id << "near line" << state.lineNo << "contains backslashes or curly brackets, converted to" << newId; 0911 /// Instead of an 'emit' ... 0912 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0913 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Entry id '%1' near line %2 contains backslashes or curly brackets, converted to '%3'")).arg(id).arg(state.lineNo).arg(newId))); 0914 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0915 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Entry id '%1' near line %2 contains backslashes or curly brackets, converted to '%3'")).arg(id).arg(state.lineNo).arg(newId))); 0916 #endif 0917 id = newId; 0918 } 0919 if (!Encoder::containsOnlyAscii(id)) { 0920 /// Try to avoid non-ascii characters in ids 0921 const QString newId = Encoder::instance().convertToPlainAscii(id); 0922 qCWarning(LOG_KBIBTEX_IO) << "Entry id" << id << "near line" << state.lineNo << "contains non-ASCII characters, converted to" << newId; 0923 /// Instead of an 'emit' ... 0924 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0925 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Entry id '%1' near line %2 contains non-ASCII characters, converted to '%3'")).arg(id).arg(state.lineNo).arg(newId))); 0926 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0927 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Entry id '%1' near line %2 contains non-ASCII characters, converted to '%3'")).arg(id).arg(state.lineNo).arg(newId))); 0928 #endif 0929 id = newId; 0930 } 0931 } 0932 static const QVector<QChar> invalidIdCharacters = {QLatin1Char('{'), QLatin1Char('}'), QLatin1Char(',')}; 0933 for (const QChar &invalidIdCharacter : invalidIdCharacters) 0934 if (id.contains(invalidIdCharacter)) { 0935 qCWarning(LOG_KBIBTEX_IO) << "Entry id" << id << "near line" << state.lineNo << "contains invalid character" << invalidIdCharacter; 0936 /// Instead of an 'emit' ... 0937 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0938 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Entry id '%1' near line %2 contains invalid character '%3'")).arg(id).arg(state.lineNo).arg(invalidIdCharacter))); 0939 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0940 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Entry id '%1' near line %2 contains invalid character '%3'")).arg(id).arg(state.lineNo).arg(invalidIdCharacter))); 0941 #endif 0942 return nullptr; 0943 } 0944 0945 /// Check for duplicate entry ids, avoid collisions 0946 if (state.knownElementIds.contains(id)) { 0947 static const QString newIdPattern = QStringLiteral("%1-%2"); 0948 int idx = 2; 0949 QString newId = newIdPattern.arg(id).arg(idx); 0950 while (state.knownElementIds.contains(newId)) 0951 newId = newIdPattern.arg(id).arg(++idx); 0952 qCDebug(LOG_KBIBTEX_IO) << "Duplicate id" << id << "near line" << state.lineNo << ", using replacement id" << newId; 0953 /// Instead of an 'emit' ... 0954 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0955 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Duplicate id '%1' near line %2, using replacement id '%3'")).arg(id).arg(state.lineNo).arg(newId))); 0956 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0957 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Duplicate id '%1' near line %2, using replacement id '%3'")).arg(id).arg(state.lineNo).arg(newId))); 0958 #endif 0959 id = newId; 0960 } 0961 state.knownElementIds.insert(id); 0962 0963 Entry *entry = new Entry(BibTeXEntries::instance().format(typeString), id); 0964 0965 token = nextToken(state); 0966 do { 0967 if (token == Token::BracketClose) 0968 break; 0969 else if (token == Token::EndOfFile) { 0970 #if QT_VERSION >= 0x050e00 0971 qCWarning(LOG_KBIBTEX_IO) << "Unexpected end of data in entry" << id << "near line" << state.lineNo << ":" << state.prevLine << Qt::endl << state.currentLine; 0972 #else // QT_VERSION < 0x050e00 0973 qCWarning(LOG_KBIBTEX_IO) << "Unexpected end of data in entry" << id << "near line" << state.lineNo << ":" << state.prevLine << endl << state.currentLine; 0974 #endif // QT_VERSION >= 0x050e00 0975 /// Instead of an 'emit' ... 0976 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0977 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Unexpected end of data in entry '%1' near line %2")).arg(id).arg(state.lineNo))); 0978 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0979 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Unexpected end of data in entry '%1' near line %2")).arg(id).arg(state.lineNo))); 0980 #endif 0981 delete entry; 0982 return nullptr; 0983 } else if (token != Token::Comma) { 0984 if (state.nextChar.isLetter()) { 0985 #if QT_VERSION >= 0x050e00 0986 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Comma symbol ',' expected but got character" << state.nextChar << "(token" << tokenidToString(token) << ")"; 0987 #else // QT_VERSION < 0x050e00 0988 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Comma symbol ',' expected but got character" << state.nextChar << "(token" << tokenidToString(token) << ")"; 0989 #endif // QT_VERSION >= 0x050e00 0990 /// Instead of an 'emit' ... 0991 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 0992 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1' near line %2: Comma symbol ',' expected but got character '%3' (token %4)")).arg(id).arg(state.lineNo).arg(state.nextChar).arg(tokenidToString(token)))); 0993 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 0994 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1' near line %2: Comma symbol ',' expected but got character '%3' (token %4)")).arg(id).arg(state.lineNo).arg(state.nextChar).arg(tokenidToString(token)))); 0995 #endif 0996 } else if (state.nextChar.isPrint()) { 0997 #if QT_VERSION >= 0x050e00 0998 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Comma symbol ',' expected but got character" << state.nextChar << "(" << QString(QStringLiteral("0x%1")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')) << ", token" << tokenidToString(token) << ")"; 0999 #else // QT_VERSION < 0x050e00 1000 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Comma symbol ',' expected but got character" << state.nextChar << "(" << QString(QStringLiteral("0x%1")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')) << ", token" << tokenidToString(token) << ")"; 1001 #endif // QT_VERSION >= 0x050e00 1002 /// Instead of an 'emit' ... 1003 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1004 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1' near line %2: Comma symbol ',' expected but got character '%3' (0x%4, token %5)")).arg(id).arg(state.lineNo).arg(state.nextChar).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')).arg(tokenidToString(token)))); 1005 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1006 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1' near line %2: Comma symbol ',' expected but got character '%3' (0x%4, token %5)")).arg(id).arg(state.lineNo).arg(state.nextChar).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')).arg(tokenidToString(token)))); 1007 #endif 1008 } else { 1009 #if QT_VERSION >= 0x050e00 1010 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Comma symbol (,) expected but got character" << QString(QStringLiteral("0x%1")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')) << "(token" << tokenidToString(token) << ")"; 1011 #else // QT_VERSION < 0x050e00 1012 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Comma symbol (,) expected but got character" << QString(QStringLiteral("0x%1")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')) << "(token" << tokenidToString(token) << ")"; 1013 #endif // QT_VERSION >= 0x050e00 1014 /// Instead of an 'emit' ... 1015 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1016 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1' near line %2: Comma symbol ',' expected but got character 0x%3 (token %4)")).arg(id).arg(state.lineNo).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')).arg(tokenidToString(token)))); 1017 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1018 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1' near line %2: Comma symbol ',' expected but got character 0x%3 (token %4)")).arg(id).arg(state.lineNo).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')).arg(tokenidToString(token)))); 1019 #endif 1020 } 1021 delete entry; 1022 return nullptr; 1023 } 1024 1025 QString keyName = BibTeXFields::instance().format(readSimpleString(state), keywordCasing); 1026 if (keyName.isEmpty()) { 1027 token = nextToken(state); 1028 if (token == Token::BracketClose) { 1029 /// Most often it is the case that the previous line ended with a comma, 1030 /// implying that this entry continues, but instead it gets closed by 1031 /// a closing curly bracket. 1032 #if QT_VERSION >= 0x050e00 1033 qCDebug(LOG_KBIBTEX_IO) << "Issue while parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Last key-value pair ended with a non-conformant comma, ignoring that"; 1034 #else // QT_VERSION < 0x050e00 1035 qCDebug(LOG_KBIBTEX_IO) << "Issue while parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Last key-value pair ended with a non-conformant comma, ignoring that"; 1036 #endif // QT_VERSION >= 0x050e00 1037 /// Instead of an 'emit' ... 1038 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1039 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Issue while parsing entry '%1' near line %2: Last key-value pair ended with a non-conformant comma, ignoring that")).arg(id).arg(state.lineNo))); 1040 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1041 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Issue while parsing entry '%1' near line %2: Last key-value pair ended with a non-conformant comma, ignoring that")).arg(id).arg(state.lineNo))); 1042 #endif 1043 break; 1044 } else { 1045 /// Something looks terribly wrong 1046 #if QT_VERSION >= 0x050e00 1047 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Closing curly bracket expected, but found" << tokenidToString(token); 1048 #else // QT_VERSION < 0x050e00 1049 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Closing curly bracket expected, but found" << tokenidToString(token); 1050 #endif // QT_VERSION >= 0x050e00 1051 /// Instead of an 'emit' ... 1052 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1053 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1' near line %2: Closing curly bracket expected, but found %3")).arg(id).arg(state.lineNo).arg(tokenidToString(token)))); 1054 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1055 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1' near line %2: Closing curly bracket expected, but found %3")).arg(id).arg(state.lineNo).arg(tokenidToString(token)))); 1056 #endif 1057 delete entry; 1058 return nullptr; 1059 } 1060 } 1061 /// Try to avoid non-ascii characters in keys 1062 const QString newkeyName = Encoder::instance().convertToPlainAscii(keyName); 1063 if (newkeyName != keyName) { 1064 qCWarning(LOG_KBIBTEX_IO) << "Field name " << keyName << "near line" << state.lineNo << "contains non-ASCII characters, converted to" << newkeyName; 1065 /// Instead of an 'emit' ... 1066 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1067 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Field name '%1' near line %2 contains non-ASCII characters, converted to '%3'")).arg(keyName).arg(state.lineNo).arg(newkeyName))); 1068 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1069 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Field name '%1' near line %2 contains non-ASCII characters, converted to '%3'")).arg(keyName).arg(state.lineNo).arg(newkeyName))); 1070 #endif 1071 keyName = newkeyName; 1072 } 1073 1074 token = nextToken(state); 1075 if (token != Token::Assign) { 1076 #if QT_VERSION >= 0x050e00 1077 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << ", field name" << keyName << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "): Assign symbol '=' expected after field name"; 1078 #else // QT_VERSION < 0x050e00 1079 qCWarning(LOG_KBIBTEX_IO) << "Error in parsing entry" << id << ", field name" << keyName << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "): Assign symbol '=' expected after field name"; 1080 #endif // QT_VERSION >= 0x050e00 1081 /// Instead of an 'emit' ... 1082 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1083 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1', field name '%2' near line %3: Assign symbol '=' expected after field name")).arg(id, keyName).arg(state.lineNo))); 1084 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1085 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Error in parsing entry '%1', field name '%2' near line %3: Assign symbol '=' expected after field name")).arg(id, keyName).arg(state.lineNo))); 1086 #endif 1087 delete entry; 1088 return nullptr; 1089 } 1090 1091 Value value; 1092 1093 /// check for duplicate fields 1094 if (entry->contains(keyName)) { 1095 if (keyName.toLower() == Entry::ftKeywords || keyName.toLower() == Entry::ftUrl) { 1096 /// Special handling of keywords and URLs: instead of using fallback names 1097 /// like "keywords2", "keywords3", ..., append new keywords to 1098 /// already existing keyword value 1099 value = entry->value(keyName); 1100 } else if (keysForPersonDetection.contains(keyName.toLower())) { 1101 /// Special handling of authors and editors: instead of using fallback names 1102 /// like "author2", "author3", ..., append new authors to 1103 /// already existing author value 1104 value = entry->value(keyName); 1105 } else { 1106 int i = 2; 1107 QString appendix = QString::number(i); 1108 while (entry->contains(keyName + appendix)) { 1109 ++i; 1110 appendix = QString::number(i); 1111 } 1112 #if QT_VERSION >= 0x050e00 1113 qCDebug(LOG_KBIBTEX_IO) << "Entry" << id << "already contains a key" << keyName << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << "), using" << (keyName + appendix); 1114 #else // QT_VERSION < 0x050e00 1115 qCDebug(LOG_KBIBTEX_IO) << "Entry" << id << "already contains a key" << keyName << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << "), using" << (keyName + appendix); 1116 #endif // QT_VERSION >= 0x050e00 1117 /// Instead of an 'emit' ... 1118 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1119 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Entry '%1' already contains a key '%2' near line %4, using '%3'")).arg(id, keyName, keyName + appendix).arg(state.lineNo))); 1120 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1121 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Entry '%1' already contains a key '%2' near line %4, using '%3'")).arg(id, keyName, keyName + appendix).arg(state.lineNo))); 1122 #endif 1123 keyName += appendix; 1124 } 1125 } 1126 1127 token = readValue(value, keyName, statistics, state); 1128 if (token != Token::BracketClose && token != Token::Comma) { 1129 #if QT_VERSION >= 0x050e00 1130 qCWarning(LOG_KBIBTEX_IO) << "Failed to read value in entry" << id << ", field name" << keyName << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << ")"; 1131 #else // QT_VERSION < 0x050e00 1132 qCWarning(LOG_KBIBTEX_IO) << "Failed to read value in entry" << id << ", field name" << keyName << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << ")"; 1133 #endif // QT_VERSION >= 0x050e00 1134 /// Instead of an 'emit' ... 1135 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1136 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Failed to read value in entry '%1', field name '%2' near line %3")).arg(id, keyName).arg(state.lineNo))); 1137 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1138 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Failed to read value in entry '%1', field name '%2' near line %3")).arg(id, keyName).arg(state.lineNo))); 1139 #endif 1140 delete entry; 1141 return nullptr; 1142 } 1143 1144 entry->insert(keyName, value); 1145 } while (true); 1146 1147 return entry; 1148 } 1149 1150 Element *nextElement(Statistics &statistics, State &state) 1151 { 1152 Token token = nextToken(state); 1153 1154 if (token == Token::At) { 1155 const QString elementType = readSimpleString(state); 1156 const QString elementTypeLower = elementType.toLower(); 1157 1158 if (elementTypeLower == QStringLiteral("comment")) { 1159 Comment *comment {readCommentElement(state)}; 1160 if (comment != nullptr) 1161 statistics.countCommentContext.insert(QStringLiteral("@"), statistics.countCommentContext.value(QStringLiteral("@"), 0) + 1); 1162 return comment; 1163 } else if (elementTypeLower == QStringLiteral("string")) 1164 return readMacroElement(statistics, state); 1165 else if (elementTypeLower == QStringLiteral("preamble")) 1166 return readPreambleElement(statistics, state); 1167 else if (elementTypeLower == QStringLiteral("import")) { 1168 qCDebug(LOG_KBIBTEX_IO) << "Skipping potential HTML/JavaScript @import statement near line" << state.lineNo; 1169 /// Instead of an 'emit' ... 1170 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1171 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Skipping potential HTML/JavaScript @import statement near line %1")).arg(state.lineNo))); 1172 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1173 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Skipping potential HTML/JavaScript @import statement near line %1")).arg(state.lineNo))); 1174 #endif 1175 return nullptr; 1176 } else if (!elementType.isEmpty()) 1177 return readEntryElement(elementType, statistics, state); 1178 else { 1179 qCWarning(LOG_KBIBTEX_IO) << "Element type after '@' is empty or invalid near line" << state.lineNo; 1180 /// Instead of an 'emit' ... 1181 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1182 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Element type after '@' is empty or invalid near line %1")).arg(state.lineNo))); 1183 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1184 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Element type after '@' is empty or invalid near line %1")).arg(state.lineNo))); 1185 #endif 1186 return nullptr; 1187 } 1188 } else if (token == Token::Unknown && state.nextChar == QLatin1Char('%')) { 1189 // Do not complain about LaTeX-like comments, just eat them 1190 Comment *comment {readPlainCommentElement(QString(state.nextChar), state)}; 1191 if (comment != nullptr) 1192 statistics.countCommentContext.insert(comment->prefix(), statistics.countCommentContext.value(comment->prefix(), 0) + 1); 1193 return comment; 1194 } else if (token == Token::Unknown) { 1195 if (state.nextChar.isLetter()) { 1196 #if QT_VERSION >= 0x050e00 1197 qCDebug(LOG_KBIBTEX_IO) << "Unknown character" << state.nextChar << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << ")" << ", treating as comment"; 1198 #else // QT_VERSION < 0x050e00 1199 qCDebug(LOG_KBIBTEX_IO) << "Unknown character" << state.nextChar << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << ")" << ", treating as comment"; 1200 #endif // QT_VERSION >= 0x050e00 1201 /// Instead of an 'emit' ... 1202 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1203 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Unknown character '%1' near line %2, treating as comment")).arg(state.nextChar).arg(state.lineNo))); 1204 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1205 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Unknown character '%1' near line %2, treating as comment")).arg(state.nextChar).arg(state.lineNo))); 1206 #endif 1207 } else if (state.nextChar.isPrint()) { 1208 #if QT_VERSION >= 0x050e00 1209 qCDebug(LOG_KBIBTEX_IO) << "Unknown character" << state.nextChar << "(" << QString(QStringLiteral("0x%1")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')) << ") near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << ")" << ", treating as comment"; 1210 #else // QT_VERSION < 0x050e00 1211 qCDebug(LOG_KBIBTEX_IO) << "Unknown character" << state.nextChar << "(" << QString(QStringLiteral("0x%1")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')) << ") near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << ")" << ", treating as comment"; 1212 #endif // QT_VERSION >= 0x050e00 1213 /// Instead of an 'emit' ... 1214 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1215 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Unknown character '%1' (0x%2) near line %3, treating as comment")).arg(state.nextChar).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')).arg(state.lineNo))); 1216 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1217 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Unknown character '%1' (0x%2) near line %3, treating as comment")).arg(state.nextChar).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')).arg(state.lineNo))); 1218 #endif 1219 } else { 1220 #if QT_VERSION >= 0x050e00 1221 qCDebug(LOG_KBIBTEX_IO) << "Unknown character" << QString(QStringLiteral("0x%1")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')) << "near line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << ")" << ", treating as comment"; 1222 #else // QT_VERSION < 0x050e00 1223 qCDebug(LOG_KBIBTEX_IO) << "Unknown character" << QString(QStringLiteral("0x%1")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')) << "near line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << ")" << ", treating as comment"; 1224 #endif // QT_VERSION >= 0x050e00 1225 /// Instead of an 'emit' ... 1226 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1227 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Unknown character 0x%1 near line %2, treating as comment")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')).arg(state.lineNo))); 1228 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1229 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Info), Q_ARG(QString, QString(QStringLiteral("Unknown character 0x%1 near line %2, treating as comment")).arg(state.nextChar.unicode(), 4, 16, QLatin1Char('0')).arg(state.lineNo))); 1230 #endif 1231 } 1232 1233 Comment *comment {readPlainCommentElement(QString(state.prevChar) + state.nextChar, state)}; 1234 if (comment != nullptr) 1235 statistics.countCommentContext.insert(QString(), statistics.countCommentContext.value(QString(), 0) + 1); 1236 return comment; 1237 } 1238 1239 if (token != Token::EndOfFile) { 1240 #if QT_VERSION >= 0x050e00 1241 qCWarning(LOG_KBIBTEX_IO) << "Don't know how to parse next token of type" << tokenidToString(token) << "in line" << state.lineNo << "(" << state.prevLine << Qt::endl << state.currentLine << ")"; 1242 #else // QT_VERSION < 0x050e00 1243 qCWarning(LOG_KBIBTEX_IO) << "Don't know how to parse next token of type" << tokenidToString(token) << "in line" << state.lineNo << "(" << state.prevLine << endl << state.currentLine << ")"; 1244 #endif // QT_VERSION >= 0x050e00 1245 /// Instead of an 'emit' ... 1246 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1247 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Don't know how to parse next token of type %1 in line %2")).arg(tokenidToString(token)).arg(state.lineNo))); 1248 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1249 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Error), Q_ARG(QString, QString(QStringLiteral("Don't know how to parse next token of type %1 in line %2")).arg(tokenidToString(token)).arg(state.lineNo))); 1250 #endif 1251 } 1252 1253 return nullptr; 1254 } 1255 1256 1257 static QSharedPointer<Person> personFromString(const QString &name, CommaContainment *comma, const int line_number, QObject *parent) 1258 { 1259 // TODO Merge with FileImporter::splitName and FileImporterBibTeX::contextSensitiveSplit 1260 static QStringList tokens; 1261 contextSensitiveSplit(name, tokens); 1262 return personFromTokenList(tokens, comma, line_number, parent); 1263 } 1264 1265 static QSharedPointer<Person> personFromTokenList(const QStringList &tokens, CommaContainment *comma, const int line_number, QObject *parent) 1266 { 1267 if (comma != nullptr) *comma = CommaContainment::None; 1268 1269 /// Simple case: provided list of tokens is empty, return invalid Person 1270 if (tokens.isEmpty()) 1271 return QSharedPointer<Person>(); 1272 1273 /** 1274 * The sequence of tokens may contain in up to two of its elements one comma each: 1275 * {"Tuckwell,", "Peter,", "Jr."}. In this case, fill three string lists: 1276 * one with tokens before the first comma, one with tokens after the second commas, 1277 * and one with tokens after the second commas. If commas appear in the middle of a 1278 * token, split token into two new tokens and add them to two different string lists. 1279 * The comma itself will not be part of any string in the string lists. 1280 * Example: 1281 * partA = ( "Tuckwell" ); partB = ( "Peter" ); partC = ( "Jr." ) 1282 * If a comma was found, boolean variable gotComma is set. 1283 */ 1284 QStringList partA, partB, partC; 1285 int commaCount = 0; 1286 for (const QString &token : tokens) { 1287 /// Position where comma was found, or -1 if no comma in token 1288 int p = -1; 1289 if (commaCount < 2) { 1290 /// Only check if token contains comma 1291 /// if no comma was found before 1292 int bracketCounter = 0; 1293 for (int i = 0; i < token.length(); ++i) { 1294 /// Consider opening curly brackets 1295 if (token[i] == QLatin1Char('{')) ++bracketCounter; 1296 /// Consider closing curly brackets 1297 else if (token[i] == QLatin1Char('}')) --bracketCounter; 1298 /// Only if outside any open curly bracket environments 1299 /// consider comma characters 1300 else if (bracketCounter == 0 && token[i] == QLatin1Char(',')) { 1301 /// Memorize comma's position and break from loop 1302 p = i; 1303 break; 1304 } else if (bracketCounter < 0) { 1305 /// Should never happen: more closing brackets than opening ones 1306 qCWarning(LOG_KBIBTEX_IO) << "Opening and closing brackets do not match near line" << line_number; 1307 if (parent != nullptr) 1308 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1309 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Opening and closing brackets do not match near line %1")).arg(line_number))); 1310 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1311 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Opening and closing brackets do not match near line %1")).arg(line_number))); 1312 #endif 1313 } 1314 } 1315 } 1316 1317 if (p >= 0) { 1318 if (commaCount == 0) { 1319 if (p > 0) partA.append(token.left(p)); 1320 if (p < token.length() - 1) partB.append(token.mid(p + 1)); 1321 } else if (commaCount == 1) { 1322 if (p > 0) partB.append(token.left(p)); 1323 if (p < token.length() - 1) partC.append(token.mid(p + 1)); 1324 } 1325 ++commaCount; 1326 } else if (commaCount == 0) 1327 partA.append(token); 1328 else if (commaCount == 1) 1329 partB.append(token); 1330 else if (commaCount == 2) 1331 partC.append(token); 1332 } 1333 if (commaCount > 0) { 1334 if (comma != nullptr) *comma = CommaContainment::Contains; 1335 return QSharedPointer<Person>(new Person(partC.isEmpty() ? partB.join(QLatin1Char(' ')) : partC.join(QLatin1Char(' ')), partA.join(QLatin1Char(' ')), partC.isEmpty() ? QString() : partB.join(QLatin1Char(' ')))); 1336 } 1337 1338 /** 1339 * PubMed uses a special writing style for names, where the 1340 * last name is followed by single capital letters, each being 1341 * the first letter of each first name. Example: Tuckwell P H 1342 * So, check how many single capital letters are at the end of 1343 * the given token list 1344 */ 1345 partA.clear(); partB.clear(); 1346 bool singleCapitalLetters = true; 1347 QStringList::ConstIterator it = tokens.constEnd(); 1348 while (it != tokens.constBegin()) { 1349 --it; 1350 if (singleCapitalLetters && it->length() == 1 && it->at(0).isUpper()) 1351 partB.prepend(*it); 1352 else { 1353 singleCapitalLetters = false; 1354 partA.prepend(*it); 1355 } 1356 } 1357 if (!partB.isEmpty()) { 1358 /// Name was actually given in PubMed format 1359 return QSharedPointer<Person>(new Person(partB.join(QLatin1Char(' ')), partA.join(QLatin1Char(' ')))); 1360 } 1361 1362 /** 1363 * Normally, the last upper case token in a name is the last name 1364 * (last names consisting of multiple space-separated parts *have* 1365 * to be protected by {...}), but some languages have fill words 1366 * in lower case belonging to the last name as well (example: "van"). 1367 * In addition, some languages have capital case letters as well 1368 * (example: "Di Cosmo"). 1369 * Exception: Special keywords such as "Jr." can be appended to the 1370 * name, not counted as part of the last name. 1371 */ 1372 partA.clear(); partB.clear(); partC.clear(); 1373 static const QSet<QString> capitalCaseLastNameFragments {QStringLiteral("Di")}; 1374 it = tokens.constEnd(); 1375 while (it != tokens.constBegin()) { 1376 --it; 1377 if (partB.isEmpty() && (it->toLower().startsWith(QStringLiteral("jr")) || it->toLower().startsWith(QStringLiteral("sr")) || it->toLower().startsWith(QStringLiteral("iii")))) 1378 /// handle name suffices like "Jr" or "III." 1379 partC.prepend(*it); 1380 else if (partB.isEmpty() || it->at(0).isLower() || capitalCaseLastNameFragments.contains(*it)) 1381 partB.prepend(*it); 1382 else 1383 partA.prepend(*it); 1384 } 1385 if (!partB.isEmpty()) { 1386 /// Name was actually like "Peter Ole van der Tuckwell", 1387 /// split into "Peter Ole" and "van der Tuckwell" 1388 return QSharedPointer<Person>(new Person(partA.join(QLatin1Char(' ')), partB.join(QLatin1Char(' ')), partC.isEmpty() ? QString() : partC.join(QLatin1Char(' ')))); 1389 } 1390 1391 qCWarning(LOG_KBIBTEX_IO) << "Don't know how to handle name" << tokens.join(QLatin1Char(' ')) << "near line" << line_number; 1392 if (parent != nullptr) 1393 #if QT_VERSION < QT_VERSION_CHECK(6, 5, 0) 1394 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QGenericReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Don't know how to handle name '%1' near line %2")).arg(tokens.join(QLatin1Char(' '))).arg(line_number))); 1395 #else // QT_VERSION >= QT_VERSION_CHECK(6, 5, 0) 1396 QMetaObject::invokeMethod(parent, "message", Qt::DirectConnection, QMetaMethodReturnArgument(), Q_ARG(FileImporter::MessageSeverity, MessageSeverity::Warning), Q_ARG(QString, QString(QStringLiteral("Don't know how to handle name '%1' near line %2")).arg(tokens.join(QLatin1Char(' '))).arg(line_number))); 1397 #endif 1398 return QSharedPointer<Person>(); 1399 } 1400 1401 }; 1402 1403 const QStringList FileImporterBibTeX::Private::keysForPersonDetection {Entry::ftAuthor, Entry::ftEditor, QStringLiteral("bookauthor") /** used by JSTOR */}; 1404 1405 1406 FileImporterBibTeX::FileImporterBibTeX(QObject *parent) 1407 : FileImporter(parent), d(new Private(this)), m_cancelFlag(false) 1408 { 1409 /// nothing 1410 } 1411 1412 FileImporterBibTeX::~FileImporterBibTeX() 1413 { 1414 delete d; 1415 } 1416 1417 File *FileImporterBibTeX::fromString(const QString &rawText) 1418 { 1419 if (rawText.isEmpty()) { 1420 qCInfo(LOG_KBIBTEX_IO) << "BibTeX data converted to string is empty"; 1421 Q_EMIT message(MessageSeverity::Warning, QStringLiteral("BibTeX data converted to string is empty")); 1422 return new File(); 1423 } 1424 1425 File *result = new File(); 1426 1427 /** Remove HTML code from the input source */ 1428 // FIXME HTML data should be removed somewhere else? onlinesearch ... 1429 const int originalLength = rawText.length(); 1430 QString internalRawText = rawText; 1431 internalRawText = internalRawText.remove(KBibTeX::htmlRegExp); 1432 const int afterHTMLremovalLength = internalRawText.length(); 1433 if (originalLength != afterHTMLremovalLength) { 1434 qCInfo(LOG_KBIBTEX_IO) << (originalLength - afterHTMLremovalLength) << "characters of HTML tags have been removed"; 1435 Q_EMIT message(MessageSeverity::Info, QString(QStringLiteral("%1 characters of HTML tags have been removed")).arg(originalLength - afterHTMLremovalLength)); 1436 } 1437 1438 Private::Statistics statistics; 1439 Private::State state(new QTextStream(&internalRawText, QIODevice::ReadOnly)); 1440 d->readChar(state); 1441 1442 bool gotAtLeastOneElement = false; 1443 QString previousEntryId; 1444 while (!state.nextChar.isNull() && !m_cancelFlag && !state.textStream->atEnd()) { 1445 Q_EMIT progress(qint64toint(state.textStream->pos()), internalRawText.length()); 1446 Element *element = d->nextElement(statistics, state); 1447 1448 if (element != nullptr) { 1449 gotAtLeastOneElement = true; 1450 if (d->commentHandling == CommentHandling::Keep || !Comment::isComment(*element)) { 1451 result->append(QSharedPointer<Element>(element)); 1452 1453 Entry *currentEntry = dynamic_cast<Entry *>(element); 1454 if (currentEntry != nullptr) { 1455 if (!previousEntryId.isEmpty()) { 1456 if (currentEntry->id() >= previousEntryId) 1457 ++statistics.countSortedByIdentifier; 1458 else 1459 ++statistics.countNotSortedByIdentifier; 1460 } 1461 previousEntryId = currentEntry->id(); 1462 } 1463 } else 1464 delete element; 1465 } 1466 } 1467 1468 if (!gotAtLeastOneElement) { 1469 qCWarning(LOG_KBIBTEX_IO) << "In non-empty input, did not find a single BibTeX element"; 1470 Q_EMIT message(MessageSeverity::Error, QStringLiteral("In non-empty input, did not find a single BibTeX element")); 1471 delete result; 1472 result = nullptr; 1473 } 1474 1475 Q_EMIT progress(100, 100); 1476 1477 if (m_cancelFlag) { 1478 qCWarning(LOG_KBIBTEX_IO) << "Loading bibliography data has been canceled"; 1479 Q_EMIT message(MessageSeverity::Error, QStringLiteral("Loading bibliography data has been canceled")); 1480 delete result; 1481 result = nullptr; 1482 } 1483 1484 delete state.textStream; 1485 1486 if (result != nullptr) { 1487 /// Set the file's preferences for string delimiters 1488 /// deduced from statistics built while parsing the file 1489 result->setProperty(File::StringDelimiter, statistics.countQuotationMarks > statistics.countCurlyBrackets ? QStringLiteral("\"\"") : QStringLiteral("{}")); 1490 /// Set the file's preferences for name formatting 1491 result->setProperty(File::NameFormatting, statistics.countFirstNameFirst > statistics.countLastNameFirst ? Preferences::personNameFormatFirstLast : Preferences::personNameFormatLastFirst); 1492 /// Set the file's preferences for title protected 1493 Qt::CheckState triState = (statistics.countProtectedTitle > statistics.countUnprotectedTitle * 4) ? Qt::Checked : ((statistics.countProtectedTitle * 4 < statistics.countUnprotectedTitle) ? Qt::Unchecked : Qt::PartiallyChecked); 1494 result->setProperty(File::ProtectCasing, static_cast<int>(triState)); 1495 // Set the file's preferences for comment context 1496 QString commentContextMapKey; 1497 int commentContextMapValue = -1; 1498 for (QHash<QString, int>::ConstIterator it = statistics.countCommentContext.constBegin(); it != statistics.countCommentContext.constEnd(); ++it) 1499 if (it.value() > commentContextMapValue) { 1500 commentContextMapKey = it.key(); 1501 commentContextMapValue = it.value(); 1502 } 1503 if (commentContextMapValue < 0) { 1504 // No comments in BibTeX file? Use value from Preferences ... 1505 result->setProperty(File::CommentContext, static_cast<int>(Preferences::instance().bibTeXCommentContext())); 1506 result->setProperty(File::CommentPrefix, Preferences::instance().bibTeXCommentPrefix()); 1507 } else if (commentContextMapKey == QStringLiteral("@")) { 1508 result->setProperty(File::CommentContext, static_cast<int>(Preferences::CommentContext::Command)); 1509 result->setProperty(File::CommentPrefix, QString()); 1510 } else if (commentContextMapKey.isEmpty()) { 1511 result->setProperty(File::CommentContext, static_cast<int>(Preferences::CommentContext::Verbatim)); 1512 result->setProperty(File::CommentPrefix, QString()); 1513 } else { 1514 result->setProperty(File::CommentContext, static_cast<int>(Preferences::CommentContext::Prefix)); 1515 result->setProperty(File::CommentPrefix, commentContextMapKey); 1516 } 1517 if (!statistics.mostRecentListSeparator.isEmpty()) 1518 result->setProperty(File::ListSeparator, statistics.mostRecentListSeparator); 1519 /// Set the file's preference to have the entries sorted by identifier 1520 result->setProperty(File::SortedByIdentifier, statistics.countSortedByIdentifier >= statistics.countNotSortedByIdentifier * 10); 1521 // TODO gather more statistics for keyword casing etc. 1522 } 1523 1524 return result; 1525 } 1526 1527 File *FileImporterBibTeX::load(QIODevice *iodevice) 1528 { 1529 m_cancelFlag = false; 1530 1531 check_if_iodevice_invalid(iodevice); 1532 1533 QByteArray rawData = iodevice->readAll(); 1534 iodevice->close(); 1535 1536 bool encodingMayGetDeterminedByRawData = true; 1537 QString encoding(Preferences::instance().bibTeXEncoding()); ///< default value taken from Preferences 1538 if (rawData.length() >= 8 && rawData.at(0) != 0 && rawData.at(1) == 0 && rawData.at(2) == 0 && rawData.at(3) == 0 && rawData.at(4) != 0 && rawData.at(5) == 0 && rawData.at(6) == 0 && rawData.at(7) == 0) { 1539 /// UTF-32LE (Little Endian) 1540 encoding = QStringLiteral("UTF-32LE"); 1541 encodingMayGetDeterminedByRawData = false; 1542 } else if (rawData.length() >= 4 && static_cast<unsigned char>(rawData.at(0)) == 0xff && static_cast<unsigned char>(rawData.at(1)) == 0xfe && rawData.at(2) == 0 && rawData.at(3) == 0) { 1543 /// UTF-32LE (Little Endian) with BOM 1544 encoding = QStringLiteral("UTF-32LE"); 1545 rawData = rawData.mid(4); ///< skip BOM 1546 encodingMayGetDeterminedByRawData = false; 1547 } else if (rawData.length() >= 8 && rawData.at(0) == 0 && rawData.at(1) == 0 && rawData.at(2) == 0 && rawData.at(3) != 0 && rawData.at(4) == 0 && rawData.at(5) == 0 && rawData.at(6) == 0 && rawData.at(7) != 0) { 1548 /// UTF-32BE (Big Endian) 1549 encoding = QStringLiteral("UTF-32BE"); 1550 encodingMayGetDeterminedByRawData = false; 1551 } else if (rawData.length() >= 4 && static_cast<unsigned char>(rawData.at(0)) == 0 && static_cast<unsigned char>(rawData.at(1)) == 0 && static_cast<unsigned char>(rawData.at(2)) == 0xfe && static_cast<unsigned char>(rawData.at(3)) == 0xff) { 1552 /// UTF-32BE (Big Endian) with BOM 1553 encoding = QStringLiteral("UTF-32BE"); 1554 rawData = rawData.mid(4); ///< skip BOM 1555 encodingMayGetDeterminedByRawData = false; 1556 } else if (rawData.length() >= 6 && rawData.at(0) != 0 && rawData.at(1) == 0 && rawData.at(2) != 0 && rawData.at(3) == 0 && rawData.at(4) != 0 && rawData.at(5) == 0) { 1557 /// UTF-16LE (Little Endian) 1558 encoding = QStringLiteral("UTF-16LE"); 1559 encodingMayGetDeterminedByRawData = false; 1560 } else if (rawData.length() >= 4 && static_cast<unsigned char>(rawData.at(0)) == 0xff && static_cast<unsigned char>(rawData.at(1)) == 0xfe && rawData.at(2) != 0 && rawData.at(3) == 0) { 1561 /// UTF-16LE (Little Endian) with BOM 1562 encoding = QStringLiteral("UTF-16LE"); 1563 rawData = rawData.mid(2); ///< skip BOM 1564 encodingMayGetDeterminedByRawData = false; 1565 } else if (rawData.length() >= 6 && rawData.at(0) == 0 && rawData.at(1) != 0 && rawData.at(2) == 0 && rawData.at(3) != 0 && rawData.at(4) == 0 && rawData.at(5) != 0) { 1566 /// UTF-16BE (Big Endian) 1567 encoding = QStringLiteral("UTF-16BE"); 1568 encodingMayGetDeterminedByRawData = false; 1569 } else if (rawData.length() >= 4 && static_cast<unsigned char>(rawData.at(0)) == 0xfe && static_cast<unsigned char>(rawData.at(1)) == 0xff && rawData.at(2) == 0 && rawData.at(3) != 0) { 1570 /// UTF-16BE (Big Endian) with BOM 1571 encoding = QStringLiteral("UTF-16BE"); 1572 rawData = rawData.mid(2); ///< skip BOM 1573 encodingMayGetDeterminedByRawData = false; 1574 } else if (rawData.length() >= 3 && static_cast<unsigned char>(rawData.at(0)) == 0xef && static_cast<unsigned char>(rawData.at(1)) == 0xbb && static_cast<unsigned char>(rawData.at(2)) == 0xbf) { 1575 /// UTF-8 BOM 1576 encoding = QStringLiteral("UTF-8"); 1577 rawData = rawData.mid(3); ///< skip BOM 1578 encodingMayGetDeterminedByRawData = false; 1579 } else { 1580 /// Assuming that encoding is ASCII-compatible, thus it is possible 1581 /// to search for a byte sequence containin ASCII text 1582 const QByteArray rawDataBeginning = rawData.left(8192); 1583 const int xkbibtexencodingpos = qMax(rawDataBeginning.indexOf("@comment{x-kbibtex-encoding="), rawDataBeginning.indexOf("@Comment{x-kbibtex-encoding=")); 1584 if (xkbibtexencodingpos >= 0) { 1585 int i = xkbibtexencodingpos + 28, l = 0; 1586 encoding.clear(); 1587 encoding.reserve(32); 1588 while (l < 32 && rawData.at(i) >= 0x20 && rawData.at(i) != QLatin1Char('\n') && rawData.at(i) != QLatin1Char('\r') && rawData.at(i) != QLatin1Char('}') && rawData.at(i) != QLatin1Char(')') && static_cast<unsigned char>(rawData.at(i)) < 0x80) { 1589 encoding.append(QLatin1Char(rawData.at(i))); 1590 ++i; 1591 ++l; 1592 } 1593 rawData = rawData.left(xkbibtexencodingpos) + rawData.mid(i + 1); ///< remove encoding comment 1594 encodingMayGetDeterminedByRawData = encoding.isEmpty(); 1595 } else { 1596 const int jabrefencodingpos = qMax(rawDataBeginning.indexOf("% Encoding:"), rawDataBeginning.indexOf("% encoding:")); 1597 if (jabrefencodingpos >= 0) { 1598 int i = jabrefencodingpos + 11, l = 0; 1599 encoding.clear(); 1600 encoding.reserve(32); 1601 while (l < 32 && rawData.at(i) >= 0x20 && rawData.at(i) != QLatin1Char('\n') && rawData.at(i) != QLatin1Char('\r') && rawData.at(i) != QLatin1Char('}') && rawData.at(i) != QLatin1Char(')') && static_cast<unsigned char>(rawData.at(i)) < 0x80) { 1602 encoding.append(QLatin1Char(rawData.at(i))); 1603 ++i; 1604 ++l; 1605 } 1606 encoding = encoding.trimmed(); 1607 rawData = rawData.left(jabrefencodingpos) + rawData.mid(i + 1); ///< remove encoding comment 1608 encodingMayGetDeterminedByRawData = encoding.isEmpty(); 1609 } else { 1610 bool prevByteHadMSBset = false; 1611 bool prevPrevByteHadMSBset = false; 1612 for (const char &c : rawDataBeginning) { 1613 const bool hasMSBset{static_cast<unsigned char>(c) >= 128}; 1614 if (!prevPrevByteHadMSBset && prevByteHadMSBset && !hasMSBset) { 1615 // There was a single byte which had its most-significant bit (MSB) set, 1616 // surrounded by pure-ASCII bytes. As at least in UTF-8 no single bytes 1617 // with MSB set exist, guess that the data is ISO-8859-15, which seems 1618 // to be the most popular non-ASCII and non-Unicode encoding 1619 encoding = QStringLiteral("ISO-8859-15"); 1620 encodingMayGetDeterminedByRawData = false; 1621 break; 1622 } 1623 prevPrevByteHadMSBset = prevByteHadMSBset; 1624 prevByteHadMSBset = hasMSBset; 1625 } 1626 } 1627 } 1628 } 1629 1630 if (encoding.isEmpty()) { 1631 encoding = Preferences::instance().bibTeXEncoding(); ///< just in case something went wrong 1632 encodingMayGetDeterminedByRawData = true; 1633 } 1634 1635 if (encodingMayGetDeterminedByRawData) { 1636 // Unclear which encoding raw data makes use of, so test for 1637 // two popular choices: (1) only ASCII (means 'LaTeX' encoding) 1638 // and (2) UTF-8 1639 bool hasUTF8 = false; 1640 bool outsideUTF8 = false; 1641 const int len = qMin(2048, rawData.length() - 3); 1642 for (int i = 0; i < len; ++i) { 1643 const char c1 = rawData.at(i); 1644 if ((c1 & 0x80) == 0) { 1645 // This character is probably ASCII, so ignore it 1646 } else { 1647 const char c2 = rawData.at(i + 1); 1648 if ((c1 & 0xe0) == 0xc0 && (c2 & 0xc0) == 0x80) { 1649 // This is a two-byte UTF-8 symbol 1650 hasUTF8 = true; 1651 ++i; 1652 } else { 1653 const char c3 = rawData.at(i + 2); 1654 if ((c1 & 0xf0) == 0xe0 && (c2 & 0xc0) == 0x80 && (c3 & 0xc0) == 0x80) { 1655 // This is a three-byte UTF-8 symbol 1656 hasUTF8 = true; 1657 i += 2; 1658 } else { 1659 const char c4 = rawData.at(i + 3); 1660 if ((c1 & 0xf8) == 0xf0 && (c2 & 0xc0) == 0x80 && (c3 & 0xc0) == 0x80 && (c4 & 0xc0) == 0x80) { 1661 // This is a four-byte UTF-8 symbol 1662 hasUTF8 = true; 1663 i += 3; 1664 } else { 1665 outsideUTF8 = true; 1666 break; //< No point in further testing more raw data 1667 } 1668 } 1669 } 1670 } 1671 } 1672 1673 if (!outsideUTF8) { 1674 encoding = hasUTF8 ? QStringLiteral("UTF-8") : QStringLiteral("LaTeX"); 1675 encodingMayGetDeterminedByRawData = false; //< Now the encoding is known 1676 } 1677 } 1678 1679 encoding = encoding.toLower(); 1680 if (encoding == QStringLiteral("us-ascii")) { 1681 qCDebug(LOG_KBIBTEX_IO) << "Replacing deprecated encoding 'US-ASCII' with 'LaTeX'"; 1682 encoding = QStringLiteral("latex"); //< encoding 'US-ASCII' is deprecated in favour of 'LaTeX' 1683 } 1684 // For encoding 'LaTeX', fall back to encoding 'UTF-8' when creating 1685 // a QTextCodec instance, but keep 'LaTeX' as the bibliography's 'actual' encoding (used as its encoding property) 1686 QTextCodec *codec = QTextCodec::codecForName(encoding == QStringLiteral("latex") ? "utf-8" : encoding.toLatin1()); 1687 if (codec == nullptr) { 1688 qCWarning(LOG_KBIBTEX_IO) << "Could not determine codec for encoding" << encoding; 1689 Q_EMIT message(MessageSeverity::Warning, QString(QStringLiteral("Could not determine codec for encoding '%1'")).arg(encoding)); 1690 return nullptr; 1691 } 1692 QString rawText = codec->toUnicode(rawData); 1693 1694 /// Remove deprecated 'x-kbibtex-personnameformatting' from BibTeX raw text 1695 const int posPersonNameFormatting = rawText.indexOf(QStringLiteral("@comment{x-kbibtex-personnameformatting=")); 1696 if (posPersonNameFormatting >= 0) { 1697 const int endOfPersonNameFormatting = rawText.indexOf(QLatin1Char('}'), posPersonNameFormatting + 39); 1698 if (endOfPersonNameFormatting > 0) 1699 rawText = rawText.left(posPersonNameFormatting) + rawText.mid(endOfPersonNameFormatting + 1); 1700 } 1701 1702 File *result = fromString(rawText); 1703 /// In the File object's property, store the encoding used to load the data 1704 result->setProperty(File::Encoding, encoding); 1705 1706 return result; 1707 } 1708 1709 bool FileImporterBibTeX::guessCanDecode(const QString &rawText) 1710 { 1711 static const QRegularExpression bibtexLikeText(QStringLiteral("@\\w+\\{.+\\}")); 1712 QString text = EncoderLaTeX::instance().decode(rawText); 1713 return bibtexLikeText.match(text).hasMatch(); 1714 } 1715 1716 void FileImporterBibTeX::cancel() 1717 { 1718 m_cancelFlag = true; 1719 } 1720 1721 QList<QSharedPointer<Keyword> > FileImporterBibTeX::splitKeywords(const QString &text, char *usedSplitChar) 1722 { 1723 QList<QSharedPointer<Keyword> > result; 1724 static const QHash<char, QRegularExpression> splitAlong = { 1725 {'\n', QRegularExpression(QStringLiteral("\\s*\n\\s*"))}, 1726 {';', QRegularExpression(QStringLiteral("\\s*;\\s*"))}, 1727 {',', QRegularExpression(QStringLiteral("\\s*,\\s*"))} 1728 }; 1729 if (usedSplitChar != nullptr) 1730 *usedSplitChar = '\0'; 1731 1732 for (auto it = splitAlong.constBegin(); it != splitAlong.constEnd(); ++it) { 1733 /// check if character is contained in text (should be cheap to test) 1734 if (text.contains(QLatin1Char(it.key()))) { 1735 /// split text along a pattern like spaces-splitchar-spaces 1736 /// extract keywords 1737 static const QRegularExpression unneccessarySpacing(QStringLiteral("[ \n\r\t]+")); 1738 #if QT_VERSION >= 0x050e00 1739 const QStringList keywords = text.split(it.value(), Qt::SkipEmptyParts).replaceInStrings(unneccessarySpacing, QStringLiteral(" ")); 1740 #else // QT_VERSION < 0x050e00 1741 const QStringList keywords = text.split(it.value(), QString::SkipEmptyParts).replaceInStrings(unneccessarySpacing, QStringLiteral(" ")); 1742 #endif // QT_VERSION >= 0x050e00 1743 /// build QList of Keyword objects from keywords 1744 for (const QString &keyword : keywords) { 1745 result.append(QSharedPointer<Keyword>(new Keyword(keyword))); 1746 } 1747 /// Memorize (some) split characters for later use 1748 /// (e.g. when writing file again) 1749 if (usedSplitChar != nullptr) 1750 *usedSplitChar = it.key(); 1751 /// no more splits necessary 1752 break; 1753 } 1754 } 1755 1756 /// no split was performed, so whole text must be a single keyword 1757 if (result.isEmpty()) 1758 result.append(QSharedPointer<Keyword>(new Keyword(text))); 1759 1760 return result; 1761 } 1762 1763 QList<QSharedPointer<Person> > FileImporterBibTeX::splitNames(const QString &text) 1764 { 1765 /// Case: Smith, John and Johnson, Tim 1766 /// Case: Smith, John and Fulkerson, Ford and Johnson, Tim 1767 /// Case: Smith, John, Fulkerson, Ford, and Johnson, Tim 1768 /// Case: John Smith and Tim Johnson 1769 /// Case: John Smith and Ford Fulkerson and Tim Johnson 1770 /// Case: Smith, John, Johnson, Tim 1771 /// Case: Smith, John, Fulkerson, Ford, Johnson, Tim 1772 /// Case: John Smith, Tim Johnson 1773 /// Case: John Smith, Tim Johnson, Ford Fulkerson 1774 /// Case: Smith, John ; Johnson, Tim ; Fulkerson, Ford (IEEE Xplore) 1775 /// German case: Robert A. Gehring und Bernd Lutterbeck 1776 1777 QString internalText = text; 1778 1779 /// Remove invalid characters such as dots or (double) daggers for footnotes 1780 static const QList<QChar> invalidChars {QChar(0x00b7), QChar(0x2020), QChar(0x2217), QChar(0x2021), QChar(0x002a), QChar(0x21d1) /** Upwards double arrow */}; 1781 for (const auto &invalidChar : invalidChars) 1782 /// Replacing daggers with commas ensures that they act as persons' names separator 1783 internalText = internalText.replace(invalidChar, QLatin1Char(',')); 1784 /// Remove numbers to footnotes 1785 static const QRegularExpression numberFootnoteRegExp(QStringLiteral("(\\w)\\d+\\b")); 1786 internalText = internalText.replace(numberFootnoteRegExp, QStringLiteral("\\1")); 1787 /// Remove academic degrees 1788 static const QRegularExpression academicDegreesRegExp(QStringLiteral("(,\\s*)?(MA|PhD)\\b")); 1789 internalText = internalText.remove(academicDegreesRegExp); 1790 /// Remove email addresses 1791 static const QRegularExpression emailAddressRegExp(QStringLiteral("\\b[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9]@[a-z0-9][a-z0-9-]*([.][a-z0-9-]+)*([.][a-z]+)+\\b")); 1792 internalText = internalText.remove(emailAddressRegExp); 1793 1794 /// Split input string into tokens which are either name components (first or last name) 1795 /// or full names (composed of first and last name), depending on the input string's structure 1796 static const QRegularExpression split(QStringLiteral("\\s*([,]+|[,]*\\b[au]nd\\b|[;]|&|\u00b7|\u2022|\\n|\\s{4,})\\s*")); 1797 #if QT_VERSION >= 0x050e00 1798 const QStringList authorTokenList = internalText.split(split, Qt::SkipEmptyParts); 1799 #else // QT_VERSION < 0x050e00 1800 const QStringList authorTokenList = internalText.split(split, QString::SkipEmptyParts); 1801 #endif // QT_VERSION >= 0x050e00 1802 1803 bool containsSpace = true; 1804 for (QStringList::ConstIterator it = authorTokenList.constBegin(); containsSpace && it != authorTokenList.constEnd(); ++it) 1805 containsSpace = (*it).contains(QLatin1Char(' ')); 1806 1807 QList<QSharedPointer<Person> > result; 1808 result.reserve(authorTokenList.size()); 1809 if (containsSpace) { 1810 /// Tokens look like "John Smith" 1811 for (const QString &authorToken : authorTokenList) { 1812 QSharedPointer<Person> person = Private::personFromString(authorToken, nullptr, 1, nullptr); 1813 if (!person.isNull()) 1814 result.append(person); 1815 } 1816 } else { 1817 /// Tokens look like "Smith" or "John" 1818 /// Assumption: two consecutive tokens form a name 1819 for (QStringList::ConstIterator it = authorTokenList.constBegin(); it != authorTokenList.constEnd(); ++it) { 1820 QString lastname = *it; 1821 ++it; 1822 if (it != authorTokenList.constEnd()) { 1823 lastname += QStringLiteral(", ") + (*it); 1824 QSharedPointer<Person> person = Private::personFromString(lastname, nullptr, 1, nullptr); 1825 if (!person.isNull()) 1826 result.append(person); 1827 } else 1828 break; 1829 } 1830 } 1831 1832 return result; 1833 } 1834 1835 QSharedPointer<Person> FileImporterBibTeX::personFromString(const QString &name, const int line_number, QObject *parent) 1836 { 1837 // TODO Merge with FileImporter::splitName 1838 return Private::personFromString(name, nullptr, line_number, parent); 1839 } 1840 1841 void FileImporterBibTeX::parsePersonList(const QString &text, Value &value, const int line_number, QObject *parent) 1842 { 1843 Private::parsePersonList(text, value, nullptr, line_number, parent); 1844 } 1845 1846 1847 void FileImporterBibTeX::contextSensitiveSplit(const QString &text, QStringList &segments) 1848 { 1849 // TODO Merge with FileImporter::splitName and FileImporterBibTeX::personFromString 1850 int bracketCounter = 0; ///< keep track of opening and closing brackets: {...} 1851 QString buffer; 1852 int len = text.length(); 1853 segments.clear(); ///< empty list for results before proceeding 1854 1855 for (int pos = 0; pos < len; ++pos) { 1856 if (text[pos] == QLatin1Char('{')) 1857 ++bracketCounter; 1858 else if (text[pos] == QLatin1Char('}')) 1859 --bracketCounter; 1860 1861 if (text[pos].isSpace() && bracketCounter == 0) { 1862 if (!buffer.isEmpty()) { 1863 segments.append(buffer); 1864 buffer.clear(); 1865 } 1866 } else 1867 buffer.append(text[pos]); 1868 } 1869 1870 if (!buffer.isEmpty()) 1871 segments.append(buffer); 1872 } 1873 1874 QString FileImporterBibTeX::bibtexAwareSimplify(const QString &text) 1875 { 1876 QString result; 1877 int i = 0; 1878 1879 /// Consume initial spaces ... 1880 while (i < text.length() && text[i].isSpace()) ++i; 1881 /// ... but if there have been spaces (i.e. i>0), then record a single space only 1882 if (i > 0) 1883 result.append(QStringLiteral(" ")); 1884 1885 while (i < text.length()) { 1886 /// Consume non-spaces 1887 while (i < text.length() && !text[i].isSpace()) { 1888 result.append(text[i]); 1889 ++i; 1890 } 1891 1892 /// String may end with a non-space 1893 if (i >= text.length()) break; 1894 1895 /// Consume spaces, ... 1896 while (i < text.length() && text[i].isSpace()) ++i; 1897 /// ... but record only a single space 1898 result.append(QStringLiteral(" ")); 1899 } 1900 1901 return result; 1902 } 1903 1904 QString FileImporterBibTeX::rstrip(const QString &text) 1905 { 1906 for (int p = text.length() - 1; p >= 0; --p) 1907 if (!text.at(p).isSpace()) 1908 return text.left(p + 1); 1909 return QString(); 1910 } 1911 1912 void FileImporterBibTeX::setCommentHandling(CommentHandling commentHandling) { 1913 d->commentHandling = commentHandling; 1914 }