File indexing completed on 2025-02-16 04:40:50

0001 ###########################################################################
0002 #   SPDX-License-Identifier: GPL-2.0-or-later
0003 #                                                                         #
0004 #   SPDX-FileCopyrightText: 2023 Thomas Fischer <fischer@unix-ag.uni-kl.de>
0005 #                                                                         #
0006 #   This script is free software; you can redistribute it and/or modify   #
0007 #   it under the terms of the GNU General Public License as published by  #
0008 #   the Free Software Foundation; either version 2 of the License, or     #
0009 #   (at your option) any later version.                                   #
0010 #                                                                         #
0011 #   This script is distributed in the hope that it will be useful,        #
0012 #   but WITHOUT ANY WARRANTY; without even the implied warranty of        #
0013 #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         #
0014 #   GNU General Public License for more details.                          #
0015 #                                                                         #
0016 #   You should have received a copy of the GNU General Public License     #
0017 #   along with this script; if not, see <https://www.gnu.org/licenses/>.  #
0018 ###########################################################################
0019 
0020 
0021 import sys
0022 import string
0023 import typing
0024 import zlib
0025 
# Configuration state, filled in while the .txt configuration file is read
# further below and consumed by xmlParser()

# Data format of the input to parse; only "xml" and "json" are accepted
format = ""
# Slash-separated path of XML element names leading to a single entry
entries = ""
# C++ code printed verbatim before the generated parser
introduction = ""
# C++ code printed right after the  mapping  variable has been filled
postprocessingmapping = ""
# C++ code printed after all fields of an entry have been set
postprocessingfields = ""
# C++ condition; if given, an entry is only appended to the result if it holds
entryvalidation = ""
# C++ expressions passed to  entry->setType(...)  and  entry->setId(...)
entrytype = ""
entryid = ""
# Field key (e.g. Entry::ftDOI or "doi")  ->  C++ expression yielding a QString
field: typing.Dict[str, str] = {}
# Field key  ->  value item class to wrap the string in (e.g. VerbatimText)
valueItemType: typing.Dict[str, str] = {}
# Field key  ->  C++ expression yielding a complete Value object
valuemap: typing.Dict[str, str] = {}
0038 
0039 
def alphanumhash(text: str) -> str:
    """Map arbitrary text to a string of exactly six upper-case ASCII letters.

    The Adler-32 checksum of the UTF-8 encoded text is written as a base-26
    number using the letters A-Z as digits, least significant digit first,
    and cut off after six digits.
    """
    alphabet = string.ascii_uppercase
    remainder = zlib.adler32(text.encode("utf-8"))
    letters = []
    for _ in range(6):
        remainder, digit = divmod(remainder, len(alphabet))
        letters.append(alphabet[digit])
    return "".join(letters)
0049 
0050 
def rewriteVariablePlaceholder(text: str) -> str:
    """Expand every  {{...}}  placeholder in  text  into C++ code and return the result.

    Three kinds of placeholders are recognized:
      {{field[Entry::ftDOI]}}  or  {{field["doi"]}}
          textual representation of a field already stored in the entry under construction
      {{QStringList:authors/author}}
          the QStringList of all texts recorded in  mapping  for the given path
      {{article/title}}
          all texts recorded in  mapping  for the given path, joined by line breaks
    An opening  {{  without a matching  }}  is left untouched.
    """
    fieldprefix = "field["
    listprefix = "QStringList:"

    start = text.find("{{")
    while start >= 0:
        end = text.find("}}", start + 2)
        if end > start:
            inner = text[start + 2 : end]
            if inner.startswith(fieldprefix):
                # Clip away  field[  at the beginning and the closing  ]  at the end
                fieldname = inner[len(fieldprefix) : -1]
                if len(fieldname) > 2 and fieldname.startswith('"') and fieldname.endswith('"'):
                    # Double-quoted field names need to be wrapped into a QStringLiteral
                    fieldname = "QStringLiteral(" + fieldname + ")"
                cppcode = "PlainTextValue::text(entry->value(" + fieldname + "))"
            elif inner.startswith(listprefix):
                # Clip away  QStringList:  at the beginning, the remainder is the path
                path = inner[len(listprefix) :]
                cppcode = 'const_cast<const QStringList &>(mapping[QStringLiteral("' + path + '")])'
            else:
                # The whole placeholder is a path; several matches get separated by line breaks
                cppcode = 'mapping[QStringLiteral("' + inner + '")].join(QStringLiteral("\\n"))'
            text = text[:start] + cppcode + text[end + 2 :]
        # Continue searching behind the position of the placeholder just handled
        start = text.find("{{", start + 2)

    # Lambda functions without captures get access to  mapping  by capturing it by reference
    return text.replace("[]()", "[&mapping]()")
0080 
0081 
def indentCppCode(text: str, depth: int) -> str:
    """Indent multi-line C/C++ code, leaving the first line without indentation.

    text   C/C++ code, lines separated by line breaks
    depth  number of spaces to put in front of the second line; adjusted in
           steps of 4 for the following lines as code blocks are entered or left

    Returns  text  unchanged if it consists of a single line only. Otherwise,
    every line gets its surrounding whitespace removed and, except for the
    first one, the computed indentation put in front.

    The indentation follows simple heuristics: a line ending with  {  indents
    the lines after it, a line starting with  }  ends this indentation, and a
    line with  if ,  while ,  for , or  else  lacking an opening curly bracket
    indents just the one statement following it.
    """
    # Remove surrounding whitespace first; it gets re-added below as needed.
    # This has to be an assignment as  str.strip()  returns a new string, and
    # it has to happen for all lines before the loop as each line is also
    # inspected in the role of the previous line.
    lines = [line.strip() for line in text.split("\n")]
    if len(lines) <= 1:
        # As first line does not get indented, return  text  immediately for single-line text
        return text

    blockkeywords = ("if", "while", "for")
    # First line does not get indented, so put it directly into the result
    result = [lines[0]]
    singlelineindent = False
    for previous, current in zip(lines, lines[1:]):
        # Increase or decrease indent if code blocks are entered or left, respectively
        if previous.endswith("{"):
            depth += 4
        if current.startswith("}"):
            depth -= 4
        # Add line's text with current indentation
        result.append(depth * " " + current)
        if singlelineindent and not current.startswith("//"):
            # Stop indenting unless line is a comment
            singlelineindent = False
            depth -= 4
        elif (current.startswith(blockkeywords) or "else if" in current or current.endswith("else")) and not current.endswith("{"):
            # If special commands like  if  or  while  are in the current line but no opening
            # curly bracket follows, then remember to indent only the next line
            singlelineindent = True
            depth += 4
    return "\n".join(result)
0117 
0118 
def xmlParser() -> None:
    """Generate C++ that can parse XML code as specified in the .txt file (various global variables hold this data already).

    The generated code is written to standard output via  print ; nothing is returned.
    The following global variables are read:  introduction ,  entries ,  postprocessingmapping ,
    field ,  valueItemType ,  valuemap ,  postprocessingfields ,  entryid ,  entrytype , and
    entryvalidation .

    The generated code uses, but does not declare,  xmlData ,  ok  (dereferenced as  *ok ),
    result , and the logging category  LOG_KBIBTEX_NETWORKING ; those have to be provided
    by the C++ code surrounding the generated fragment.

    Throughout this function,  depth  is the number of spaces put in front of the next
    generated line. It is increased by 4 after printing a line that opens a C++ block and
    decreased by 4 before printing the matching closing curly bracket, so the order of the
    statements below must not be changed.
    """

    if not introduction is None and len(introduction) > 0:
        print(introduction, sep="", end="\n\n")  # Print 'introduction' sans << ... >>

    # Generated code starts with an indentation of eight spaces
    depth: int = 8
    # Read XML file until reaching entries' tags
    print(depth * " ", "*ok = true;", sep="")
    print(depth * " ", "QXmlStreamReader xsr(xmlData);", sep="")

    # Dive into XML document to get to the point where the bibliographic data begins
    # The first path element must be the document's first start element; for each further
    # path element, a  while  loop searching for it plus an  if  checking for it get opened
    entriesPath = entries.split("/")
    print(depth * " ", "if (xsr.readNextStartElement()) {", sep="")
    depth += 4
    print(depth * " ", f'if (xsr.qualifiedName() == QStringLiteral("{entriesPath[0]}"))', " {", sep="")
    for step in entriesPath[1:]:
        depth += 4
        print(depth * " ", "while (!xsr.atEnd() && xsr.readNext() != QXmlStreamReader::Invalid) {", sep="")
        depth += 4
        print(depth * " ", f'if (xsr.isStartElement() && xsr.qualifiedName() == QStringLiteral("{step}"))', " {", sep="")
    depth += 4

    # Stack variable keeps track of position inside XML data
    print(depth * " ", "QStringList stack;", sep="")
    # Some XML tags have a 'type' attribute which will be tracked as it is useful in some situations
    # Example  <id type="doi">10.100/abc</id>  allows to address the text via  id[@type=doi]
    print(depth * " ", "QPair<QString, QString> typeAttribute;", sep="")
    # Keep track of data extracted from XML data, e.g. "articles/authors/author"  -> ["John Doe", "Jane Done"]
    print(depth * " ", "QMap<QString, QStringList> mapping;", sep="")
    # Inside the following loop, one entry is processed
    # The loop ends as soon as an element named like the last path element is read
    print(
        depth * " ",
        f'while (!xsr.atEnd() && xsr.readNext() != QXmlStreamReader::Invalid && xsr.qualifiedName() != QStringLiteral("{entriesPath[-1]}"))',
        " {",
        sep="",
    )
    depth += 4

    print(depth * " ", "if (xsr.isStartElement()) {", sep="")
    depth += 4

    # If at the opening of a XML element ...
    # Append element's qualified name to stack
    print(depth * " ", "stack.append(xsr.qualifiedName().toString());", sep="")
    # Clear previous type attributes
    print(depth * " ", "typeAttribute = qMakePair(QString(), QString());", sep="")

    print(depth * " ", "for (const QXmlStreamAttribute &attr : xsr.attributes()) {", sep="")
    # Go over all attributes of this XML element ...
    depth += 4

    print(depth * " ", "const QString text{OnlineSearchAbstract::deHTMLify(attr.value().toString().trimmed())};", sep="")
    print(depth * " ", "if (!text.isEmpty()) {", sep="")
    depth += 4

    # For attributes with non-empty text ...
    print(depth * " ", 'if (attr.qualifiedName().toString().toLower().contains(QStringLiteral("type")))', sep="")
    # If this attribute looks like a 'type' attribute, ...
    # (single statement without curly brackets, hence printed with extra indent but  depth  unchanged)
    print((depth + 4) * " ", "typeAttribute = qMakePair(attr.qualifiedName().toString(), text);", sep="")
    # Attribute values are recorded in  mapping  under the key  path/to/element/@attributename
    print(depth * " ", 'const QString key{stack.join(QStringLiteral("/")) + QStringLiteral("/@") + attr.qualifiedName().toString()};', sep="")
    # ... store its name and its value for later use
    print(depth * " ", "if (mapping.contains(key))", sep="")
    print((depth + 4) * " ", "mapping[key].append(text);", sep="")
    print(depth * " ", "else", sep="")
    print((depth + 4) * " ", "mapping.insert(key, QStringList() << text);", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")  # if (!text.isEmpty()) { ...
    depth -= 4
    print(depth * " ", "}", sep="")  # for (const QXmlStreamAttribute &attr : xsr.attributes()) { ...
    depth -= 4
    print(depth * " ", "} else if (xsr.isEndElement() && stack.length() > 0 && stack.last() == xsr.qualifiedName())", sep="")
    # If at the closing of a XML element ...
    # Remove element's qualified name from stack
    print((depth + 4) * " ", "stack.removeLast();", sep="")
    print(depth * " ", "else if (xsr.isCharacters()) {", sep="")
    depth += 4
    # If reading plain text from the XML stream ...
    print(depth * " ", "const QString text{OnlineSearchAbstract::deHTMLify(xsr.text().toString().trimmed())};", sep="")
    print(depth * " ", "if (!text.isEmpty()) {", sep="")
    depth += 4
    # Record this text as content for the XML element's path as recorded by the stack
    print(depth * " ", 'const QString key{stack.join(QStringLiteral("/"))};', sep="")
    print(depth * " ", "if (mapping.contains(key))", sep="")
    print((depth + 4) * " ", "mapping[key].append(text);", sep="")
    print(depth * " ", "else", sep="")
    print((depth + 4) * " ", "mapping.insert(key, QStringList() << text);", sep="")
    print(depth * " ", "if (!typeAttribute.first.isEmpty() && !typeAttribute.second.isEmpty()) {", sep="")
    depth += 4
    # If a type attribute was recorded, store the plain text for this type attribute specifically, too
    # The key then has the form  path/to/element[@attributename=attributevalue]
    print(
        depth * " ",
        'const QString key{stack.join(QStringLiteral("/")) + QStringLiteral("[@") + typeAttribute.first + QStringLiteral("=") + typeAttribute.second + QStringLiteral("]")};',
        sep="",
    )
    print(depth * " ", "if (mapping.contains(key))", sep="")
    print((depth + 4) * " ", "mapping[key].append(text);", sep="")
    print(depth * " ", "else", sep="")
    print((depth + 4) * " ", "mapping.insert(key, QStringList() << text);", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")  # if (!typeAttribute.first.isEmpty() && ...
    depth -= 4
    print(depth * " ", "}", sep="")  # if (!text.isEmpty()) { ...
    depth -= 4
    print(depth * " ", "}", sep="")  # else if (xsr.isCharacters()) { ...
    depth -= 4
    print(
        depth * " ", "}", sep=""
    )  # while (!xsr.atEnd() && xsr.readNext() != QXmlStreamReader::Invalid && xsr.qualifiedName() != QStringLiteral( ...

    # Report if the loop over one entry's XML data was left due to invalid XML
    print(depth * " ", "if (xsr.tokenType() == QXmlStreamReader::Invalid)", sep="")
    print(
        (depth + 4) * " ",
        'qCWarning(LOG_KBIBTEX_NETWORKING) << "Invalid XML while parsing data at offset" << xsr.characterOffset() << ":" << xsr.errorString();',
        sep="",
        end="\n\n",
    )

    # Maybe some post-processing is necessary after mapping has been filled?
    if not postprocessingmapping is None and len(postprocessingmapping) > 0:
        print(rewriteVariablePlaceholder(postprocessingmapping), sep="", end="\n\n")

    # Create an entry object, but postpone setting type and id for later
    print(
        depth * " ",
        'QSharedPointer<Entry> entry = QSharedPointer<Entry>(new Entry(QStringLiteral("placeholderType"), QStringLiteral("placeholderId")));',
        sep="",
    )

    # Fill the entry with data

    # Set all fields, i.e. where the .txt file had configuration where keys started with  field[
    for fkey, fvalue in field.items():
        # Determine type of field data: Either from configuration file or via educated guess
        fieldType = "PlainText"
        if fkey in valueItemType:
            fieldType = valueItemType[fkey]
        elif fkey in {"Entry::ftDOI", "Entry::ftUrl"}:
            fieldType = "VerbatimText"
        elif fkey in {"Entry::ftMonth"}:
            fieldType = "MacroKey"

        # Name of the C++ variable is derived from the field's key as given in the
        # configuration file, i.e. before the key may get wrapped into a QStringLiteral
        varname = f"value{alphanumhash(fkey)}"
        # Handle field names with quotation marks instead of predefined ones like Entry::ftDOI
        if len(fkey) > 2 and fkey[0] == '"' and fkey[-1] == '"':
            fkey = f"QStringLiteral({fkey})"
        # Replace/rewrite any  {{...}}  variables inside the configuration file data for this field
        fvalue = indentCppCode(rewriteVariablePlaceholder(fvalue), depth)

        # First, assign the computed value for this field in a QString
        print(depth * " ", f"const QString {varname} ", sep="", end="")
        if len(fvalue) > 2 and fvalue[0] == "{" and fvalue[-1] == "}":
            # Value enclosed in curly brackets is used as brace initializer:  const QString x {...};
            print(fvalue, ";", sep="")
        else:
            # Any other value is assigned:  const QString x = ...;
            print("= ", fvalue, ";", sep="")
        # Then, only if it is a non-empty string, store this string in the entry
        print(depth * " ", f"if (!{varname}.isEmpty())", sep="")
        print((depth + 4) * " ", f"entry->insert({fkey}, Value() << QSharedPointer<{fieldType}>(new {fieldType}({varname})));", sep="")

    # Set fields where not a string is given, but the configuration file contains code fragments that
    # generate Value objects that can be directly assigned to the entry's fields
    for vkey, vvalue in valuemap.items():
        varname = f"value{alphanumhash(vkey)}"
        if len(vkey) > 2 and vkey[0] == '"' and vkey[-1] == '"':
            vkey = f"QStringLiteral({vkey})"
        print(depth * " ", f"const Value {varname} = {indentCppCode(rewriteVariablePlaceholder(vvalue),depth)};", sep="")
        print(depth * " ", f"if (!{varname}.isEmpty())", sep="")
        print((depth + 4) * " ", f"entry->insert({vkey}, {varname});", sep="")

    # Maybe some post-processing is necessary after all fields have been set?
    if not postprocessingfields is None and len(postprocessingfields) > 0:
        print("\n", depth * " ", indentCppCode(rewriteVariablePlaceholder(postprocessingfields), depth), sep="")

    # As it was postponed, set entry's id and type now
    print("\n", depth * " ", f"entry->setId({rewriteVariablePlaceholder(entryid)});", sep="")
    print(depth * " ", f"entry->setType({indentCppCode(rewriteVariablePlaceholder(entrytype),depth)});", sep="")

    # Finally, append the entry to the list of resulting entries
    # If a validation condition was configured, append the entry only if this condition holds
    if len(entryvalidation) == 0:
        print(depth * " ", "result.append(entry);", sep="")
    else:
        print(depth * " ", "if (", entryvalidation, ")", sep="")
        print((depth + 4) * " ", "result.append(entry);", sep="")

    # Close all loops and earlier checks

    # One  if  and one  while  were opened per path element after the first one
    for _ in entriesPath[1:]:
        depth -= 4
        print(depth * " ", "}", sep="")  # if (xsr.isStartElement() && xsr.qualifiedName()==QStringLiteral( ...
        depth -= 4
        print(depth * " ", "}", sep="")  # while (!xsr.atEnd() && xsr.readNext() != QXmlStreamReader::Invalid) { ...
        print(depth * " ", "if (xsr.tokenType() == QXmlStreamReader::Invalid)", sep="")
        print(
            (depth + 4) * " ",
            'qCWarning(LOG_KBIBTEX_NETWORKING) << "Invalid XML while parsing data at offset" << xsr.characterOffset() << ":" << xsr.errorString();',
            sep="",
            end="\n\n",
        )
    depth -= 4
    # Else-branch for the check whether the first start element matches the first path element
    print(depth * " ", "} else {", sep="")
    depth += 4
    print(
        depth * " ",
        'qCWarning(LOG_KBIBTEX_NETWORKING) << "Expected ',
        "'",
        entriesPath[0],
        "'",
        ', got" << xsr.qualifiedName() << "at XML line" << xsr.lineNumber() << ":" << xsr.errorString();',
        sep="",
    )
    print(depth * " ", "*ok = false;", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    depth -= 4
    # Else-branch for the check whether a first start element could be read at all
    print(depth * " ", "} else {", sep="")
    depth += 4
    print(
        depth * " ",
        'qCWarning(LOG_KBIBTEX_NETWORKING) << "Could not read start element at XML line" << xsr.lineNumber() << ":" << xsr.errorString();',
        sep="",
    )
    print(depth * " ", "*ok = false;", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    # Two line breaks at the end of the generated code: the one printed plus  print 's own
    print("\n")
0344 
0345 
# Read configuration file provided as the single argument to this Python script invocation
#
# The file is processed line by line:
#  - empty lines and lines starting with  #  are skipped
#  - lines of the form  key: value  start a new key-value pair
#  - lines of the form  key:  start a new key with an empty value, to be
#    filled by continuation lines
#  - lines starting with a space continue the value of the most recent key
#  - any other line is reported on stderr and skipped
with open(sys.argv[-1]) as configfile:
    # Most recently seen key; stays  None  until the first key has been read
    key = None
    for linenumber, line in enumerate(configfile, start=1):
        # Remove whitespace on right side
        line = line.rstrip()
        # Skip empty lines or comments
        if len(line) == 0 or line[0] == "#":
            continue
        # Maybe line can be split into key-value pairs?
        colonpos = line.find(": ")
        if line[0] == " ":
            # Lines that start with spaces are continuations of a previous line,
            # so the value of this line is its content, stripped from surrounding spaces
            if key is None:
                print(f"{sys.argv[-1]}:{linenumber}: skipping continuation line without preceding key", file=sys.stderr)
                continue
            value = "\n" + line.strip()
        elif colonpos > 0:
            # So there is a separator string (': '), so split into key-value pair
            key = line[:colonpos].strip()
            value = line[colonpos + 2 :].strip()
        elif len(line) > 1 and line[-1] == ":":
            # Key without a value on the same line. As whitespace on the right side got
            # removed above, the separator string (': ') cannot be found in such a line.
            key = line[:-1].strip()
            value = ""
        else:
            # Neither a continuation nor a key-value pair. Such a line must not be processed
            # any further, otherwise key and value of the previous line would be applied again.
            print(f"{sys.argv[-1]}:{linenumber}: skipping line that is neither key-value pair nor continuation", file=sys.stderr)
            continue

        # Update various global variables, depending on value read from line
        if key == "format":
            format += value
        elif key == "entries":
            entries += value
        elif key == "introduction":
            # Introduction is printed as is, so add the generated code's base indentation here
            introduction += "        " + value
        elif key == "entryvalidation":
            entryvalidation += value
        elif key == "postprocessingmapping":
            postprocessingmapping += value
        elif key == "postprocessingfields":
            postprocessingfields += value
        elif key == "entrytype":
            entrytype += value
        elif key == "entryid":
            entryid += value
        elif key.startswith("field["):
            # Clip away  field[  at the beginning and  ]  at the end
            fkey = key[6:-1]
            field.setdefault(fkey, "")
            field[fkey] += value
        elif key.startswith("valueItemType["):
            # Clip away  valueItemType[  at the beginning and  ]  at the end
            vitkey = key[14:-1]
            valueItemType.setdefault(vitkey, "")
            valueItemType[vitkey] += value
        elif key.startswith("value["):
            # Clip away  value[  at the beginning and  ]  at the end
            vitkey = key[6:-1]
            valuemap.setdefault(vitkey, "")
            valuemap[vitkey] += value
0394 
0395 
# Only configuration files for known data formats can be processed
if format != "xml" and format != "json":
    raise ValueError(f"Missing or unsupported format: {format}")

# The generated C++ code starts with a hint that it was generated by this script
# and must not be edited manually
for headerline in (
    "        // Source code generated by Python script 'codegenerator-dataparser.py'",
    f"        // using information from configuration file '{sys.argv[-1]}'\n",
):
    print(headerline)

if format == "xml":
    xmlParser()
# TODO: format "json" is accepted above, but no code is generated for it yet
# (a  jsonParser()  function is still to be written)