File indexing completed on 2025-02-16 04:40:50
###########################################################################
#   SPDX-License-Identifier: GPL-2.0-or-later                             #
#                                                                         #
#   SPDX-FileCopyrightText: 2023 Thomas Fischer <fischer@unix-ag.uni-kl.de>
#                                                                         #
#   This script is free software; you can redistribute it and/or modify   #
#   it under the terms of the GNU General Public License as published by  #
#   the Free Software Foundation; either version 2 of the License, or     #
#   (at your option) any later version.                                   #
#                                                                         #
#   This script is distributed in the hope that it will be useful,        #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of        #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          #
#   GNU General Public License for more details.                          #
#                                                                         #
#   You should have received a copy of the GNU General Public License     #
#   along with this script; if not, see <https://www.gnu.org/licenses/>.  #
###########################################################################


import sys
import string
import typing
import zlib

# Store information read from .txt files
format = ""
entries = ""
introduction = ""
postprocessingmapping = ""
postprocessingfields = ""
entryvalidation = ""
entrytype = ""
entryid = ""
field: typing.Dict[str, str] = dict()
valueItemType: typing.Dict[str, str] = dict()
valuemap: typing.Dict[str, str] = dict()


def alphanumhash(text: str) -> str:
    """Hash arbitrary text into a sequence of exactly six upper-case ASCII letters.

    The Adler-32 checksum of the UTF-8 encoded text is written as a base-26
    number, least significant digit first, using 'A'..'Z' as digits. Only
    the six lowest digits are kept.
    """
    i = zlib.adler32(text.encode("utf-8"))
    alphabet = string.ascii_uppercase
    letters = []
    for _ in range(6):
        i, digit = divmod(i, len(alphabet))
        letters.append(alphabet[digit])
    return "".join(letters)


def rewriteVariablePlaceholder(text: str) -> str:
    """Replace placeholders (those in double curly brackets) with C++ expressions.

    Three placeholder forms are recognized:
    - {{field[Entry::ftDOI]}} or {{field["doi"]}}: textual representation of data
      already stored in the entry under construction
    - {{QStringList:authors/author}}: the QStringList recorded in 'mapping' for
      the path given after the colon
    - {{article/title}}: all strings recorded in 'mapping' for that path, joined
      by line breaks

    A placeholder without closing '}}' is left untouched. Finally, every
    occurrence of '[]()' is rewritten to '[&mapping]()' so that lambda functions
    inside the text get access to 'mapping'.
    """
    p1 = text.find("{{")
    while p1 >= 0:
        p2 = text.find("}}", p1 + 2)
        if p2 > p1:
            replacement = text[p1 + 2 : p2]
            if replacement.startswith("field["):
                # {{field[Entry::ftDOI]}} or {{field["doi"]}} allows to get a textual representation
                # of data already stored in the entry under construction
                replacement = replacement[6:-1]  # clip away 'field[' and the closing ']'
                if len(replacement) > 2 and replacement[0] == '"' and replacement[-1] == '"':
                    # Wrap double-quoted strings into a QStringLiteral
                    replacement = f"QStringLiteral({replacement})"
                replacement = f"PlainTextValue::text(entry->value({replacement}))"
            elif replacement.startswith("QStringList:"):
                # {{QStringList:authors/author}} will create a QStringList of data found in XML tags
                # as specified after the colon
                replacement = replacement[12:]  # clip away 'QStringList:' at the beginning
                replacement = f'const_cast<const QStringList &>(mapping[QStringLiteral("{replacement}")])'
            else:
                # {{article/title}} will create a string from the data found in XML tags matching the argument
                # If several instances of the XML tags exist, they are separated by line breaks
                replacement = f'mapping[QStringLiteral("{replacement}")].join(QStringLiteral("\\n"))'
            text = text[:p1] + replacement + text[p2 + 2 :]
        p1 = text.find("{{", p1 + 2)

    # Final replacement in case text to process contains a lambda function:
    # pass mapping to the inside of the lambda function
    return text.replace("[]()", "[&mapping]()")


def indentCppCode(text: str, depth: int) -> str:
    """Indent C/C++ code starting at a certain depth, except for the first line.

    Every line of a multi-line text is stripped of surrounding whitespace and
    then, from the second line on, prefixed with 'depth' spaces. The depth
    grows by four after a line ending in '{' and shrinks by four at a line
    starting with '}'. A line starting with if/while/for (or containing
    'else if', or ending in 'else') that does not end in '{' causes only the
    next non-comment line to be indented by four additional spaces.
    Single-line text is returned unchanged.
    """
    if "\n" not in text:
        # As first line does not get indented, return text immediately for single-line text
        return text

    # Remove surrounding whitespace, which will be added later if needed.
    # str.strip() returns a new string, so its result must be kept.
    lines = [line.strip() for line in text.split("\n")]

    # First line does not get indented, so put it directly into the result
    resultlines = [lines[0]]
    singlelineindent = False
    for previous, line in zip(lines, lines[1:]):
        # Increase or decrease indent if code blocks are entered or left, respectively
        if previous.endswith("{"):
            depth += 4
        if line.startswith("}"):
            depth -= 4
        # Add line's text with current indentation
        resultlines.append(depth * " " + line)
        if singlelineindent and not line.startswith("//"):
            # Stop indenting unless line is a comment
            singlelineindent = False
            depth -= 4
        elif (line.startswith(("if", "while", "for")) or "else if" in line or line.endswith("else")) and not line.endswith(
            "{"
        ):
            # If special commands like if or while are in the current line but no opening curly bracket follows,
            # then remember to indent only the next line
            singlelineindent = True
            depth += 4
    return "\n".join(resultlines)


def xmlParser():
    """Generate C++ that can parse XML code as specified in the .txt file.

    All input is taken from this module's global variables (entries,
    introduction, field, valuemap, ...), which must have been filled from the
    configuration file before. The generated C++ code is printed to stdout.
    """

    if introduction:
        print(introduction, sep="", end="\n\n")  # Print 'introduction' sans << ... >>

    depth: int = 8
    # Read XML file until reaching entries' tags
    print(depth * " ", "*ok = true;", sep="")
    print(depth * " ", "QXmlStreamReader xsr(xmlData);", sep="")

    # Dive into XML document to get to the point where the bibliographic data begins
    entriesPath = entries.split("/")
    print(depth * " ", "if (xsr.readNextStartElement()) {", sep="")
    depth += 4
    print(depth * " ", f'if (xsr.qualifiedName() == QStringLiteral("{entriesPath[0]}"))', " {", sep="")
    for step in entriesPath[1:]:
        depth += 4
        print(depth * " ", "while (!xsr.atEnd() && xsr.readNext() != QXmlStreamReader::Invalid) {", sep="")
        depth += 4
        print(depth * " ", f'if (xsr.isStartElement() && xsr.qualifiedName() == QStringLiteral("{step}"))', " {", sep="")
    depth += 4

    # Stack variable keeps track of position inside XML data
    print(depth * " ", "QStringList stack;", sep="")
    # Some XML tags have a 'type' attribute which will be tracked as it is useful in some situations
    # Example <id type="doi">10.100/abc</id> allows to address the text via id/[@type=doi]
    print(depth * " ", "QPair<QString, QString> typeAttribute;", sep="")
    # Keep track of data extracted from XML data, e.g. "articles/authors/author" -> ["John Doe", "Jane Done"]
    print(depth * " ", "QMap<QString, QStringList> mapping;", sep="")
    # Inside the following loop, one entry is processed
    print(
        depth * " ",
        f'while (!xsr.atEnd() && xsr.readNext() != QXmlStreamReader::Invalid && xsr.qualifiedName() != QStringLiteral("{entriesPath[-1]}"))',
        " {",
        sep="",
    )
    depth += 4

    print(depth * " ", "if (xsr.isStartElement()) {", sep="")
    depth += 4

    # If at the opening of a XML element ...
    # Append element's qualified name to stack
    print(depth * " ", "stack.append(xsr.qualifiedName().toString());", sep="")
    # Clear previous type attributes
    print(depth * " ", "typeAttribute = qMakePair(QString(), QString());", sep="")

    print(depth * " ", "for (const QXmlStreamAttribute &attr : xsr.attributes()) {", sep="")
    # Go over all attributes of this XML element ...
    depth += 4

    print(depth * " ", "const QString text{OnlineSearchAbstract::deHTMLify(attr.value().toString().trimmed())};", sep="")
    print(depth * " ", "if (!text.isEmpty()) {", sep="")
    depth += 4

    # For attributes with non-empty text ...
    print(depth * " ", 'if (attr.qualifiedName().toString().toLower().contains(QStringLiteral("type")))', sep="")
    # If this attribute looks like a 'type' attribute, ...
    print((depth + 4) * " ", "typeAttribute = qMakePair(attr.qualifiedName().toString(), text);", sep="")
    print(depth * " ", 'const QString key{stack.join(QStringLiteral("/")) + QStringLiteral("/@") + attr.qualifiedName().toString()};', sep="")
    # ... store its name and its value for later use
    print(depth * " ", "if (mapping.contains(key))", sep="")
    print((depth + 4) * " ", "mapping[key].append(text);", sep="")
    print(depth * " ", "else", sep="")
    print((depth + 4) * " ", "mapping.insert(key, QStringList() << text);", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    depth -= 4
    print(depth * " ", "} else if (xsr.isEndElement() && stack.length() > 0 && stack.last() == xsr.qualifiedName())", sep="")
    # If at the closing of a XML element ...
    # Remove element's qualified name from stack
    print((depth + 4) * " ", "stack.removeLast();", sep="")
    print(depth * " ", "else if (xsr.isCharacters()) {", sep="")
    depth += 4
    # If reading plain text from the XML stream ...
    print(depth * " ", "const QString text{OnlineSearchAbstract::deHTMLify(xsr.text().toString().trimmed())};", sep="")
    print(depth * " ", "if (!text.isEmpty()) {", sep="")
    depth += 4
    # Record this text as content for the XML element's path as recorded by the stack
    print(depth * " ", 'const QString key{stack.join(QStringLiteral("/"))};', sep="")
    print(depth * " ", "if (mapping.contains(key))", sep="")
    print((depth + 4) * " ", "mapping[key].append(text);", sep="")
    print(depth * " ", "else", sep="")
    print((depth + 4) * " ", "mapping.insert(key, QStringList() << text);", sep="")
    print(depth * " ", "if (!typeAttribute.first.isEmpty() && !typeAttribute.second.isEmpty()) {", sep="")
    depth += 4
    # If a type attribute was recorded, store the plain text for this type attribute specifically, too
    print(
        depth * " ",
        'const QString key{stack.join(QStringLiteral("/")) + QStringLiteral("[@") + typeAttribute.first + QStringLiteral("=") + typeAttribute.second + QStringLiteral("]")};',
        sep="",
    )
    print(depth * " ", "if (mapping.contains(key))", sep="")
    print((depth + 4) * " ", "mapping[key].append(text);", sep="")
    print(depth * " ", "else", sep="")
    print((depth + 4) * " ", "mapping.insert(key, QStringList() << text);", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    depth -= 4
    print(
        depth * " ", "}", sep=""
    )  # while (!xsr.atEnd() && xsr.readNext() != QXmlStreamReader::Invalid and xsr.qualifiedName() != QStringLiteral( ...

    print(depth * " ", "if (xsr.tokenType() == QXmlStreamReader::Invalid)", sep="")
    print(
        (depth + 4) * " ",
        'qCWarning(LOG_KBIBTEX_NETWORKING) << "Invalid XML while parsing data at offset" << xsr.characterOffset() << ":" << xsr.errorString();',
        sep="",
        end="\n\n",
    )

    # Maybe some post-processing is necessary after mapping has been filled?
    if postprocessingmapping:
        print(rewriteVariablePlaceholder(postprocessingmapping), sep="", end="\n\n")

    # Create an entry object, but postpone setting type and id for later
    print(
        depth * " ",
        'QSharedPointer<Entry> entry = QSharedPointer<Entry>(new Entry(QStringLiteral("placeholderType"), QStringLiteral("placeholderId")));',
        sep="",
    )

    # Fill the entry with data

    # Set all fields, i.e. where the .txt file had configuration where keys started with field[
    for fkey, fvalue in field.items():
        # Determine type of field data: Either from configuration file or via educated guess
        fieldType = "PlainText"
        if fkey in valueItemType:
            fieldType = valueItemType[fkey]
        elif fkey in {"Entry::ftDOI", "Entry::ftUrl"}:
            fieldType = "VerbatimText"
        elif fkey in {"Entry::ftMonth"}:
            fieldType = "MacroKey"

        varname = f"value{alphanumhash(fkey)}"
        # Handle field names with quotation marks instead of predefined ones like Entry::ftDOI
        if len(fkey) > 2 and fkey[0] == '"' and fkey[-1] == '"':
            fkey = f"QStringLiteral({fkey})"
        # Replace/rewrite any {{...}} variables inside the configuration file data for this field
        fvalue = indentCppCode(rewriteVariablePlaceholder(fvalue), depth)

        # First, assign the computed value for this field in a QString
        print(depth * " ", f"const QString {varname} ", sep="", end="")
        if len(fvalue) > 2 and fvalue[0] == "{" and fvalue[-1] == "}":
            # Value is wrapped in curly brackets: emit it as brace initialization
            print(fvalue, ";", sep="")
        else:
            print("= ", fvalue, ";", sep="")
        # Then, only if it is a non-empty string, store this string in the entry
        print(depth * " ", f"if (!{varname}.isEmpty())", sep="")
        print((depth + 4) * " ", f"entry->insert({fkey}, Value() << QSharedPointer<{fieldType}>(new {fieldType}({varname})));", sep="")

    # Set fields where not a string is given, but the configuration file contains code fragments that
    # generate Value objects that can be directly assigned to the entry's fields
    for vkey, vvalue in valuemap.items():
        varname = f"value{alphanumhash(vkey)}"
        if len(vkey) > 2 and vkey[0] == '"' and vkey[-1] == '"':
            vkey = f"QStringLiteral({vkey})"
        vcode = indentCppCode(rewriteVariablePlaceholder(vvalue), depth)
        print(depth * " ", f"const Value {varname} = {vcode};", sep="")
        print(depth * " ", f"if (!{varname}.isEmpty())", sep="")
        print((depth + 4) * " ", f"entry->insert({vkey}, {varname});", sep="")

    # Maybe some post-processing is necessary after all fields have been set?
    if postprocessingfields:
        print("\n", depth * " ", indentCppCode(rewriteVariablePlaceholder(postprocessingfields), depth), sep="")

    # As it was postponed, set entry's id and type now
    print("\n", depth * " ", f"entry->setId({rewriteVariablePlaceholder(entryid)});", sep="")
    typecode = indentCppCode(rewriteVariablePlaceholder(entrytype), depth)
    print(depth * " ", f"entry->setType({typecode});", sep="")

    # Finally, append the entry to the list of resulting entries
    if len(entryvalidation) == 0:
        print(depth * " ", "result.append(entry);", sep="")
    else:
        print(depth * " ", "if (", entryvalidation, ")", sep="")
        print((depth + 4) * " ", "result.append(entry);", sep="")

    # Close all loops and earlier checks

    for _ in entriesPath[1:]:
        depth -= 4
        print(depth * " ", "}", sep="")  # if (xsr.isStartElement() && xsr.qualifiedName()==QStringLiteral( ...
        depth -= 4
        print(depth * " ", "}", sep="")  # while (!xsr.atEnd() && xsr.readNext() != QXmlStreamReader::Invalid) { ...
        print(depth * " ", "if (xsr.tokenType() == QXmlStreamReader::Invalid)", sep="")
        print(
            (depth + 4) * " ",
            'qCWarning(LOG_KBIBTEX_NETWORKING) << "Invalid XML while parsing data at offset" << xsr.characterOffset() << ":" << xsr.errorString();',
            sep="",
            end="\n\n",
        )
    depth -= 4
    print(depth * " ", "} else {", sep="")
    depth += 4
    print(
        depth * " ",
        'qCWarning(LOG_KBIBTEX_NETWORKING) << "Expected ',
        "'",
        entriesPath[0],
        "'",
        ', got" << xsr.qualifiedName() << "at XML line" << xsr.lineNumber() << ":" << xsr.errorString();',
        sep="",
    )
    print(depth * " ", "*ok = false;", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    depth -= 4
    print(depth * " ", "} else {", sep="")
    depth += 4
    print(
        depth * " ",
        'qCWarning(LOG_KBIBTEX_NETWORKING) << "Could not read start element at XML line" << xsr.lineNumber() << ":" << xsr.errorString();',
        sep="",
    )
    print(depth * " ", "*ok = false;", sep="")
    depth -= 4
    print(depth * " ", "}", sep="")
    print("\n")


def _readConfiguration(filename: str) -> None:
    """Read the configuration file and store its content in the module's global variables.

    Lines of the form 'key: value' start a new setting; lines starting with a
    space continue the value of the most recent key (joined by a line break).
    Empty lines and lines starting with '#' are ignored. Lines that are
    neither a key-value pair nor a continuation of one are skipped with a
    warning on stderr, so that a stale key/value pair is never applied twice.
    """
    global format, entries, introduction, postprocessingmapping, postprocessingfields
    global entryvalidation, entrytype, entryid

    key: typing.Optional[str] = None
    with open(filename, encoding="utf-8") as configfile:
        for lineno, line in enumerate(configfile, start=1):
            # Remove whitespace on right side
            line = line.rstrip()
            # Skip empty lines or comments
            if not line or line[0] == "#":
                continue

            if line[0] == " ":
                # Lines that start with spaces are continuations of a previous line,
                # so the value of this line is its content, stripped from surrounding spaces
                if key is None:
                    print(f"{filename}:{lineno}: continuation line without preceding key, skipped", file=sys.stderr)
                    continue
                value = "\n" + line.strip()
            else:
                # Maybe line can be split into key-value pairs?
                colonpos = line.find(": ")
                if colonpos <= 0:
                    print(f"{filename}:{lineno}: line is not of the form 'key: value', skipped", file=sys.stderr)
                    continue
                # So there is a separator string (': '), so split into key-value pair
                key = line[:colonpos].strip()
                value = line[colonpos + 2 :].strip()

            # Update various global variables, depending on value read from line
            if key == "format":
                format += value
            elif key == "entries":
                entries += value
            elif key == "introduction":
                introduction += " " + value
            elif key == "entryvalidation":
                entryvalidation += value
            elif key == "postprocessingmapping":
                postprocessingmapping += value
            elif key == "postprocessingfields":
                postprocessingfields += value
            elif key == "entrytype":
                entrytype += value
            elif key == "entryid":
                entryid += value
            elif key.startswith("field["):
                fkey = key[6:-1]  # text between 'field[' and the closing ']'
                field[fkey] = field.get(fkey, "") + value
            elif key.startswith("valueItemType["):
                vitkey = key[14:-1]  # text between 'valueItemType[' and the closing ']'
                valueItemType[vitkey] = valueItemType.get(vitkey, "") + value
            elif key.startswith("value["):
                vkey = key[6:-1]  # text between 'value[' and the closing ']'
                valuemap[vkey] = valuemap.get(vkey, "") + value


def main() -> None:
    """Read the configuration file named by the last command line argument and print the generated C++ code.

    Raises ValueError if the configuration does not specify a supported format.
    """
    # Read configuration file provided as the single argument to this Python script invocation
    _readConfiguration(sys.argv[-1])

    if format not in {"xml", "json"}:
        raise ValueError("Missing or unsupported format: " + str(format))

    # Beginning of generated C++ code, hinting that the following code is generated by this script
    # and shall not be manually messed around
    print(" // Source code generated by Python script 'codegenerator-dataparser.py'")
    print(f" // using information from configuration file '{sys.argv[-1]}'", end="\n\n")

    if format == "xml":
        xmlParser()
    elif format == "json":
        # TODO
        pass
        # jsonParser()


if __name__ == "__main__":
    main()