python/comics_project_management_tools/comics_project_translation_scraper.py

0001 """
0002 SPDX-FileCopyrightText: 2018 Wolthera van Hövell tot Westerflier <griffinvalley@gmail.com>
0003
0004 This file is part of the Comics Project Management Tools(CPMT).
0005
0006 SPDX-License-Identifier: GPL-3.0-or-later
0007 """
0008
0009 """
0010 A class for getting translatable strings out.
0011
0012 This class does several things:
0013 1) It can parse through kra files' document.xml, and then through the svgs that file is pointing at.
0014 2) It can parse a preexisting POT file to ensure it isn't making duplicates.
0015 3) It can write a POT file.
0016 4) Writing to a csv file was considered until the realisation hit that comic dialog itself contains commas.
0017 """
0018
0019 import sys
0020 import os
0021 import csv
0022 import zipfile
0023 import types
0024 from xml.dom import minidom
0025 from PyQt5.QtCore import QDateTime, Qt
0026
0027
class translation_scraper():
    """
    Collects translatable strings from comic pages and writes them to a POT file.

    The scraper reads the text shape layers of the given kra pages, remembers the
    strings it finds in order of appearance, and writes them (together with some
    metadata entries and the page titles) to "<projectName>.pot" inside the
    translation folder. A POT file that already exists there is parsed first, so
    that the comments stored with a string survive a rewrite.

    Shape of the data kept on the instance:
    translationDict -- maps a key (the msgctxt, or the msgid when there is no
                       msgctxt) to an entry dict with the optional fields "text",
                       "trans", "translator" and "extract". The entry under
                       languageKey is the plain language value passed to start().
    translationKeys -- keys of the strings found in the pages, in order of appearance.
    pageTitleKeys   -- titles of the pages whose keywords contain "acbf_title".
    """
    # Class-level defaults. __init__ rebinds every mutable one, so that no state
    # is shared between instances.
    projectURL = str()
    translation_folder = str()
    textLayerNameList = []
    translationDict = {}
    translationKeys = []  # separate so that the keys will be somewhat according to the order of appearance.
    pageTitleKeys = []
    projectName = str()
    languageKey = "AA_language"

    def __init__(self, projectURL=str(), translation_folder=str(), textLayerNameList=None, projectName=str()):
        """
        Set up the scraper and parse a preexisting POT file, if there is one.

        projectURL         -- root folder of the comic project.
        translation_folder -- folder with the translation files, relative to projectURL.
        textLayerNameList  -- name fragments; a shape layer whose name contains one
                              of them is treated as a text layer. None means no names.
        projectName        -- base name of the POT file.
        """
        self.projectURL = projectURL
        self.projectName = projectName
        self.translation_folder = translation_folder
        self.textLayerNameList = [] if textLayerNameList is None else textLayerNameList
        self.translationDict = {}
        # Without this the class-level list would be appended to, leaking keys
        # from one scraper instance into the next.
        self.translationKeys = []
        self.pageTitleKeys = []

        # Check for a preexisting translation file and parse that.
        folder = os.path.join(self.projectURL, self.translation_folder)
        if not os.path.isdir(folder):
            # Nothing to parse yet; write_pot creates the folder when needed.
            return
        potName = None
        with os.scandir(folder) as entries:
            for entry in entries:
                if entry.name.endswith(projectName + '.pot') and entry.is_file():
                    potName = entry.name
                    break
        if potName is not None:
            self.parse_pot(os.path.join(folder, potName))

    def start(self, pagesList, language, metaData=None):
        """
        Scrape all pages and write the POT file.

        pagesList -- page file paths, relative to projectURL.
        language  -- stored under languageKey unless a value is already there.
        metaData  -- optional dict; the "title", "summary", "keywords" and
                     "transnotes" values are written as metadata entries.
        """
        if metaData is None:
            metaData = {}
        if self.languageKey not in self.translationDict:
            self.translationDict[self.languageKey] = language
        for p in pagesList:
            self.get_svg_layers(os.path.join(self.projectURL, p))
        self.write_pot(metaData)

    @staticmethod
    def _escape(string):
        """Escape the characters that write_pot writes escaped; _read_quoted undoes this."""
        string = string.replace("\"", "\\\"")
        string = string.replace("\'", "\\\'")
        string = string.replace("#", "\\#")
        return string

    @staticmethod
    def _read_quoted(fragment):
        """Return the unescaped content of a double-quoted POT string fragment."""
        value = fragment.strip()
        # Only the delimiting quotes are removed. str.strip() with a character
        # set would also eat leading/trailing letters of the text itself.
        if value.startswith("\""):
            value = value[1:]
        if value.endswith("\""):
            value = value[:-1]
        value = value.replace("\\\"", "\"")
        value = value.replace("\\\'", "\'")
        value = value.replace("\\#", "#")
        return value

    def _store_entry(self, key, entry):
        """Store a parsed entry under its msgctxt, or under its msgid when it has none."""
        if not entry:
            return
        if key is None:
            key = entry.get("text", None)
        # The header entry has an empty msgid and no msgctxt; it is not stored.
        if key:
            self.translationDict[key] = entry

    def parse_pot(self, location):
        """
        Parse the POT file at location into translationDict.

        Does nothing when the file does not exist. Entries are separated by blank
        lines. Only msgctxt, msgid, msgstr, translator comments ("# "), extracted
        comments ("#. ") and quoted continuation lines of msgid/msgstr are read.
        Comments are stored without their prefix and line ending, which is the
        form write_pot expects.
        """
        if not os.path.isfile(location):
            return
        multiLine = ""  # Field that quoted continuation lines are appended to.
        key = None
        entry = {}
        with open(location, "r", newline="", encoding="utf8") as file:
            for line in file:
                if line.isspace():
                    self._store_entry(key, entry)
                    entry = {}
                    key = None
                    multiLine = ""
                elif line.startswith("msgid "):
                    entry["text"] = self._read_quoted(line[len("msgid "):])
                    multiLine = "text"
                elif line.startswith("msgstr "):
                    entry["trans"] = self._read_quoted(line[len("msgstr "):])
                    multiLine = "trans"
                elif line.startswith("# "):
                    # Translator comment
                    entry["translator"] = line[len("# "):].rstrip("\r\n")
                elif line.startswith("#. "):
                    # Extracted comment
                    entry["extract"] = line[len("#. "):].rstrip("\r\n")
                elif line.startswith("msgctxt "):
                    key = self._read_quoted(line[len("msgctxt "):])
                elif line.startswith("\"") and multiLine:
                    entry[multiLine] += self._read_quoted(line)
        # The last entry is usually not followed by a blank line.
        self._store_entry(key, entry)

    def get_svg_layers(self, location):
        """
        Read the text layers and the page title of the kra file at location.

        Every shape layer in maindoc.xml whose name contains one of the
        textLayerNameList fragments has its content.svg passed to get_txt. When
        the keywords in documentinfo.xml contain "acbf_title", the document title
        is appended to pageTitleKeys.
        """
        # Read-only: mode "a" would need write access and would create an empty
        # archive for a page that does not exist.
        with zipfile.ZipFile(location, "r") as page:
            candidates = [name for name in page.namelist() if name.endswith('svg')]

            layerDoc = minidom.parseString(page.read("maindoc.xml"))
            self._scan_layers(layerDoc.documentElement, page, candidates)

            # Get page title if the keywords contain acbf_title
            infoDoc = minidom.parseString(page.read("documentinfo.xml"))
            info = {}
            self._read_document_info(infoDoc.documentElement, info)

        # Either element may be missing from documentinfo.xml.
        if "acbf_title" in info.get("key", []):
            self.pageTitleKeys.append(info.get("title", ""))

    def _scan_layers(self, node, page, candidates):
        """Recursively find the text shape layers below node and scrape their svg files."""
        for childNode in node.childNodes:
            # Only elements have a tagName; text, comments and the like are skipped.
            if childNode.nodeType != minidom.Node.ELEMENT_NODE:
                continue
            if childNode.tagName == "layer" and childNode.getAttribute("nodetype") == "shapelayer":
                layerName = childNode.getAttribute("name")
                if any(t in layerName for t in self.textLayerNameList):
                    svgPath = childNode.getAttribute("filename") + ".shapelayer/content.svg"
                    for c in candidates:
                        if svgPath in c:
                            self.get_txt(page.read(c))
            if childNode.childNodes:
                self._scan_layers(childNode, page, candidates)

    def _read_document_info(self, node, info):
        """Recursively collect the title ("title") and keyword list ("key") below node into info."""
        textTypes = (minidom.Node.TEXT_NODE, minidom.Node.CDATA_SECTION_NODE)
        for childNode in node.childNodes:
            if childNode.nodeType != minidom.Node.ELEMENT_NODE:
                continue
            if childNode.tagName == "title":
                info["title"] = "".join(t.data for t in childNode.childNodes if t.nodeType in textTypes)
            elif childNode.tagName == "keyword":
                k = "".join(t.data for t in childNode.childNodes if t.nodeType in textTypes)
                info["key"] = [word.strip() for word in k.split(",")]
            if childNode.childNodes:
                self._read_document_info(childNode, info)

    def get_txt(self, string):
        """
        Register the content of every text element in the svg document string.

        The serialized child nodes of a text element (markup included) form the
        string to translate. It is added to translationDict when it is not known
        yet, and to translationKeys when it is not listed yet.
        """
        svg = minidom.parseString(string)
        self._scan_text(svg.documentElement)

    def _scan_text(self, node):
        """Recursively register the text elements below node."""
        for childNode in node.childNodes:
            if childNode.nodeType != minidom.Node.ELEMENT_NODE:
                continue
            if childNode.tagName == "text":
                text = "".join(c.toxml() for c in childNode.childNodes)
                if not text:
                    # An empty msgid is reserved for the POT header.
                    continue
                if text not in self.translationDict:
                    self.translationDict[text] = {"text": text}
                if text not in self.translationKeys:
                    self.translationKeys.append(text)
            elif childNode.childNodes:
                self._scan_text(childNode)

    def write_pot(self, metaData):
        """
        Write the POT file to "<projectName>.pot" in the translation folder.

        The file holds the header, the metadata entries taken from metaData, one
        "@page-title" entry per page title and one entry per key in
        translationKeys. The translation folder is created when it is missing.
        """
        quote = "\""
        newLine = "\n"
        if metaData is None:
            metaData = {}
        folder = os.path.join(self.projectURL, self.translation_folder)
        if folder:
            os.makedirs(folder, exist_ok=True)
        location = os.path.join(folder, self.projectName + ".pot")
        date = QDateTime.currentDateTimeUtc().toString(Qt.ISODate)

        lines = [
            "msgid " + quote + quote,
            "msgstr " + quote + quote,
            quote + "POT-Creation-Date:" + date + "\\n" + quote,
            quote + "Content-Type: text/plain; charset=UTF-8\\n" + quote,
            quote + "Content-Transfer-Encoding: 8bit\\n" + quote,
            quote + "X-Generator: Krita Comics Project Manager Tools Plugin\\n" + quote,
        ]

        # NOTE(review): metadata values and page titles are written unescaped, as
        # before; a double quote in one of them yields a malformed line.
        metaEntries = (
            ("#. Title of the work", "@meta-title", "title"),
            ("#. The summary", "@meta-summary", "summary"),
            ("#. The keywords, these need to be comma separated.", "@meta-keywords", "keywords"),
            ("#. The header that will prepend translator's notes", "@meta-translator", "transnotes"),
        )
        for comment, context, metaKey in metaEntries:
            lines.append("")
            lines.append(comment)
            lines.append("msgctxt " + quote + context + quote)
            lines.append("msgid " + quote + metaData.get(metaKey, "") + quote)
            lines.append("msgstr " + quote + quote)

        for title in self.pageTitleKeys:
            lines.append("")
            lines.append("msgctxt " + quote + "@page-title" + quote)
            lines.append("msgid " + quote + title + quote)
            lines.append("msgstr " + quote + quote)

        for key in self.translationKeys:
            if key == self.languageKey:
                # That entry holds the language value, not a translatable string.
                continue
            entry = self.translationDict[key]
            lines.append("")
            if "translator" in entry:
                lines.append("# " + entry["translator"])
            if "extract" in entry:
                lines.append("#. " + entry["extract"])
            string = entry.get("text", key)
            # A context is only needed when the key is not the string itself.
            if string != key:
                lines.append("msgctxt " + quote + self._escape(key) + quote)
            lines.append("msgid " + quote + self._escape(string) + quote)
            lines.append("msgstr " + quote + quote)

        with open(location, "w", newline="", encoding="utf8") as file:
            file.write(newLine.join(lines) + newLine)
        print("CPMT: Translations have been written to:", location)