File indexing completed on 2024-05-12 17:07:04
#!/usr/bin/env python3
#
# Helper script to generate Emoji dictionary file from unicode data.
#
# SPDX-FileCopyrightText: 2022 Weng Xuetian <wengxt@gmail.com>
# SPDX-License-Identifier: LGPL-2.0-or-later
#
import collections
import glob
import io
import os
import sys
import xml.dom.minidom
import zipfile

EMOJI_VERSION = "15.0"
EMOJI_TEST_FILE = "emoji-test.txt"
EMOJI_TEST_URL = f"http://www.unicode.org/Public/emoji/{EMOJI_VERSION}/{EMOJI_TEST_FILE}"

CLDR_VERSION = "43.0"
CLDR_FILE = f"cldr-common-{CLDR_VERSION}.zip"
CLDR_URL = f"https://unicode.org/Public/cldr/{CLDR_VERSION.split('.')[0]}/{CLDR_FILE}"
CLDR_ANNOTATIONS_DIR = "common/annotations"
CLDR_ANNOTATIONS_DERIVED_DIR = "common/annotationsDerived"

# Seconds to wait for the server before giving up on a download.
DOWNLOAD_TIMEOUT = 60


class EmojiAnnotation(object):
    """Localized text attached to one emoji.

    description is the text of the type="tts" annotation, annotations is
    the list of keywords from the plain annotation.
    """

    def __init__(self):
        self.description = ""
        self.annotations = []


class EmojiParser(object):
    """Collects emoji, their categories and their localized annotations."""

    def __init__(self):
        # Maps a non fully-qualified emoji to its fully-qualified form.
        self.variantMapping = dict()
        # Group names (bytes) in the order they appear in emoji-test.txt.
        self.categoryNames = []
        # Maps a fully-qualified emoji to its 1-based group number.
        self.emojis = collections.OrderedDict()

    def parseEmojiTest(self, emojiTestData):
        """Parse the content of emoji-test.txt.

        emojiTestData is the raw file content as bytes. Fills
        self.categoryNames, self.emojis and self.variantMapping.
        Lines that do not match the expected format are skipped.
        """
        descriptionMapping = dict()
        currentGroup = 0
        GROUP_TAG = b"# group: "
        for line in emojiTestData.split(b"\n"):
            line = line.strip()
            if line.startswith(GROUP_TAG):
                currentGroup += 1
                # "&" has special meaning in Qt, which does not work well in the UI.
                self.categoryNames.append(line[len(GROUP_TAG):].replace(b" & ", b" and "))
                continue

            if line.startswith(b"#"):
                continue

            # line format: code points; status # emoji name
            segments = line.split(b";")
            if len(segments) != 2:
                continue
            # Split only on the first "#": the emoji name itself may contain
            # one (e.g. "keycap: #").
            metadata = segments[1].split(b"#", 1)
            if len(metadata) != 2:
                continue
            # Split only on the first " E" (the version marker): names such as
            # "END arrow" or "flag: Ecuador" contain " E" as well.
            desc = metadata[1].strip().split(b" E", 1)
            if len(desc) != 2:
                continue
            # Version and name, e.g. b"1.0 grinning face". It is only used as
            # a key to match the variants of one emoji with each other.
            description = desc[1]

            codes = segments[0].split()
            try:
                emoji = "".join(chr(int(code, 16)) for code in codes)
            except (ValueError, OverflowError):
                # Not a list of valid hexadecimal code points.
                continue

            status = metadata[0].strip()
            if status == b"fully-qualified":
                self.emojis[emoji] = currentGroup
                descriptionMapping[description] = emoji
            else:
                fullyQualified = descriptionMapping.get(description, None)
                if fullyQualified:
                    self.variantMapping[emoji] = fullyQualified

    def parseCldr(self, cldrList):
        """Parse CLDR annotation XML documents.

        cldrList is an iterable of XML documents (bytes or str). Returns a
        dict mapping each known fully-qualified emoji to an EmojiAnnotation.
        Annotations for emoji that parseEmojiTest did not record are ignored.
        """
        annotations = dict()
        for data in cldrList:
            with xml.dom.minidom.parseString(data) as doc:
                for annotationNode in doc.getElementsByTagName("annotation"):
                    if not annotationNode.hasAttribute("cp"):
                        continue
                    emoji = annotationNode.getAttribute("cp")
                    if emoji not in self.emojis:
                        # Fall back to the fully-qualified form of a variant.
                        emoji = self.variantMapping.get(emoji, None)
                    if not emoji:
                        continue
                    children = annotationNode.childNodes
                    if len(children) != 1 or children[0].nodeType != xml.dom.minidom.Node.TEXT_NODE:
                        continue
                    text = children[0].nodeValue
                    if emoji not in annotations:
                        annotations[emoji] = EmojiAnnotation()
                    annotation = annotations[emoji]
                    if annotationNode.getAttribute("type") == "tts":
                        annotation.description = text
                    else:
                        annotation.annotations = text.split(" | ")
        return annotations

    def writeEmojiCategory(self):
        """Write ../emojicategory.cpp from self.categoryNames."""
        template_header='''/*
    SPDX-FileCopyrightText: 2022 Weng Xuetian <wegnxt@gmail.com>

    SPDX-License-Identifier: LGPL-2.0-or-later
*/
// Generated from emoji-test.txt
#include "emojicategory.h"

#include <KLazyLocalizedString>

const QStringList &getCategoryNames()
{
    static const QStringList names = {'''
        template_foot='''    };
    return names;
}
'''
        entries = [
            f"        QString::fromUtf8(kli18nc(\"Emoji Category\", \"{categoryName.decode('utf-8')}\").untranslatedText()),"
            for categoryName in self.categoryNames
        ]
        content = "\n".join([template_header, *entries, template_foot])
        with open("../emojicategory.cpp", "w", encoding="utf-8") as f:
            f.write(content)


def _download(url):
    """Return the body of url as bytes, raising on HTTP errors."""
    # Imported here so the parser classes can be imported without requests.
    import requests

    response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
    # Without this check an error page would be parsed as if it were data.
    response.raise_for_status()
    return response.content


def _commonAnnotationFiles(thezip):
    """Return the .xml file names present in both annotation directories."""
    annotationsFiles = set()
    annotationsDerivedFiles = set()
    for zipinfo in thezip.infolist():
        dirname = os.path.dirname(zipinfo.filename)
        basename = os.path.basename(zipinfo.filename)
        if not basename.endswith(".xml"):
            continue
        if dirname == CLDR_ANNOTATIONS_DIR:
            annotationsFiles.add(basename)
        elif dirname == CLDR_ANNOTATIONS_DERIVED_DIR:
            annotationsDerivedFiles.add(basename)
    return annotationsFiles.intersection(annotationsDerivedFiles)


def _writeDict(dictfilename, filteredEmojis, annotations):
    """Serialize the emoji entries with QDataStream and write them compressed.

    filteredEmojis is a list of (emoji, category) pairs, annotations maps
    each of those emoji to its EmojiAnnotation. Returns False when the file
    could not be opened for writing, True otherwise.
    """
    # Imported here so the parser classes can be imported without PyQt5.
    from PyQt5.QtCore import QByteArray, QDataStream, QFile, QIODevice, qCompress

    dictfile = QFile(dictfilename)
    if not dictfile.open(QIODevice.WriteOnly):
        print(f"Failed to open {dictfilename} for writing")
        return False

    buf = QByteArray()
    stream = QDataStream(buf, QIODevice.WriteOnly)
    stream.setVersion(QDataStream.Qt_5_15)
    stream.setByteOrder(QDataStream.LittleEndian)
    stream.writeUInt32(len(filteredEmojis))
    for emoji, category in filteredEmojis:
        stream << QByteArray(emoji.encode("utf-8"))
        annotation = annotations[emoji]
        stream << QByteArray(annotation.description.encode("utf-8"))
        stream.writeInt32(category)
        # Write QList<QByteArray>
        stream.writeUInt32(len(annotation.annotations))
        for item in annotation.annotations:
            stream << QByteArray(item.encode("utf-8"))
    compressed = qCompress(buf)
    dictfile.write(compressed)
    dictfile.close()
    return True


def main():
    """Regenerate ../emojicategory.cpp and one .dict file per CLDR language."""
    # Naive sanity check for running it under correct directory
    if not os.path.exists("../emojicategory.cpp"):
        print("Please run this script under its own directory")
        sys.exit(1)

    print("Removing old *.dict files")
    for olddict in glob.glob("*.dict"):
        os.remove(olddict)

    parser = EmojiParser()

    print(f"Downloading {EMOJI_TEST_URL}")
    emojiTestData = _download(EMOJI_TEST_URL)
    print(f"Parsing {EMOJI_TEST_FILE}")
    parser.parseEmojiTest(emojiTestData)
    parser.writeEmojiCategory()

    print(f"Downloading {CLDR_URL}")
    cldrData = _download(CLDR_URL)

    with zipfile.ZipFile(io.BytesIO(cldrData)) as thezip:
        # Sorted only to make the order of the output stable between runs.
        for langfile in sorted(_commonAnnotationFiles(thezip)):
            dictfilename = langfile[:-4] + ".dict"
            print(f"Generating {dictfilename}")
            # Zip member names always use "/", whatever the platform.
            annotationsPath = f"{CLDR_ANNOTATIONS_DIR}/{langfile}"
            annotationsDerivedPath = f"{CLDR_ANNOTATIONS_DERIVED_DIR}/{langfile}"
            with thezip.open(annotationsPath) as annotationsFile, thezip.open(annotationsDerivedPath) as annotationsDerivedFile:
                annotations = parser.parseCldr([annotationsFile.read(), annotationsDerivedFile.read()])

            filtered_emojis = [(emoji, category) for (emoji, category) in parser.emojis.items() if emoji in annotations and annotations[emoji].description]
            # There's indeed some annotations file with 0 entries.
            if not filtered_emojis:
                print(f"Skipping {dictfilename}")
                continue

            _writeDict(dictfilename, filtered_emojis, annotations)

    print("Update Finished, please also update CategoryAction.qml with new category name.")


if __name__ == "__main__":
    main()