emojier/data/update-emoji-data.py

0001 #!/usr/bin/env python3
0002 #
0003 # Helper script to generate Emoji dictionary file from unicode data.
0004 #
0005 # SPDX-FileCopyrightText: 2022 Weng Xuetian <wengxt@gmail.com>
0006 # SPDX-License-Identifier: LGPL-2.0-or-later
0007 #
0008 import collections
0009 import requests
0010 import io
0011 import zipfile
0012 import os
0013 import glob
0014 import xml.dom.minidom
0015 from PyQt5.QtCore import QFile, QDataStream, QByteArray, QIODevice, qCompress
0016
# Unicode emoji data release whose emoji-test.txt is downloaded.
EMOJI_VERSION = "15.0"
EMOJI_TEST_FILE = "emoji-test.txt"
# Fetched over HTTPS (like the CLDR archive below): the content ends up in
# generated sources, so it should not travel over plain HTTP.
EMOJI_TEST_URL = f"https://www.unicode.org/Public/emoji/{EMOJI_VERSION}/{EMOJI_TEST_FILE}"

# CLDR release providing the per-language annotation XML files.
CLDR_VERSION = "43.0"
CLDR_FILE = f"cldr-common-{CLDR_VERSION}.zip"
# The download directory is named after the major version only ("43").
CLDR_URL = f"https://unicode.org/Public/cldr/{CLDR_VERSION.split('.')[0]}/{CLDR_FILE}"
# Directories inside the CLDR zip archive that are scanned for *.xml files.
CLDR_ANNOTATIONS_DIR = "common/annotations"
CLDR_ANNOTATIONS_DERIVED_DIR = "common/annotationsDerived"
0026
class EmojiAnnotation:
    """Localized text collected for one emoji.

    ``description`` starts out as an empty string and ``annotations`` as an
    empty list; EmojiParser.parseCldr fills them from CLDR ``<annotation>``
    elements (the ``type="tts"`` text and the " | "-separated keywords
    respectively).
    """

    def __init__(self):
        # A new list is created per instance so entries never share keywords.
        self.annotations = list()
        self.description = str()
0031
class EmojiParser(object):
    """Parse Unicode emoji-test.txt and CLDR annotation XML documents.

    Attributes:
        variantMapping: maps an emoji sequence whose status is not
            "fully-qualified" to the fully-qualified sequence that shares
            its emoji-test.txt description.
        categoryNames: group names (as bytes) in file order, with " & "
            replaced by " and ".
        emojis: ordered mapping from each fully-qualified emoji string to
            the 1-based number of the group it was listed under.
    """

    def __init__(self):
        self.variantMapping = dict()
        self.categoryNames = []
        self.emojis = collections.OrderedDict()

    def parseEmojiTest(self, emojiTestData):
        """Populate categoryNames, emojis and variantMapping.

        Args:
            emojiTestData: raw bytes of emoji-test.txt.

        Lines that do not match ``code points ; status # emoji Ex.y name``
        or whose code points are not valid hexadecimal scalar values are
        skipped.
        """
        descriptionMapping = dict()
        currentGroup = 0
        GROUP_TAG = b"# group: "
        for line in emojiTestData.split(b"\n"):
            line = line.strip()
            if line.startswith(GROUP_TAG):
                currentGroup += 1
                # "&" has special meaning in Qt, which does not work well in the UI.
                self.categoryNames.append(line[len(GROUP_TAG):].replace(b" & ", b" and "))
                continue

            if line.startswith(b"#"):
                continue

            # line format: code points; status # emoji name
            segments = line.split(b";")
            if len(segments) != 2:
                continue
            # Split only on the first "#": the emoji itself or its name may
            # contain one (e.g. "keycap: #").
            metadata = segments[1].split(b"#", 1)
            if len(metadata) != 2:
                continue
            # Split only on the first " E" (the version marker): names such
            # as "flag: Ecuador" contain " E" again and must not be dropped.
            desc = metadata[1].strip().split(b" E", 1)
            if len(desc) != 2:
                continue
            # Still prefixed with the version ("1.0 grinning face"); it is
            # only used as a key to pair variants with their qualified form.
            description = desc[1]

            codes = segments[0].strip().split(b" ")
            try:
                emoji = "".join(chr(int(code, 16)) for code in codes)
            except (ValueError, OverflowError):
                # Not hexadecimal, or outside the Unicode range: skip the line.
                continue

            status = metadata[0].strip()
            if status == b"fully-qualified":
                self.emojis[emoji] = currentGroup
                descriptionMapping[description] = emoji
            else:
                fullyQualified = descriptionMapping.get(description)
                if fullyQualified:
                    self.variantMapping[emoji] = fullyQualified

    def parseCldr(self, cldrList):
        """Collect annotations for known emojis from CLDR XML documents.

        Args:
            cldrList: iterable of XML documents (as accepted by
                xml.dom.minidom.parseString), processed in order; values
                from later documents overwrite earlier ones for the same
                emoji.

        Returns:
            dict mapping fully-qualified emoji strings to EmojiAnnotation.
            Only emojis present in self.emojis (directly or through
            self.variantMapping) are included, so parseEmojiTest must have
            been called first.
        """
        annotations = dict()
        for data in cldrList:
            with xml.dom.minidom.parseString(data) as doc:
                for annotationNode in doc.getElementsByTagName("annotation"):
                    attributes = annotationNode.attributes
                    if "cp" not in attributes:
                        continue
                    emoji = attributes["cp"].nodeValue
                    if emoji not in self.emojis:
                        # Fall back to the fully-qualified form, if any.
                        emoji = self.variantMapping.get(emoji)
                    if not emoji:
                        continue
                    # Only elements consisting of exactly one text node are usable.
                    children = annotationNode.childNodes
                    if len(children) != 1 or children[0].nodeType != xml.dom.minidom.Node.TEXT_NODE:
                        continue
                    text = children[0].nodeValue
                    if emoji not in annotations:
                        annotations[emoji] = EmojiAnnotation()
                    annotation = annotations[emoji]
                    if "type" in attributes and attributes["type"].nodeValue == "tts":
                        annotation.description = text
                    else:
                        annotation.annotations = text.split(" | ")
        return annotations

    def writeEmojiCategory(self):
        """Write ../emojicategory.cpp listing every name in categoryNames.

        The path is relative to the current working directory. The file is
        written as UTF-8 regardless of the locale, matching the UTF-8
        decoding of the category names.
        """
        template_header='''/*
    SPDX-FileCopyrightText: 2022 Weng Xuetian <wegnxt@gmail.com>

    SPDX-License-Identifier: LGPL-2.0-or-later
*/
// Generated from emoji-test.txt
#include "emojicategory.h"

#include <KLazyLocalizedString>

const QStringList &getCategoryNames()
{
    static const QStringList names = {'''
        template_foot='''    };
    return names;
}
'''
        entries = [
            f"        QString::fromUtf8(kli18nc(\"Emoji Category\", \"{categoryName.decode('utf-8')}\").untranslatedText()),"
            for categoryName in self.categoryNames
        ]
        content = "\n".join([template_header, *entries, template_foot])
        with open("../emojicategory.cpp", "w", encoding="utf-8") as f:
            f.write(content)
0124
# Naive sanity check for running it under correct directory
if not os.path.exists("../emojicategory.cpp"):
    print("Please run this script under its own directory")
    raise SystemExit(1)

parser = EmojiParser()

# Download everything up front and fail loudly on HTTP errors, so that a
# failed download neither deletes the existing dictionaries nor overwrites
# emojicategory.cpp with data parsed from an error page.
print(f"Downloading {EMOJI_TEST_URL}")
emojiTestResponse = requests.get(EMOJI_TEST_URL)
emojiTestResponse.raise_for_status()

print(f"Downloading {CLDR_URL}")
response = requests.get(CLDR_URL)
response.raise_for_status()

print("Removing old *.dict files")
for olddict in glob.glob("*.dict"):
    os.remove(olddict)

print(f"Parsing {EMOJI_TEST_FILE}")
parser.parseEmojiTest(emojiTestResponse.content)
parser.writeEmojiCategory()

with zipfile.ZipFile(io.BytesIO(response.content)) as thezip:
    # Collect the XML file names available in each annotation directory.
    annotationsFiles = set()
    annotationsDerivedFiles = set()
    for zipinfo in thezip.infolist():
        dirname = os.path.dirname(zipinfo.filename)
        basename = os.path.basename(zipinfo.filename)
        if not basename.endswith(".xml"):
            continue
        if dirname == CLDR_ANNOTATIONS_DIR:
            annotationsFiles.add(basename)
        elif dirname == CLDR_ANNOTATIONS_DERIVED_DIR:
            annotationsDerivedFiles.add(basename)

    # Only languages present in both directories are processed; sorting
    # keeps the processing order (and the log) deterministic.
    files = annotationsFiles.intersection(annotationsDerivedFiles)
    for langfile in sorted(files):
        dictfilename = langfile[:-4] + ".dict"
        print(f"Generating {dictfilename}")
        # Zip member names always use "/" as separator, so build them
        # explicitly instead of with os.path.join.
        annotationsPath = f"{CLDR_ANNOTATIONS_DIR}/{langfile}"
        annotationsDerivedPath = f"{CLDR_ANNOTATIONS_DERIVED_DIR}/{langfile}"
        with thezip.open(annotationsPath) as annotationsFile, thezip.open(annotationsDerivedPath) as annotationsDerivedFile:
            annotations = parser.parseCldr([annotationsFile.read(), annotationsDerivedFile.read()])

        # Keep emoji-test.txt order; drop emojis without a description.
        filtered_emojis = [(emoji, category) for (emoji, category) in parser.emojis.items() if emoji in annotations and annotations[emoji].description]
        # There's indeed some annotations file with 0 entries.
        if not filtered_emojis:
            print(f"Skipping {dictfilename}")
            continue

        dictfile = QFile(dictfilename)
        if not dictfile.open(QIODevice.WriteOnly):
            print(f"Failed to open {dictfilename} for writing")
            continue

        # Serialize into memory first; the file receives the compressed buffer.
        buf = QByteArray()
        stream = QDataStream(buf, QIODevice.WriteOnly)
        stream.setVersion(QDataStream.Qt_5_15)
        stream.setByteOrder(QDataStream.LittleEndian)
        stream.writeUInt32(len(filtered_emojis))
        for emoji, category in filtered_emojis:
            stream << QByteArray(emoji.encode("utf-8"))
            annotation = annotations[emoji]
            stream << QByteArray(annotation.description.encode("utf-8"))
            stream.writeInt32(category)
            # Write QList<QByteArray>
            stream.writeUInt32(len(annotation.annotations))
            for item in annotation.annotations:
                stream << QByteArray(item.encode("utf-8"))
        compressed = qCompress(buf)
        # QIODevice.write returns -1 on error.
        if dictfile.write(compressed) < 0:
            print(f"Failed to write {dictfilename}")
        dictfile.close()

print("Update Finished, please also update CategoryAction.qml with new category name.")