kwidgetsaddons/src/kcharselect-generate-datafile.py

0001 #!/usr/bin/python3
0002 # -*- coding: utf-8 -*-
0003 #
0004 # This script generates a data file containing all Unicode information needed
0005 # by KCharSelect.
0006 #
0007 ##############################################################################
0008 # SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
0009 # SPDX-FileCopyrightText: 2016 John Zaitseff <J.Zaitseff@zap.org.au>
0010 #
0011 # SPDX-License-Identifier: LGPL-2.0-or-later
0012 ##############################################################################
0013 #
0014 # The current directory must contain the following files that can be found at
0015 # http://www.unicode.org/Public/UNIDATA/:
0016 # - UnicodeData.txt
0017 # - Unihan_Readings.txt (you need to uncompress it from Unihan.zip)
0018 # - NamesList.txt
0019 # - Blocks.txt
0020 #
0021 # The generated file is named "kcharselect-data" and has to be put in
0022 # kwidgetsaddons/src.  Additionally a translation dummy named
0023 # "kcharselect-translation.cpp" is generated and has to be placed in the same
0024 # directory.
0025 #
0026 # FILE STRUCTURE
0027 #
0028 # The generated file is a binary file. The first 40 bytes are the header and
0029 # contain the position of each part of the file. Each entry is uint32.
0030 #
0031 # pos   content
0032 # 0     names strings begin
0033 # 4     names offsets begin
0034 # 8     details strings begin
0035 # 12    details offsets begin
0036 # 16    block strings begin
0037 # 20    block offsets begin
0038 # 24    section strings begin
0039 # 28    section offsets begin
0040 # 32    unihan strings begin
0041 # 36    unihan offsets begin
0042 #
0043 # The string parts always contain all strings in a row, followed by a 0x00
0044 # byte.  There is one exception: The data for seeAlso in details is only 2
0045 # bytes (as it is always _one_ unicode character) and _not_ followed by a 0x00
0046 # byte.
0047 #
0048 # The offset parts contain entries with a fixed length.  Unicode characters
0049 # are always uint16 and offsets uint32.  Offsets are positions in the data
0050 # file.
0051 #
0052 # names_offsets:
0053 # each entry 6 bytes
0054 # 16bit: unicode
0055 # 32bit: offset to name in names_strings
0056 #
0057 # names_strings:
0058 # the first byte is the category (same values as QChar::Category),
0059 # directly followed by the character name (terminated by 0x00)
0060 #
0061 # nameslist_offsets:
0062 # char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count
0063 # 16    32     8            32    8           32           8                  32     8            32       8
0064 # => each entry 27 bytes
0065 #
0066 # blocks_offsets:
0067 # each entry 4 bytes
0068 # 16bit: start unicode
0069 # 16bit: end unicode
0070 # Note that there is no string offset.
0071 #
0072 # section_offsets:
0073 # each entry 4 bytes
0074 # 16bit: section offset
0075 # 16bit: block offset
0076 # Note that these offsets are _not_ positions in the data file but indexes.
0077 # For example 0x0403 means the fourth section includes the third block.
0078 #
0079 # unihan_offsets:
0080 # each entry 30 bytes
0081 # 16bit: unicode
0082 # 32bit: offset to unihan_strings for Definition
0083 # 32bit: offset to unihan_strings for Cantonese
0084 # 32bit: offset to unihan_strings for Mandarin
0085 # 32bit: offset to unihan_strings for Tang
0086 # 32bit: offset to unihan_strings for Korean
0087 # 32bit: offset to unihan_strings for JapaneseKun
0088 # 32bit: offset to unihan_strings for JapaneseOn
0089
0090 from struct import *
0091 import sys
0092 import re
0093 import io
0094
# Based on http://www.unicode.org/charts/, updated for Unicode 9.0
#
# Section layout, parsed by Parser.parseSections(): a line starting with
# "SECTION " opens a new section named by the rest of the line, every other
# non-empty line is the name of a block belonging to the current section, and
# empty lines are skipped.  SectionsBlocks.writeBlockStrings() matches these
# block names against the names read from Blocks.txt and aborts when a block
# from Blocks.txt is not listed in any section here.
sectiondata = '''
SECTION European Scripts
Basic Latin
Latin-1 Supplement
Latin Extended-A
Latin Extended-B
Latin Extended-C
Latin Extended-D
Latin Extended-E
Latin Extended Additional
Armenian
Coptic
Cyrillic
Cyrillic Supplement
Cyrillic Extended-A
Cyrillic Extended-B
Cyrillic Extended-C
Georgian
Georgian Supplement
Georgian Extended
Glagolitic
Greek and Coptic
Greek Extended
Ogham
Runic

SECTION African Scripts
Bamum
Ethiopic
Ethiopic Supplement
Ethiopic Extended
Ethiopic Extended-A
NKo
Tifinagh
Vai

SECTION Middle Eastern Scripts
Arabic
Arabic Supplement
Arabic Extended-A
Arabic Extended-B
Arabic Presentation Forms-A
Arabic Presentation Forms-B
Hebrew
Mandaic
Samaritan
Syriac
Syriac Supplement

SECTION Central Asian Scripts
Mongolian
Phags-pa
Tibetan

SECTION South Asian Scripts
Bengali
Common Indic Number Forms
Devanagari
Devanagari Extended
Gujarati
Gurmukhi
Kannada
Lepcha
Limbu
Malayalam
Meetei Mayek
Meetei Mayek Extensions
Ol Chiki
Oriya
Saurashtra
Sinhala
Syloti Nagri
Tamil
Telugu
Thaana
Vedic Extensions

SECTION Southeast Asian Scripts
Cham
Kayah Li
Khmer
Khmer Symbols
Lao
Myanmar
Myanmar Extended-A
Myanmar Extended-B
New Tai Lue
Tai Le
Tai Tham
Tai Viet
Thai

SECTION Indonesia and Oceania Scripts
Balinese
Batak
Buginese
Buhid
Hanunoo
Javanese
Rejang
Sundanese
Sundanese Supplement
Tagalog
Tagbanwa

SECTION East Asian Scripts
Bopomofo
Bopomofo Extended
CJK Unified Ideographs
CJK Unified Ideographs Extension A
CJK Compatibility
CJK Compatibility Ideographs
CJK Compatibility Forms
CJK Radicals Supplement
CJK Strokes
CJK Symbols and Punctuation
Enclosed CJK Letters and Months
Hangul Jamo
Hangul Jamo Extended-A
Hangul Jamo Extended-B
Hangul Compatibility Jamo
Hangul Syllables
Hiragana
Ideographic Description Characters
Kanbun
Kangxi Radicals
Katakana
Katakana Phonetic Extensions
Lisu
Yi Radicals
Yi Syllables

SECTION American Scripts
Cherokee
Cherokee Supplement
Unified Canadian Aboriginal Syllabics
Unified Canadian Aboriginal Syllabics Extended

SECTION Symbols
General Punctuation
Alchemical Symbols
Braille Patterns
Chess Symbols
Control Pictures
Currency Symbols
Dingbats
Domino Tiles
Emoticons
Enclosed Alphanumerics
Enclosed Alphanumeric Supplement
Enclosed Ideographic Supplement
Mahjong Tiles
Miscellaneous Symbols
Miscellaneous Symbols and Pictographs
Miscellaneous Technical
Optical Character Recognition
Ornamental Dingbats
Playing Cards
Small Form Variants
Supplemental Punctuation
Supplemental Symbols and Pictographs
Symbols and Pictographs Extended-A
Symbols for Legacy Computing
Transport and Map Symbols
Vertical Forms
Yijing Hexagram Symbols

SECTION Mathematical Symbols
Arrows
Block Elements
Box Drawing
Geometric Shapes
Geometric Shapes Extended
Letterlike Symbols
Mathematical Operators
Miscellaneous Mathematical Symbols-A
Miscellaneous Mathematical Symbols-B
Miscellaneous Symbols and Arrows
Number Forms
Superscripts and Subscripts
Supplemental Arrows-A
Supplemental Arrows-B
Supplemental Arrows-C
Supplemental Mathematical Operators

SECTION Phonetic Symbols
IPA Extensions
Modifier Tone Letters
Phonetic Extensions
Phonetic Extensions Supplement
Spacing Modifier Letters

SECTION Combining Diacritics
Combining Diacritical Marks
Combining Diacritical Marks Extended
Combining Diacritical Marks Supplement
Combining Diacritical Marks for Symbols
Combining Half Marks

SECTION Other
Alphabetic Presentation Forms
Halfwidth and Fullwidth Forms
High Private Use Surrogates
High Surrogates
Low Surrogates
Private Use Area
Specials
Variation Selectors
'''
0305
# Maps the two-letter general category abbreviation found in UnicodeData.txt
# to the numeric code stored in front of each name (same values as
# QChar::Category).  Codes are consecutive, starting at 1 for "Mn".
categoryMap = {
    abbreviation: code
    for code, abbreviation in enumerate(
        (
            "Mn", "Mc", "Me",
            "Nd", "Nl", "No",
            "Zs", "Zl", "Zp",
            "Cc", "Cf", "Cs", "Co", "Cn",
            "Lu", "Ll", "Lt", "Lm", "Lo",
            "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
            "Sm", "Sc", "Sk", "So",
        ),
        start=1,
    )
}
0338
0339
0340 # Temporary code point remapping
0341 #
0342 # Initial SMP support without needing a new data file format
0343 # - BMP U+Fxxx are remapped to U+Exxx
0344 # - SMP symbols U+1Fxxx are remapped to U+Fxxx
0345 # - Private Use Area is limited to U+F000 ... U+F8FF
0346
def remap(char):
    """Fold a hexadecimal code point string into the range the data file stores.

    char is a code point written in hexadecimal (without any "U+" prefix).
    - U+1F000..U+1FFFF: the leading character is dropped ("1F600" -> "F600").
    - U+E000..U+FFFF: the leading character is replaced by "E"
      ("F900" -> "E900").
    - Anything else is returned unchanged.
    """
    codepoint = int(char, 16)
    tail = char[1:]
    # The two ranges are disjoint, so the order of the checks is irrelevant.
    if 0x1F000 <= codepoint <= 0x1FFFF:
        return tail
    if 0xE000 <= codepoint <= 0xFFFF:
        return "E" + tail
    return char
0354
class Names:
    """Collects character names and writes the names part of the data file.

    Every name becomes one string record (category byte, UTF-8 name, 0x00)
    and one 6 byte offset record (16 bit code point, 32 bit file offset of
    the string record).  The name "<control>" is stored only once and shared
    by all entries that carry it.
    """

    def __init__(self):
        # Entries are mutable lists [code point as hex string, name, category
        # code]; writeStrings() replaces the name by the file offset of its
        # string record.
        self.names = []
        # File offset of the shared "<control>" record, -1 until written.
        self.controlpos = -1

    def addName(self, uni, name, category):
        """Register the character uni (hex string) with its name and category code."""
        self.names.append([uni, name, category])

    @staticmethod
    def _recordSize(name):
        """Return the bytes one string record takes: category + UTF-8 name + 0x00."""
        # Measured on the encoded name so that the size always agrees with
        # what writeStrings() actually puts into the file.
        return len(name.encode("utf-8")) + 2

    def calculateStringSize(self):
        """Return the total size in bytes of all string records.

        Must be called before writeStrings(), which overwrites the names.
        """
        size = 0
        hadcontrol = False
        for entry in self.names:
            if entry[1] == "<control>":
                if hadcontrol:
                    # Shared record, already counted.
                    continue
                hadcontrol = True
            size += self._recordSize(entry[1])
        return size

    def calculateOffsetSize(self):
        """Return the size in bytes of the offset table (6 bytes per entry)."""
        return len(self.names)*6

    def writeStrings(self, out, pos):
        """Write all string records to out and return the position after them.

        pos is the file position of the first record.  Each entry's name is
        replaced by the position of its record.
        """
        hadcontrol = False
        for entry in self.names:
            name = entry[1]
            if name == "<control>" and hadcontrol:
                # Point at the record written for the first "<control>".
                entry[1] = self.controlpos
                continue
            encoded = name.encode("utf-8")
            out.write(pack("=b", entry[2]))
            out.write(encoded + b"\0")
            entry[1] = pos
            if name == "<control>":
                self.controlpos = pos
                hadcontrol = True
            pos += len(encoded) + 2
        return pos

    def writeOffsets(self, out, pos):
        """Write the offset table to out and return the position after it.

        Requires writeStrings() to have run, since it reads the offsets
        stored there.
        """
        for entry in self.names:
            out.write(pack("=HI", int(entry[0], 16), entry[1]))
            pos += 6
        return pos
0404
class Details:
    """Collects the per-character annotations and writes the details part.

    self.details maps a code point (int) to a dict of category name ->
    list of values.  A value is either a text (str) or, for cross
    references, a code point (int).
    """

    def __init__(self):
        self.details = {}

    def addEntry(self, char, category, text):
        """Append text to the list kept for category of character char."""
        self.details.setdefault(char, {}).setdefault(category, []).append(text)

    def calculateStringSize(self):
        """Return the size in bytes of all values as written by writeStrings()."""
        total = 0
        for categories in self.details.values():
            for values in categories.values():
                # Texts are UTF-8 plus a terminating 0x00, code points take
                # two bytes and have no terminator.
                total += sum(
                    len(value.encode("utf-8")) + 1 if type(value) is str else 2
                    for value in values
                )
        return total

    def calculateOffsetSize(self):
        """Return the size in bytes of the offset table (27 bytes per character)."""
        return len(self.details)*27

    def writeStrings(self, out, pos):
        """Write all values to out and return the position after them.

        pos is the file position of the first value.  Every value in
        self.details is replaced by the position it was written at.
        """
        for categories in self.details.values():
            for values in categories.values():
                for index, value in enumerate(values):
                    if type(value) is str:
                        payload = value.encode("utf-8") + b"\0"
                    else:
                        payload = pack("=H", value)
                    out.write(payload)
                    values[index] = pos
                    pos += len(payload)
        return pos

    def writeOffsets(self, out, pos):
        """Write one 27 byte record per character and return the new position.

        A record holds the code point followed by (offset of first value,
        number of values) for alias, note, approxEquiv, equiv and seeAlso;
        both are 0 for a category the character does not have.
        """
        for char, categories in self.details.items():
            record = [char]
            for category in ("alias", "note", "approxEquiv", "equiv", "seeAlso"):
                if category in categories:
                    positions = categories[category]
                    record += [positions[0], len(positions)]
                else:
                    record += [0, 0]
            out.write(pack("=HIbIbIbIbIb", *record))
            pos += 27

        return pos
0480
class SectionsBlocks:
    """Collects blocks and sections and writes the block and section parts.

    self.blocks holds [begin, end, name] lists (begin/end as hex strings),
    self.sections holds [section name, block name] pairs.  The write
    methods replace the names in these lists by indexes.  blockList and
    sectionList keep the untouched names for the translation dummy.
    """

    def __init__(self):
        self.sections = []
        self.blocks = []
        self.blockList = []
        self.sectionList = []

    def addBlock(self, begin, end, name):
        """Register the block name covering the code points begin..end."""
        self.blocks.append([begin, end, name])
        self.blockList.append(name)

    def addSection(self, section, block):
        """Register that the block named block belongs to section."""
        self.sections.append([section, block])
        if section not in self.sectionList:
            self.sectionList.append(section)

    @staticmethod
    def _stringSize(text):
        """Return the bytes text takes in the file: UTF-8 encoding + 0x00."""
        # Measured on the encoded text so that the size always agrees with
        # what the write methods put into the file.
        return len(text.encode("utf-8")) + 1

    def calculateBlockStringSize(self):
        """Return the size in bytes of all block names.

        Must be called before writeBlockStrings(), which overwrites the names.
        """
        return sum(self._stringSize(block[2]) for block in self.blocks)

    def calculateBlockOffsetSize(self):
        """Return the size in bytes of the block table (4 bytes per block)."""
        return len(self.blocks) * 4

    def calculateSectionStringSize(self):
        """Return the size in bytes of all section names.

        A name is counted once per run of consecutive entries that share it,
        mirroring writeSectionStrings().  Must be called before
        writeSectionStrings(), which overwrites the names.
        """
        size = 0
        lastsection = ""
        for section in self.sections:
            if section[0] != lastsection:
                size += self._stringSize(section[0])
                lastsection = section[0]
        return size

    def calculateSectionOffsetSize(self):
        """Return the size in bytes of the section table (4 bytes per entry)."""
        return len(self.sections) * 4

    def writeBlockStrings(self, out, pos):
        """Write all block names to out and return the position after them.

        Replaces the block name by the block index, both in self.blocks and
        in every section entry naming that block.  Exits with status 1 when
        a block is not listed in any section.
        """
        for index, block in enumerate(self.blocks):
            name = block[2]
            encoded = name.encode("utf-8") + b"\0"
            out.write(encoded)
            found = False
            for section in self.sections:
                # Entries that were already resolved hold an int and never
                # compare equal to a name.
                if section[1] == name:
                    print("found", section)
                    section[1] = index
                    found = True
            if not found:
                print("Error: Did not find any category for block \""+name+"\"")
                sys.exit(1)
            block[2] = index
            pos += len(encoded)
        return pos

    def writeBlockOffsets(self, out, pos):
        """Write (start, end) code point pairs to out and return the new position."""
        for block in self.blocks:
            out.write(pack("=HH", int(block[0], 16), int(block[1], 16)))
            pos += 4
        return pos

    def writeSectionStrings(self, out, pos):
        """Write the section names to out and return the position after them.

        Replaces the section name of every entry by the section index.
        """
        lastsection = ""
        index = -1
        for section in self.sections:
            if section[0] != lastsection:
                index += 1
                lastsection = section[0]
                encoded = section[0].encode("utf-8") + b"\0"
                out.write(encoded)
                pos += len(encoded)
            section[0] = index
        return pos

    def writeSectionOffsets(self, out, pos):
        """Write (section index, block index) pairs and return the new position.

        Requires writeBlockStrings() and writeSectionStrings() to have run.
        Exits with status 1 when a section entry names a block that was never
        registered with addBlock(), instead of failing inside pack().
        """
        for section in self.sections:
            if isinstance(section[1], str):
                print("Error: Did not find block \""+section[1]+"\" referenced by a section")
                sys.exit(1)
            out.write(pack("=HH", section[0], section[1]))
            pos += 4
        return pos

    def getBlockList(self):
        """Return the block names in the order they were added."""
        return self.blockList

    def getSectionList(self):
        """Return the distinct section names in order of first appearance."""
        return self.sectionList
0571
class Unihan:
    """Collects selected Unihan readings and writes the unihan part.

    self.unihan maps a code point (int) to a list of seven values, one per
    field in _FIELDS order; a field that was never set is None.
    """

    # Recognised Unihan field names; the position in this tuple is the slot
    # used in the per-character list and in the 30 byte offset record.
    _FIELDS = (
        "kDefinition",
        "kCantonese",
        "kMandarin",
        "kTang",
        "kKorean",
        "kJapaneseKun",
        "kJapaneseOn",
    )

    def __init__(self):
        self.unihan = {}

    def addUnihan(self, uni, category, value):
        """Store value as field category of character uni (hex string).

        Fields not listed in _FIELDS are ignored.
        """
        codepoint = int(uni, 16)
        if category not in self._FIELDS:
            return
        slots = self.unihan.setdefault(codepoint, [None] * 7)
        slots[self._FIELDS.index(category)] = value

    def calculateStringSize(self):
        """Return the size in bytes of all stored values (UTF-8 + 0x00 each)."""
        return sum(
            len(value.encode("utf-8")) + 1
            for slots in self.unihan.values()
            for value in slots
            if value is not None
        )

    def calculateOffsetSize(self):
        """Return the size in bytes of the offset table (30 bytes per character)."""
        return len(self.unihan) * 30

    def writeStrings(self, out, pos):
        """Write all stored values to out and return the position after them.

        pos is the file position of the first value.  Every stored value is
        replaced by the position it was written at.
        """
        for slots in self.unihan.values():
            for index in range(7):
                value = slots[index]
                if value is None:
                    continue
                payload = value.encode("utf-8") + b"\0"
                out.write(payload)
                slots[index] = pos
                pos += len(payload)
        return pos

    def writeOffsets(self, out, pos):
        """Write one 30 byte record per character and return the new position.

        A record is the code point followed by the seven value offsets; an
        unset field is written as offset 0.
        """
        for codepoint, slots in self.unihan.items():
            out.write(pack("=H", codepoint))
            for index in range(7):
                offset = slots[index]
                out.write(pack("=I", 0 if offset is None else offset))
            pos += 30
        return pos
0628
class Parser:
    """Reads the Unicode input files and feeds the collector objects.

    Every parse method takes an iterable of text lines and the collector to
    fill.  Code points are passed through remap(); those that still need
    more than four hex digits afterwards are skipped.
    """

    @staticmethod
    def _stripNewline(line):
        """Return line without its trailing newline, if it has one."""
        # A plain line[:-1] would eat a real character from a final line
        # that is not newline-terminated.
        return line[:-1] if line.endswith("\n") else line

    def parseUnicodeData(self, inUnicodeData, names):
        """Read "code;name;category;..." lines and add them to names."""
        regexp = re.compile(r'^([^;]+);([^;]+);([^;]+)')
        for line in inUnicodeData:
            m = regexp.match(self._stripNewline(line))
            if not m:
                continue
            uni = remap(m.group(1))
            if len(uni) > 4:
                continue
            names.addName(uni, m.group(2), categoryMap[m.group(3)])

    def parseDetails(self, inNamesList, details):
        """Read NamesList.txt style lines and add the annotations to details.

        A line starting with a hex code point selects the current character;
        the indented lines that follow are classified by their marker:
        "=" alias, "*" note, "#" approxEquiv, ":" equiv, "x" seeAlso.
        Lines starting with "@" are ignored.
        """
        invalidRegexp = re.compile(r'^@')
        unicodeRegexp = re.compile(r'^([0-9A-F]+)')

        # Checked in this order; the first match wins.
        textRegexps = (
            ("alias", re.compile(r'^\s+=\s+(.+)$')), #equal
            ("note", re.compile(r'^\s+\*\s+(.+)$')), #star
            ("approxEquiv", re.compile(r'^\s+#\s+(.+)$')), #pound
            ("equiv", re.compile(r'^\s+:\s+(.+)$')), #colon
        )
        seeAlsoRegexp1 = re.compile(r'^\s+x\s+.*\s([0-9A-F]{4,6})\)$') #ex
        seeAlsoRegexp2 = re.compile(r'^\s+x\s+([0-9A-F]{4,6})$') #ex

        drop = False
        currChar = 0

        for line in inNamesList:
            line = self._stripNewline(line)
            if invalidRegexp.match(line):
                continue

            m = unicodeRegexp.match(line)
            if m:
                mapped = remap(m.group(1))
                currChar = int(mapped, 16)
                # Decide per character whether its annotations are kept.  The
                # flag has to be cleared again for in-range characters,
                # otherwise everything after the first out-of-range code
                # point is lost, including the code points that remap()
                # brings back into range.
                drop = len(mapped) > 4 #limit to 16bit
                continue
            if drop:
                continue

            for category, regexp in textRegexps:
                m = regexp.match(line)
                if m:
                    details.addEntry(currChar, category, m.group(1))
                    break
            else:
                m = seeAlsoRegexp1.match(line) or seeAlsoRegexp2.match(line)
                if m:
                    value = int(remap(m.group(1)), 16)
                    # Cross references are stored as 16 bit values.
                    if value < 0x10000:
                        details.addEntry(currChar, "seeAlso", value)

    def parseBlocks(self, inBlocks, sectionsBlocks):
        """Read "begin..end; name" lines and add them to sectionsBlocks."""
        regexp = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$')
        for line in inBlocks:
            m = regexp.match(self._stripNewline(line))
            if not m:
                continue
            begin = remap(m.group(1))
            end = remap(m.group(2))
            if len(begin) > 4:
                continue
            sectionsBlocks.addBlock(begin, end, m.group(3))

    def parseSections(self, inSections, sectionsBlocks):
        """Read the section list and add (section, block) pairs.

        A "SECTION <name>" line opens a section, every other non-empty line
        names a block of the current section.  Exits with status 1 when a
        block is listed before the first section.
        """
        currSection = ""
        for line in inSections:
            line = self._stripNewline(line)
            if len(line) == 0:
                continue
            if line.split(" ")[0] == "SECTION":
                # Everything after "SECTION " is the section name.
                currSection = line[8:]
            elif currSection != "":
                sectionsBlocks.addSection(currSection, line)
            else:
                print("error in data file")
                sys.exit(1)

    def parseUnihan(self, inUnihan, unihan):
        """Read "U+code<ws>field<ws>value" lines and add them to unihan."""
        regexp = re.compile(r'^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$')
        count = 0
        for line in inUnihan:
            # Progress indicator for the large input file.
            if count % 100000 == 0:
                print("\b."); sys.stdout.flush()
            count += 1
            m = regexp.match(self._stripNewline(line))
            if not m:
                continue
            uni = remap(m.group(1))
            if len(uni) <= 4:
                unihan.addUnihan(uni, m.group(2), m.group(3))
0736
def writeTranslationDummy(out, data):
    """Write a C++ file that only serves to expose strings to translators.

    out is a binary file object.  data is a sequence of groups, where
    group[0] is the translator comment and group[1] the list of strings; one
    QT_TRANSLATE_NOOP3 line is written per string, after a fixed license
    header.
    """
    header = b"""/* This file is part of the KDE libraries

   SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
   SPDX-FileCopyrightText: 2016 John Zaitseff <J.Zaitseff@zap.org.au>

   SPDX-License-Identifier: LGPL-2.0-or-later

   This file is autogenerated by kcharselect/kcharselect-generate-datafile.py
*/\n\n"""
    out.write(header)
    for group in data:
        for entry in group[1]:
            out.write(
                b'QT_TRANSLATE_NOOP3("KCharSelectData", "%s", "%s");\n'
                % (entry.encode("utf-8"), group[0].encode("utf-8"))
            )
0750
# Main script: parse the Unicode input files from the current directory, then
# write "kcharselect-data" and "kcharselect-translation.cpp".
#
# The input files are read completely before the output files are opened, so
# a missing or unreadable input does not truncate existing output.  Every
# file is opened in a "with" block so that it is closed (and the output
# flushed) even when the script exits early.

if calcsize('=H') != 2 or calcsize('=I') != 4:
    print("Error: Sizes of ushort and uint are not 16 and 32 bit as expected")
    sys.exit(1)

names = Names()
details = Details()
sectionsBlocks = SectionsBlocks()
unihan = Unihan()

parser = Parser()

# The input files are decoded as UTF-8 explicitly instead of relying on the
# locale's default encoding.
print("========== parsing files ===================")
with open("UnicodeData.txt", "r", encoding="utf-8") as inUnicodeData:
    parser.parseUnicodeData(inUnicodeData, names)
print("."); sys.stdout.flush()
with open("NamesList.txt", "r", encoding="utf-8") as inNamesList:
    parser.parseDetails(inNamesList, details)
print("\b."); sys.stdout.flush()
with open("Blocks.txt", "r", encoding="utf-8") as inBlocks:
    parser.parseBlocks(inBlocks, sectionsBlocks)
print("\b."); sys.stdout.flush()
inSections = io.StringIO(sectiondata)
parser.parseSections(inSections, sectionsBlocks)
print("\b."); sys.stdout.flush()
with open("Unihan_Readings.txt", "r", encoding="utf-8") as inUnihan:
    parser.parseUnihan(inUnihan, unihan)
print("\b."); sys.stdout.flush()

print("done.")

with open("kcharselect-data", "wb") as out:
    pos = 0

    #write header, size: 40 bytes
    # Each header field is the file position at which one part begins; the
    # positions are derived from the precomputed part sizes.
    print("========== writing header ==================")
    out.write(pack("=I", 40))
    print("names strings begin", 40)

    namesOffsetBegin = names.calculateStringSize() + 40
    out.write(pack("=I", namesOffsetBegin))
    print("names offsets begin", namesOffsetBegin)

    detailsStringBegin = namesOffsetBegin + names.calculateOffsetSize()
    out.write(pack("=I", detailsStringBegin))
    print("details strings begin", detailsStringBegin)

    detailsOffsetBegin = detailsStringBegin + details.calculateStringSize()
    out.write(pack("=I", detailsOffsetBegin))
    print("details offsets begin", detailsOffsetBegin)

    blocksStringBegin = detailsOffsetBegin + details.calculateOffsetSize()
    out.write(pack("=I", blocksStringBegin))
    print("block strings begin", blocksStringBegin)

    blocksOffsetBegin = blocksStringBegin + sectionsBlocks.calculateBlockStringSize()
    out.write(pack("=I", blocksOffsetBegin))
    print("block offsets begin", blocksOffsetBegin)

    sectionStringBegin = blocksOffsetBegin + sectionsBlocks.calculateBlockOffsetSize()
    out.write(pack("=I", sectionStringBegin))
    print("section strings begin", sectionStringBegin)

    sectionOffsetBegin = sectionStringBegin + sectionsBlocks.calculateSectionStringSize()
    out.write(pack("=I", sectionOffsetBegin))
    print("section offsets begin", sectionOffsetBegin)

    unihanStringBegin = sectionOffsetBegin + sectionsBlocks.calculateSectionOffsetSize()
    out.write(pack("=I", unihanStringBegin))
    print("unihan strings begin", unihanStringBegin)

    unihanOffsetBegin = unihanStringBegin + unihan.calculateStringSize()
    out.write(pack("=I", unihanOffsetBegin))
    print("unihan offsets begin", unihanOffsetBegin)

    end = unihanOffsetBegin + unihan.calculateOffsetSize()
    print("end should be", end)

    pos += 40

    print("========== writing data ====================")

    pos = names.writeStrings(out, pos)
    print("names strings written, position", pos)
    pos = names.writeOffsets(out, pos)
    print("names offsets written, position", pos)
    pos = details.writeStrings(out, pos)
    print("details strings written, position", pos)
    pos = details.writeOffsets(out, pos)
    print("details offsets written, position", pos)
    pos = sectionsBlocks.writeBlockStrings(out, pos)
    print("block strings written, position", pos)
    pos = sectionsBlocks.writeBlockOffsets(out, pos)
    print("block offsets written, position", pos)
    pos = sectionsBlocks.writeSectionStrings(out, pos)
    print("section strings written, position", pos)
    pos = sectionsBlocks.writeSectionOffsets(out, pos)
    print("section offsets written, position", pos)
    pos = unihan.writeStrings(out, pos)
    print("unihan strings written, position", pos)
    pos = unihan.writeOffsets(out, pos)
    print("unihan offsets written, position", pos)

    # The header was written from the precomputed sizes; if the data that
    # was actually written ends elsewhere, the header positions are wrong
    # and the file is unusable.
    if pos != end:
        print("Error: data ends at position", pos, "but the header was computed for", end)
        sys.exit(1)

print("========== writing translation dummy  ======")
translationData = [["KCharSelect section name", sectionsBlocks.getSectionList()], ["KCharselect unicode block name",sectionsBlocks.getBlockList()]]
with open("kcharselect-translation.cpp", "wb") as outTranslationDummy:
    writeTranslationDummy(outTranslationDummy, translationData)
print("done. make sure to copy both kcharselect-data and kcharselect-translation.cpp.")