#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# This script generates a data file containing all Unicode information needed
# by KCharSelect.
#
##############################################################################
# SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
# SPDX-FileCopyrightText: 2016 John Zaitseff <J.Zaitseff@zap.org.au>
#
# SPDX-License-Identifier: LGPL-2.0-or-later
##############################################################################
#
# The current directory must contain the following files that can be found at
# http://www.unicode.org/Public/UNIDATA/:
# - UnicodeData.txt
# - Unihan_Readings.txt (you need to uncompress it from Unihan.zip)
# - NamesList.txt
# - Blocks.txt
#
# The generated file is named "kcharselect-data" and has to be put in
# kwidgetsaddons/src. Additionally, a translation dummy named
# "kcharselect-translation.cpp" is generated and has to be placed in the same
# directory.
#
# FILE STRUCTURE
#
# The generated file is a binary file. The first 40 bytes are the header and
# contain the position of each part of the file. Each entry is uint32.
#
# pos   content
#   0   names strings begin
#   4   names offsets begin
#   8   details strings begin
#  12   details offsets begin
#  16   block strings begin
#  20   block offsets begin
#  24   section strings begin
#  28   section offsets begin
#  32   unihan strings begin
#  36   unihan offsets begin
#
# The string parts always contain all strings in a row, each followed by a 0x00
# byte. There is one exception: the data for seeAlso in details is only 2
# bytes (as it is always _one_ unicode character) and _not_ followed by a 0x00
# byte.
#
# The offset parts contain entries with a fixed length. Unicode characters
# are always uint16 and offsets uint32. Offsets are positions in the data
# file.
#
# names_offsets:
#   each entry 6 bytes
#   16bit: unicode
#   32bit: offset to name in names_strings
#
# names_strings:
#   the first byte is the category (same values as QChar::Category),
#   directly followed by the character name (terminated by 0x00)
#
# nameslist_offsets:
#   char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count
#   16    32     8            32    8           32           8                 32     8            32       8
#   => each entry 27 bytes
#
# blocks_offsets:
#   each entry 4 bytes
#   16bit: start unicode
#   16bit: end unicode
#   Note that there is no string offset.
#
# section_offsets:
#   each entry 4 bytes
#   16bit: section offset
#   16bit: block offset
#   Note that these offsets are _not_ positions in the data file but indexes.
#   For example 0x0403 means the fourth section includes the third block.
#
# unihan_offsets:
#   each entry 30 bytes
#   16bit: unicode
#   32bit: offset to unihan_strings for Definition
#   32bit: offset to unihan_strings for Cantonese
#   32bit: offset to unihan_strings for Mandarin
#   32bit: offset to unihan_strings for Tang
#   32bit: offset to unihan_strings for Korean
#   32bit: offset to unihan_strings for JapaneseKun
#   32bit: offset to unihan_strings for JapaneseOn

from struct import *
import sys
import re
import io
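
# Illustrative sketch only (not used by this script): how a consumer could read
# the 40-byte header described above back from a generated file. The helper
# name and the default file name are assumptions made for this example.
def _read_header_example(path="kcharselect-data"):
    with open(path, "rb") as f:
        fields = unpack("=10I", f.read(40))  # ten uint32 values, no padding
    keys = ("names_strings", "names_offsets", "details_strings", "details_offsets",
            "block_strings", "block_offsets", "section_strings", "section_offsets",
            "unihan_strings", "unihan_offsets")
    return dict(zip(keys, fields))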

# Based on http://www.unicode.org/charts/, updated for Unicode 9.0
sectiondata = '''
SECTION European Scripts
Basic Latin
Latin-1 Supplement
Latin Extended-A
Latin Extended-B
Latin Extended-C
Latin Extended-D
Latin Extended-E
Latin Extended Additional
Armenian
Coptic
Cyrillic
Cyrillic Supplement
Cyrillic Extended-A
Cyrillic Extended-B
Cyrillic Extended-C
Georgian
Georgian Supplement
Georgian Extended
Glagolitic
Greek and Coptic
Greek Extended
Ogham
Runic

SECTION African Scripts
Bamum
Ethiopic
Ethiopic Supplement
Ethiopic Extended
Ethiopic Extended-A
NKo
Tifinagh
Vai

SECTION Middle Eastern Scripts
Arabic
Arabic Supplement
Arabic Extended-A
Arabic Extended-B
Arabic Presentation Forms-A
Arabic Presentation Forms-B
Hebrew
Mandaic
Samaritan
Syriac
Syriac Supplement

SECTION Central Asian Scripts
Mongolian
Phags-pa
Tibetan

SECTION South Asian Scripts
Bengali
Common Indic Number Forms
Devanagari
Devanagari Extended
Gujarati
Gurmukhi
Kannada
Lepcha
Limbu
Malayalam
Meetei Mayek
Meetei Mayek Extensions
Ol Chiki
Oriya
Saurashtra
Sinhala
Syloti Nagri
Tamil
Telugu
Thaana
Vedic Extensions

SECTION Southeast Asian Scripts
Cham
Kayah Li
Khmer
Khmer Symbols
Lao
Myanmar
Myanmar Extended-A
Myanmar Extended-B
New Tai Lue
Tai Le
Tai Tham
Tai Viet
Thai

SECTION Indonesia and Oceania Scripts
Balinese
Batak
Buginese
Buhid
Hanunoo
Javanese
Rejang
Sundanese
Sundanese Supplement
Tagalog
Tagbanwa

SECTION East Asian Scripts
Bopomofo
Bopomofo Extended
CJK Unified Ideographs
CJK Unified Ideographs Extension A
CJK Compatibility
CJK Compatibility Ideographs
CJK Compatibility Forms
CJK Radicals Supplement
CJK Strokes
CJK Symbols and Punctuation
Enclosed CJK Letters and Months
Hangul Jamo
Hangul Jamo Extended-A
Hangul Jamo Extended-B
Hangul Compatibility Jamo
Hangul Syllables
Hiragana
Ideographic Description Characters
Kanbun
Kangxi Radicals
Katakana
Katakana Phonetic Extensions
Lisu
Yi Radicals
Yi Syllables

SECTION American Scripts
Cherokee
Cherokee Supplement
Unified Canadian Aboriginal Syllabics
Unified Canadian Aboriginal Syllabics Extended

SECTION Symbols
General Punctuation
Alchemical Symbols
Braille Patterns
Chess Symbols
Control Pictures
Currency Symbols
Dingbats
Domino Tiles
Emoticons
Enclosed Alphanumerics
Enclosed Alphanumeric Supplement
Enclosed Ideographic Supplement
Mahjong Tiles
Miscellaneous Symbols
Miscellaneous Symbols and Pictographs
Miscellaneous Technical
Optical Character Recognition
Ornamental Dingbats
Playing Cards
Small Form Variants
Supplemental Punctuation
Supplemental Symbols and Pictographs
Symbols and Pictographs Extended-A
Symbols for Legacy Computing
Transport and Map Symbols
Vertical Forms
Yijing Hexagram Symbols

SECTION Mathematical Symbols
Arrows
Block Elements
Box Drawing
Geometric Shapes
Geometric Shapes Extended
Letterlike Symbols
Mathematical Operators
Miscellaneous Mathematical Symbols-A
Miscellaneous Mathematical Symbols-B
Miscellaneous Symbols and Arrows
Number Forms
Superscripts and Subscripts
Supplemental Arrows-A
Supplemental Arrows-B
Supplemental Arrows-C
Supplemental Mathematical Operators

SECTION Phonetic Symbols
IPA Extensions
Modifier Tone Letters
Phonetic Extensions
Phonetic Extensions Supplement
Spacing Modifier Letters

SECTION Combining Diacritics
Combining Diacritical Marks
Combining Diacritical Marks Extended
Combining Diacritical Marks Supplement
Combining Diacritical Marks for Symbols
Combining Half Marks

SECTION Other
Alphabetic Presentation Forms
Halfwidth and Fullwidth Forms
High Private Use Surrogates
High Surrogates
Low Surrogates
Private Use Area
Specials
Variation Selectors
'''

categoryMap = { # same values as QChar::Category
    "Mn": 1,
    "Mc": 2,
    "Me": 3,
    "Nd": 4,
    "Nl": 5,
    "No": 6,
    "Zs": 7,
    "Zl": 8,
    "Zp": 9,
    "Cc": 10,
    "Cf": 11,
    "Cs": 12,
    "Co": 13,
    "Cn": 14,
    "Lu": 15,
    "Ll": 16,
    "Lt": 17,
    "Lm": 18,
    "Lo": 19,
    "Pc": 20,
    "Pd": 21,
    "Ps": 22,
    "Pe": 23,
    "Pi": 24,
    "Pf": 25,
    "Po": 26,
    "Sm": 27,
    "Sc": 28,
    "Sk": 29,
    "So": 30
}


# Temporary code point remapping
#
# Initial SMP support without needing a new data file format
# - BMP U+Fxxx are remapped to U+Exxx
# - SMP symbols U+1Fxxx are remapped to U+Fxxx
# - Private Use Area is limited to U+F000 ... U+F8FF

def remap(char):
    cp = int(char, 16)
    if cp >= 0xE000 and cp <= 0xFFFF:
        return "E"+char[1:]
    if cp >= 0x1F000 and cp <= 0x1FFFF:
        return char[1:]
    return char
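
# For example (illustrative, based on the ranges handled above):
# remap("FB01") == "EB01", remap("1F600") == "F600", and remap("0041") is
# returned unchanged.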

class Names:
    def __init__(self):
        self.names = []
        self.controlpos = -1

    def addName(self, uni, name, category):
        self.names.append([uni, name, category])

    def calculateStringSize(self):
        size = 0
        hadcontrol = False
        for entry in self.names:
            if entry[1] == "<control>":
                if not hadcontrol:
                    size += len(entry[1]) + 2
                    hadcontrol = True
            else:
                size += len(entry[1]) + 2
        return size

    def calculateOffsetSize(self):
        return len(self.names)*6

    def writeStrings(self, out, pos):
        hadcontrol = False
        for entry in self.names:
            if entry[1] == "<control>":
                if not hadcontrol:
                    out.write(pack("=b", entry[2]))
                    out.write(entry[1].encode("utf-8") + b"\0")
                    size = len(entry[1]) + 2
                    entry[1] = pos
                    self.controlpos = pos
                    pos += size
                    hadcontrol = True
                else:
                    entry[1] = self.controlpos
            else:
                out.write(pack("=b", entry[2]))
                out.write(entry[1].encode("utf-8") + b"\0")
                size = len(entry[1]) + 2
                entry[1] = pos
                pos += size
        return pos

    def writeOffsets(self, out, pos):
        for entry in self.names:
            out.write(pack("=HI", int(entry[0], 16), entry[1]))
            pos += 6
        return pos

class Details:
    def __init__(self):
        self.details = {}

    def addEntry(self, char, category, text):
        if not char in self.details:
            self.details[char] = {}
        if not category in self.details[char]:
            self.details[char][category] = []
        self.details[char][category].append(text)

    def calculateStringSize(self):
        size = 0
        for char in self.details.values():
            for cat in char.values():
                for s in cat:
                    if type(s) is str:
                        size += len(s.encode("utf-8")) + 1
                    else:
                        size += 2
        return size

    def calculateOffsetSize(self):
        return len(self.details)*27

    def writeStrings(self, out, pos):
        for char in self.details.values():
            for cat in char.values():
                for i in range(0, len(cat)):
                    s = cat[i]
                    if type(s) is str:
                        out.write(s.encode("utf-8") + b"\0")
                        size = len(s.encode("utf-8")) + 1
                    else:
                        out.write(pack("=H", s))
                        size = 2
                    cat[i] = pos
                    pos += size
        return pos

    def writeOffsets(self, out, pos):
        for char in self.details.keys():
            alias = 0
            alias_count = 0
            note = 0
            note_count = 0
            approxEquiv = 0
            approxEquiv_count = 0
            equiv = 0
            equiv_count = 0
            seeAlso = 0
            seeAlso_count = 0
            if "alias" in self.details[char]:
                alias = self.details[char]["alias"][0]
                alias_count = len(self.details[char]["alias"])

            if "note" in self.details[char]:
                note = self.details[char]["note"][0]
                note_count = len(self.details[char]["note"])

            if "approxEquiv" in self.details[char]:
                approxEquiv = self.details[char]["approxEquiv"][0]
                approxEquiv_count = len(self.details[char]["approxEquiv"])

            if "equiv" in self.details[char]:
                equiv = self.details[char]["equiv"][0]
                equiv_count = len(self.details[char]["equiv"])

            if "seeAlso" in self.details[char]:
                seeAlso = self.details[char]["seeAlso"][0]
                seeAlso_count = len(self.details[char]["seeAlso"])

            out.write(pack("=HIbIbIbIbIb", char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count))
            pos += 27

        return pos
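
# Note (added for clarity): with "=" (native byte order, no padding) the format
# "=HIbIbIbIbIb" used in Details.writeOffsets above packs 2 + 5 * (4 + 1) = 27
# bytes, matching the nameslist_offsets entry size documented in the header
# comment.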

class SectionsBlocks:
    def __init__(self):
        self.sections = []
        self.blocks = []
        self.blockList = []
        self.sectionList = []

    def addBlock(self, begin, end, name):
        self.blocks.append([begin, end, name])
        self.blockList.append(name)

    def addSection(self, section, block):
        self.sections.append([section, block])
        if not section in self.sectionList:
            self.sectionList.append(section)

    def calculateBlockStringSize(self):
        size = 0
        for block in self.blocks:
            size += len(block[2]) + 1
        return size

    def calculateBlockOffsetSize(self):
        return len(self.blocks) * 4

    def calculateSectionStringSize(self):
        size = 0
        lastsection = ""
        for section in self.sections:
            if section[0] != lastsection:
                size += len(section[0]) + 1
                lastsection = section[0]
        return size

    def calculateSectionOffsetSize(self):
        return len(self.sections) * 4

    def writeBlockStrings(self, out, pos):
        index = 0
        for block in self.blocks:
            out.write(block[2].encode("utf-8") + b"\0")
            size = len(block[2].encode("utf-8")) + 1
            found = False
            for section in self.sections:
                if section[1] == block[2]:
                    print("found", section)
                    section[1] = index
                    found = True
            if not found:
                print("Error: Did not find any category for block \""+block[2]+"\"")
                sys.exit(1)
            block[2] = index
            pos += size
            index += 1
        return pos

    def writeBlockOffsets(self, out, pos):
        for block in self.blocks:
            out.write(pack("=HH", int(block[0], 16), int(block[1], 16)))
            pos += 4
        return pos

    def writeSectionStrings(self, out, pos):
        lastsection = ""
        lastpos = 0
        index = -1
        for section in self.sections:
            if section[0] != lastsection:
                index += 1
                lastsection = section[0]
                out.write(section[0].encode("utf-8") + b"\0")
                size = len(section[0].encode("utf-8")) + 1
                section[0] = index
                lastpos = pos
                pos += size
            else:
                section[0] = index
        return pos

    def writeSectionOffsets(self, out, pos):
        for section in self.sections:
            out.write(pack("=HH", section[0], section[1]))
            pos += 4
        return pos

    def getBlockList(self):
        return self.blockList

    def getSectionList(self):
        return self.sectionList

class Unihan:
    def __init__(self):
        self.unihan = {}

    def addUnihan(self, uni, category, value):
        uni = int(uni, 16)
        if category != "kDefinition" and category != "kCantonese" and category != "kMandarin" and category != "kTang" and category != "kKorean" and category != "kJapaneseKun" and category != "kJapaneseOn":
            return
        if not uni in self.unihan:
            self.unihan[uni] = [None, None, None, None, None, None, None]
        if category == "kDefinition":
            self.unihan[uni][0] = value
        elif category == "kCantonese":
            self.unihan[uni][1] = value
        elif category == "kMandarin":
            self.unihan[uni][2] = value
        elif category == "kTang":
            self.unihan[uni][3] = value
        elif category == "kKorean":
            self.unihan[uni][4] = value
        elif category == "kJapaneseKun":
            self.unihan[uni][5] = value
        elif category == "kJapaneseOn":
            self.unihan[uni][6] = value

    def calculateStringSize(self):
        size = 0
        for char in self.unihan.keys():
            for entry in self.unihan[char]:
                if entry != None:
                    size += len(entry.encode("utf-8")) + 1
        return size

    def calculateOffsetSize(self):
        return len(self.unihan) * 30

    def writeStrings(self, out, pos):
        for char in self.unihan.keys():
            for i in range(0, 7):
                if self.unihan[char][i] != None:
                    out.write(self.unihan[char][i].encode("utf-8") + b"\0")
                    size = len(self.unihan[char][i].encode("utf-8")) + 1
                    self.unihan[char][i] = pos
                    pos += size
        return pos

    def writeOffsets(self, out, pos):
        for char in self.unihan.keys():
            out.write(pack("=H", char))
            for i in range(0, 7):
                if self.unihan[char][i] != None:
                    out.write(pack("=I", self.unihan[char][i]))
                else:
                    out.write(pack("=I", 0))
            pos += 30
        return pos
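
# Illustrative examples of the input lines the Parser methods below are written
# to match (the concrete values are assumptions about typical file content, and
# the UnicodeData.txt line is abridged to the fields this script uses):
#   UnicodeData.txt:     0041;LATIN CAPITAL LETTER A;Lu;...
#   Blocks.txt:          0000..007F; Basic Latin
#   Unihan_Readings.txt: U+4E00  kMandarin  yī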

class Parser:
    def parseUnicodeData(self, inUnicodeData, names):
        regexp = re.compile(r'^([^;]+);([^;]+);([^;]+)')
        for line in inUnicodeData:
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            uni = remap(m.group(1))
            name = m.group(2)
            category = m.group(3)
            if len(uni) > 4:
                continue
            names.addName(uni, name, categoryMap[category])

    def parseDetails(self, inNamesList, details):
        invalidRegexp = re.compile(r'^@')
        unicodeRegexp = re.compile(r'^([0-9A-F]+)')

        aliasRegexp = re.compile(r'^\s+=\s+(.+)$') #equal
        seeAlsoRegexp1 = re.compile(r'^\s+x\s+.*\s([0-9A-F]{4,6})\)$') #ex
        seeAlsoRegexp2 = re.compile(r'^\s+x\s+([0-9A-F]{4,6})$') #ex
        noteRegexp = re.compile(r'^\s+\*\s+(.+)$') #star
        approxEquivalentRegexp = re.compile(r'^\s+#\s+(.+)$') #pound
        equivalentRegexp = re.compile(r'^\s+:\s+(.+)$') #colon

        drop = 0
        currChar = 0

        for line in inNamesList:
            line = line[:-1]
            m1 = unicodeRegexp.match(line)
            m2 = aliasRegexp.match(line)
            m3 = noteRegexp.match(line)
            m4 = approxEquivalentRegexp.match(line)
            m5 = equivalentRegexp.match(line)
            m6 = seeAlsoRegexp1.match(line)
            m7 = seeAlsoRegexp2.match(line)
            if invalidRegexp.match(line):
                continue
            elif m1:
                mg1 = remap(m1.group(1))
                currChar = int(mg1, 16)
                if len(mg1) > 4:
                    drop = 1
                continue
            elif drop == 1:
                continue
            elif m2:
                value = m2.group(1)
                details.addEntry(currChar, "alias", value)
            elif m3:
                value = m3.group(1)
                details.addEntry(currChar, "note", value)
            elif m4:
                value = m4.group(1)
                details.addEntry(currChar, "approxEquiv", value)
            elif m5:
                value = m5.group(1)
                details.addEntry(currChar, "equiv", value)
            elif m6:
                value = int(remap(m6.group(1)), 16)
                if value < 0x10000:
                    details.addEntry(currChar, "seeAlso", value)
            elif m7:
                value = int(remap(m7.group(1)), 16)
                if value < 0x10000:
                    details.addEntry(currChar, "seeAlso", value)

    def parseBlocks(self, inBlocks, sectionsBlocks):
        regexp = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$')
        for line in inBlocks:
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            m1 = remap(m.group(1))
            m2 = remap(m.group(2))
            if len(m1) > 4:
                continue
            sectionsBlocks.addBlock(m1, m2, m.group(3))

    def parseSections(self, inSections, sectionsBlocks):
        currSection = ""
        for line in inSections:
            line = line[:-1]
            if len(line) == 0:
                continue
            temp = line.split(" ")
            if temp[0] == "SECTION":
                currSection = line[8:]
            elif currSection != "":
                sectionsBlocks.addSection(currSection, line)
            else:
                print("error in data file")
                sys.exit(1)

    def parseUnihan(self, inUnihan, unihan):
        regexp = re.compile(r'^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$')
        count = 0
        for line in inUnihan:
            if count % 100000 == 0:
                print("\b."); sys.stdout.flush()
            count += 1
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            if len(remap(m.group(1))) <= 4:
                unihan.addUnihan(remap(m.group(1)), m.group(2), m.group(3))

def writeTranslationDummy(out, data):
    out.write(b"""/* This file is part of the KDE libraries

    SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
    SPDX-FileCopyrightText: 2016 John Zaitseff <J.Zaitseff@zap.org.au>

    SPDX-License-Identifier: LGPL-2.0-or-later

    This file is autogenerated by kcharselect/kcharselect-generate-datafile.py
*/\n\n""")
    for group in data:
        for entry in group[1]:
            out.write(b"QT_TRANSLATE_NOOP3(\"KCharSelectData\", \""+entry.encode("utf-8")+b"\", \""+group[0].encode("utf-8")+b"\");\n")

out = open("kcharselect-data", "wb")
outTranslationDummy = open("kcharselect-translation.cpp", "wb")

inUnicodeData = open("UnicodeData.txt", "r")
inNamesList = open("NamesList.txt", "r")
inBlocks = open("Blocks.txt", "r")
inSections = io.StringIO(sectiondata)
inUnihan = open("Unihan_Readings.txt", "r")

if calcsize('=H') != 2 or calcsize('=I') != 4:
    print("Error: Sizes of ushort and uint are not 16 and 32 bit as expected")
    sys.exit(1)

names = Names()
details = Details()
sectionsBlocks = SectionsBlocks()
unihan = Unihan()

parser = Parser()

print("========== parsing files ===================")
parser.parseUnicodeData(inUnicodeData, names)
print("."); sys.stdout.flush()
parser.parseDetails(inNamesList, details)
print("\b."); sys.stdout.flush()
parser.parseBlocks(inBlocks, sectionsBlocks)
print("\b."); sys.stdout.flush()
parser.parseSections(inSections, sectionsBlocks)
print("\b."); sys.stdout.flush()
parser.parseUnihan(inUnihan, unihan)
print("\b."); sys.stdout.flush()

print("done.")

pos = 0

# write header, size: 40 bytes
print("========== writing header ==================")
out.write(pack("=I", 40))
print("names strings begin", 40)

namesOffsetBegin = names.calculateStringSize() + 40
out.write(pack("=I", namesOffsetBegin))
print("names offsets begin", namesOffsetBegin)

detailsStringBegin = namesOffsetBegin + names.calculateOffsetSize()
out.write(pack("=I", detailsStringBegin))
print("details strings begin", detailsStringBegin)

detailsOffsetBegin = detailsStringBegin + details.calculateStringSize()
out.write(pack("=I", detailsOffsetBegin))
print("details offsets begin", detailsOffsetBegin)

blocksStringBegin = detailsOffsetBegin + details.calculateOffsetSize()
out.write(pack("=I", blocksStringBegin))
print("block strings begin", blocksStringBegin)

blocksOffsetBegin = blocksStringBegin + sectionsBlocks.calculateBlockStringSize()
out.write(pack("=I", blocksOffsetBegin))
print("block offsets begin", blocksOffsetBegin)

sectionStringBegin = blocksOffsetBegin + sectionsBlocks.calculateBlockOffsetSize()
out.write(pack("=I", sectionStringBegin))
print("section strings begin", sectionStringBegin)

sectionOffsetBegin = sectionStringBegin + sectionsBlocks.calculateSectionStringSize()
out.write(pack("=I", sectionOffsetBegin))
print("section offsets begin", sectionOffsetBegin)

unihanStringBegin = sectionOffsetBegin + sectionsBlocks.calculateSectionOffsetSize()
out.write(pack("=I", unihanStringBegin))
print("unihan strings begin", unihanStringBegin)

unihanOffsetBegin = unihanStringBegin + unihan.calculateStringSize()
out.write(pack("=I", unihanOffsetBegin))
print("unihan offsets begin", unihanOffsetBegin)

end = unihanOffsetBegin + unihan.calculateOffsetSize()
print("end should be", end)

pos += 40

print("========== writing data ====================")

pos = names.writeStrings(out, pos)
print("names strings written, position", pos)
pos = names.writeOffsets(out, pos)
print("names offsets written, position", pos)
pos = details.writeStrings(out, pos)
print("details strings written, position", pos)
pos = details.writeOffsets(out, pos)
print("details offsets written, position", pos)
pos = sectionsBlocks.writeBlockStrings(out, pos)
print("block strings written, position", pos)
pos = sectionsBlocks.writeBlockOffsets(out, pos)
print("block offsets written, position", pos)
pos = sectionsBlocks.writeSectionStrings(out, pos)
print("section strings written, position", pos)
pos = sectionsBlocks.writeSectionOffsets(out, pos)
print("section offsets written, position", pos)
pos = unihan.writeStrings(out, pos)
print("unihan strings written, position", pos)
pos = unihan.writeOffsets(out, pos)
print("unihan offsets written, position", pos)

print("========== writing translation dummy ======")
translationData = [["KCharSelect section name", sectionsBlocks.getSectionList()], ["KCharselect unicode block name", sectionsBlocks.getBlockList()]]
writeTranslationDummy(outTranslationDummy, translationData)
print("done. make sure to copy both kcharselect-data and kcharselect-translation.cpp.")
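
# Illustrative sketch only (not called by this script; the helper name is an
# assumption): how a consumer could look up a character name in a generated
# file, following the names_offsets / names_strings layout documented in the
# header comment. `data` is the whole kcharselect-data file read as bytes.
def _lookup_name_example(data, codepoint):
    names_offsets_begin, details_strings_begin = unpack("=II", data[4:12])
    entry = names_offsets_begin
    while entry < details_strings_begin:
        uni, offset = unpack("=HI", data[entry:entry + 6])
        if uni == codepoint:
            end_of_name = data.index(b"\0", offset + 1)
            return data[offset + 1:end_of_name].decode("utf-8")  # skip the category byte
        entry += 6
    return None

# Example usage (illustrative):
#   with open("kcharselect-data", "rb") as f:
#       print(_lookup_name_example(f.read(), 0x0041))  # -> "LATIN CAPITAL LETTER A"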