File indexing completed on 2024-12-01 12:35:03
# -*- coding: UTF-8 -*-

# Script that compiles Transcript property maps from text to binary format.
# Binary format greatly speeds up loading of property maps at runtime.
# http://techbase.kde.org/Localization/Concepts/Transcript
#
# Usage:
#   ts-pmap-compile.py file.pmap file.pmapc
#
# Works with Python >= 2.6 and >= 3.0.

import locale
import os
import re
import struct
import sys


cmdname = os.path.basename(sys.argv[0])
lenc = locale.getpreferredencoding()

# Characters removed from keys during normalization (whitespace and '&').
_KEYSTR_STRIP_RE = re.compile(r"[\s&]")
# Leading whitespace up to and including the first newline, or the last
# newline and any whitespace following it up to the end of the text.
_SMART_TRIM_RE = re.compile(r"^\s*\n|\n\s*$")


def error(msg, code=1):
    """Write an error message to standard error and exit the process.

    msg  -- message text, reported as "<cmdname>: error: <msg>"
    code -- process exit status (default 1)

    Raises SystemExit (through sys.exit); never returns.
    """
    text = "%s: error: %s\n" % (cmdname, msg)
    # On Python 3 the text is always a native str and must be written as is:
    # sys.stderr is a text stream and rejects bytes. Only a Python 2 unicode
    # object (which is not a str there) needs explicit encoding.
    if not isinstance(text, str):
        text = text.encode(lenc)
    sys.stderr.write(text)
    sys.exit(code)


def count_lines(text, tolen):
    """Return the 1-based line number of character offset tolen in text."""
    return text.count("\n", 0, tolen) + 1


def norm_keystr(text):
    """Return text normalized for use as a key.

    All whitespace and '&' characters are removed and the result is
    lowercased.
    """
    # Must do the same as normKeystr() in kdelibs/kdecore/ktranscript.cpp
    return _KEYSTR_STRIP_RE.sub("", text).lower()


def trim_smart(text):
    """Return text without its whitespace-only first and last line parts.

    Removes leading whitespace through the first newline, and the last
    newline together with whitespace after it; inner content is untouched.
    """
    return _SMART_TRIM_RE.sub("", text)


def read_pmap(fname):
    """Parse the text property map file fname.

    Returns a list of (entry_keys, properties) tuples, one per entry, where
    entry_keys is a list of normalized key strings and properties is a list
    of (property_key, property_value) string tuples in file order.
    An empty file yields an empty list.

    Malformed input is reported through error(), which exits the process.
    """

    # Adapted directly from C++ code.

    with open(fname, "rb") as fh:
        s = fh.read().decode("utf8")

    s_nextEntry, s_nextKey, s_nextValue = 1, 2, 3

    pmap = []

    class END_PROP_PARSE(Exception):
        pass

    try:
        slen = len(s)
        state = s_nextEntry
        ekeys = []  # holds keys for current entry
        props = []  # holds properties for current entry
        pkey = ""  # holds current property key
        i = 0
        while True:
            i_checkpoint = i

            if state == s_nextEntry:
                # Only reachable with i >= slen for an empty file;
                # indexing s[i] below would otherwise fail.
                if i >= slen:
                    raise END_PROP_PARSE

                while s[i].isspace():
                    i += 1
                    if i >= slen:
                        raise END_PROP_PARSE

                if i + 1 >= slen:
                    error("unexpected end of file %s" % fname)

                if s[i] != '#':
                    # Separator characters for this entry.
                    key_sep = s[i]
                    prop_sep = s[i + 1]
                    if key_sep.isalpha() or prop_sep.isalpha():
                        error("separator characters must not be letters "
                              "at %s:%d" % (fname, count_lines(s, i)))

                    # Reset all data for current entry.
                    ekeys = []
                    props = []
                    pkey = ""

                    i += 2
                    state = s_nextKey

                else:
                    # This is a comment, skip to EOL, don't change state.
                    while s[i] != '\n':
                        i += 1
                        if i >= slen:
                            raise END_PROP_PARSE

            elif state == s_nextKey:
                ip = i
                # Proceed up to next key or property separator.
                while s[i] != key_sep and s[i] != prop_sep:
                    i += 1
                    if i >= slen:
                        raise END_PROP_PARSE

                if s[i] == key_sep:
                    # This is a property key,
                    # record for when the value gets parsed.
                    pkey = norm_keystr(s[ip:i])

                    i += 1
                    state = s_nextValue

                else:  # s[i] == prop_sep
                    # This is an entry key, or end of entry.
                    ekey = norm_keystr(s[ip:i])
                    if ekey:
                        # An entry key.
                        ekeys.append(ekey)

                        i += 1
                        state = s_nextKey

                    else:
                        # End of entry.
                        if len(ekeys) < 1:
                            error("no entry key for entry ending "
                                  "at %s:%d" % (fname, count_lines(s, i)))

                        # Put collected properties into global store.
                        pmap.append((ekeys, props))

                        i += 1
                        state = s_nextEntry
                        # This check covers no newline at end of file.
                        if i >= slen:
                            raise END_PROP_PARSE

            elif state == s_nextValue:
                ip = i
                # Proceed up to next property separator.
                while s[i] != prop_sep:
                    i += 1
                    if i >= slen:
                        raise END_PROP_PARSE
                    if s[i] == key_sep:
                        error("property separator inside property value "
                              "at %s:%d" % (fname, count_lines(s, i)))

                # Extract the property value and store the property.
                pval = trim_smart(s[ip:i])
                props.append((pkey, pval))

                i += 1
                state = s_nextKey

            else:
                error("internal error 10 "
                      "at %s:%d" % (fname, count_lines(s, i)))

            # Input exhausted right after a separator: this is truncated
            # input rather than a parser fault, so finish through the
            # end-of-input handler below instead of indexing past the end.
            if i >= slen:
                raise END_PROP_PARSE

            # To avoid infinite looping.
            if i == i_checkpoint:
                error("internal error 20 "
                      "at %s:%d" % (fname, count_lines(s, i)))

    except END_PROP_PARSE:
        # Input may only end between entries.
        if state != s_nextEntry:
            error("unexpected end of file in %s" % fname)

    return pmap


# Convert integer to 32-bit big-endian byte sequence.
def int_bin_32(val):
    """Return val packed as a signed 32-bit big-endian byte sequence."""
    return struct.pack(">i", val)


# Convert integer to 64-bit big-endian byte sequence.
def int_bin_64(val):
    """Return val packed as a signed 64-bit big-endian byte sequence."""
    return struct.pack(">q", val)


# Convert string to UTF-8 byte sequence,
# preceded by its length in 32-bit big-endian.
def str_bin_32(val):
    """Return val as UTF-8 bytes prefixed by their 32-bit big-endian length."""
    val_enc = val.encode("utf8")
    return int_bin_32(len(val_enc)) + val_enc


# Concatenate byte sequence.
def catb(seq):
    """Return all byte sequences from the iterable seq joined into one."""
    return bytes().join(seq)


# Binary map format 00.
def write_map_bin_00(fh, pmap):
    """Write pmap to the binary file object fh in format 00.

    Layout: magic "TSPMAP00", number of entries, then for each entry the
    number of entry keys followed by the keys, and the number of properties
    followed by key/value pairs. Counts are 32-bit integers and strings are
    length-prefixed UTF-8 (see str_bin_32).
    """

    # Magic bytes.
    fh.write("TSPMAP00".encode("ascii"))

    # Number of entries.
    fh.write(int_bin_32(len(pmap)))

    for ekeys, props in pmap:
        # Number of phrase keys and all phrase keys.
        fh.write(int_bin_32(len(ekeys)))
        for ekey in ekeys:
            fh.write(str_bin_32(ekey))

        # Number of properties and all properties.
        fh.write(int_bin_32(len(props)))
        for pkey, pval in props:
            fh.write(str_bin_32(pkey))
            fh.write(str_bin_32(pval))


# Binary map format 01.
def write_map_bin_01(fh, pmap):
    """Write pmap to the binary file object fh in format 01.

    Layout, in order:
      - magic "TSPMAP01";
      - number of entry keys (32-bit), byte length of the entry key section
        (64-bit), then every entry key as a length-prefixed string followed
        by the 64-bit absolute file offset of its entry's property blob;
      - number of unique property keys (32-bit), byte length of that section
        (64-bit), then the sorted unique property keys;
      - number of entries (32-bit), then one property blob per entry:
        property count (32-bit), byte length of the pairs (32-bit), and the
        key/value pairs as length-prefixed strings.
    """

    offset0 = 0
    binint32len = len(int_bin_32(0))
    binint64len = len(int_bin_64(0))

    # Magic bytes.
    mbytestr = "TSPMAP01".encode("ascii")
    offset0 += len(mbytestr)

    # Compute length of binary representation of all entry keys
    # additionally equipped with offsets to corresponding property blobs.
    offset0 += binint32len
    offset0 += binint64len
    binekeyslen = 0
    for ekeys, d1 in pmap:
        binekeyslen += sum([len(str_bin_32(x)) + binint64len for x in ekeys])
    offset0 += binekeyslen

    # Construct binary representations of all unique property keys.
    offset0 += binint32len
    offset0 += binint64len
    allpkeys = set()
    for d1, props in pmap:
        allpkeys.update([x[0] for x in props])
    binpkeys = catb([str_bin_32(x) for x in sorted(allpkeys)])
    offset0 += len(binpkeys)

    # Construct binary representations of properties for each entry.
    # Compute byte offsets for each of these binary blobs, in the given order.
    binprops = []
    # The first blob starts right after the 32-bit entry count.
    offset = offset0 + binint32len
    for d1, props in pmap:
        cbinpairs = catb([str_bin_32(part) for prop in props for part in prop])
        cbinprops = catb([int_bin_32(len(props)), int_bin_32(len(cbinpairs)),
                          cbinpairs])
        binprops.append([cbinprops, offset])
        offset += len(cbinprops)

    # Construct binary representations of all entry keys with property offsets.
    allekeys = []
    binekeys = []
    for (ekeys, d1), (d2, offset) in zip(pmap, binprops):
        binoffset = int_bin_64(offset)
        cbinekeys = catb([str_bin_32(x) + binoffset for x in ekeys])
        binekeys.append(cbinekeys)
        allekeys.extend(ekeys)
    binekeys = catb(binekeys)
    # Internal consistency check: the offsets computed above rely on it.
    assert(binekeyslen == len(binekeys))

    # Write everything out.
    fh.write(mbytestr)
    fh.write(int_bin_32(len(allekeys)))
    fh.write(int_bin_64(len(binekeys)))
    fh.write(binekeys)
    fh.write(int_bin_32(len(allpkeys)))
    fh.write(int_bin_64(len(binpkeys)))
    fh.write(binpkeys)
    fh.write(int_bin_32(len(pmap)))
    for cbinprops, d1 in binprops:
        fh.write(cbinprops)


def main():
    """Compile the text property map named by argv[1] into argv[2]."""

    if len(sys.argv) != 3:
        error("usage: %s INPUT_FILE OUTPUT_FILE" % cmdname)

    # Optional accelerator; silently skipped where it is not installed.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    ifile = sys.argv[1]
    ofile = sys.argv[2]

    # Parse first, so that a malformed input does not clobber the output.
    pmap = read_pmap(ifile)
    with open(ofile, "wb") as ofh:
        write_map_bin_01(ofh, pmap)


if __name__ == '__main__':
    main()