File indexing completed on 2024-12-01 12:35:03

0001 # -*- coding: UTF-8 -*-
0002 
0003 # Script that compiles Transcript property maps from text to binary format.
0004 # Binary format greatly speeds up loading of property maps at runtime.
0005 # http://techbase.kde.org/Localization/Concepts/Transcript
0006 #
0007 # Usage:
0008 #   ts-pmap-compile.py file.pmap file.pmapc
0009 #
0010 # Works with Python >= 2.6 and >= 3.0.
0011 
0012 import locale
0013 import os
0014 import re
0015 import struct
0016 import sys
0017 
0018 
# Base name of the running script; prefixes error and usage messages.
cmdname = os.path.basename(sys.argv[0])
# Preferred encoding of the current locale; used to encode error output.
lenc = locale.getpreferredencoding()
0021 
def error (msg, code=1):
    """Report a fatal error on standard error and terminate the script.

    The message is written as "<cmdname>: error: <msg>" followed by
    a newline, after which the process exits through sys.exit().

    msg  -- text describing the problem
    code -- process exit status (default 1)
    """
    text = "%s: error: %s\n" % (cmdname, msg)
    # On Python 3 the text is a native str and sys.stderr is a text stream,
    # so it must be written as is (writing bytes raises TypeError).
    # Only a Python 2 unicode object needs explicit encoding.
    if not isinstance(text, str):
        text = text.encode(lenc)
    sys.stderr.write(text)
    sys.exit(code)
0025 
0026 
def count_lines (text, tolen):
    """Return the 1-based line number of the position tolen within text.

    The number is one more than the count of newline characters found
    in the first tolen characters of text.
    """
    head = text[:tolen]
    return 1 + head.count("\n")
0029 
0030 
def norm_keystr (text):
    """Normalize a key string: drop whitespace and '&', then lowercase.

    Must do the same as normKeystr() in kdelibs/kdecore/ktranscript.cpp.
    """
    # Raw string, so that \s reaches the regex engine untouched instead of
    # being an invalid string escape (a warning on modern Python).
    return re.sub(r"[\s&]", "", text).lower()
0034 
0035 
def trim_smart (text):
    """Strip leading and trailing blank space that spans a line break.

    Leading whitespace is removed up to and including its last newline,
    and trailing whitespace is removed starting from the first newline
    of the final whitespace run. Whitespace that does not reach a newline
    on the given side is left alone.
    """
    # Raw string, so that \s reaches the regex engine untouched instead of
    # being an invalid string escape (a warning on modern Python).
    return re.sub(r"^\s*\n|\n\s*$", "", text)
0038 
0039 
def read_pmap (fname):
    """Parse a text property map file into a list of entries.

    The file is read as UTF-8. Between entries, whitespace is skipped and
    a '#' starts a comment that runs to the end of the line. An entry
    starts with two characters, the key separator and the property
    separator (neither may be a letter), followed by a sequence of fields:

    - text ended by the property separator is an entry key;
    - text ended by the key separator is a property key, and the text
      after it up to the next property separator is the property value;
    - an empty field ended by the property separator ends the entry.

    Keys are normalized with norm_keystr(), values with trim_smart().

    fname -- path of the text property map

    Returns a list of (ekeys, props) tuples, one per entry, where ekeys
    is the list of entry keys and props the list of (pkey, pval) tuples.
    An empty file gives an empty list. Malformed input is reported
    through error(), which terminates the script.
    """

    # Adapted directly from C++ code.

    # Read raw bytes and decode explicitly, independent of the locale.
    # The context manager closes the file even if decoding fails.
    with open(fname, "rb") as fh:
        s = fh.read().decode("utf8")

    s_nextEntry, s_nextKey, s_nextValue = 1, 2, 3

    pmap = []

    # Raised to leave the parsing loop once the end of input is reached.
    class END_PROP_PARSE (Exception): pass
    try:
        slen = len(s)
        state = s_nextEntry
        ekeys = [] # holds keys for current entry
        props = [] # holds properties for current entry
        pkey = "" # holds current property key
        i = 0
        while True:
            i_checkpoint = i

            if state == s_nextEntry:
                # Skip whitespace before the entry. The bound is checked
                # before indexing, so that an empty file is handled too.
                while i < slen and s[i].isspace():
                    i += 1
                if i >= slen: raise END_PROP_PARSE

                if i + 1 >= slen:
                    error("unexpected end of file %s" % fname)

                if s[i] != '#':
                    # Separator characters for this entry.
                    key_sep = s[i]
                    prop_sep = s[i + 1]
                    if key_sep.isalpha() or prop_sep.isalpha():
                        error("separator characters must not be letters "
                              "at %s:%d" % (fname, count_lines(s, i)))

                    # Reset all data for current entry.
                    ekeys = []
                    props = []
                    pkey = ""

                    i += 2
                    state = s_nextKey

                else:
                    # This is a comment, skip to EOL, don't change state.
                    while s[i] != '\n':
                        i += 1
                        if i >= slen: raise END_PROP_PARSE

            elif state == s_nextKey:
                ip = i
                # Proceed up to next key or property separator.
                while s[i] != key_sep and s[i] != prop_sep:
                    i += 1
                    if i >= slen: raise END_PROP_PARSE

                if s[i] == key_sep:
                    # This is a property key,
                    # record for when the value gets parsed.
                    pkey = norm_keystr(s[ip:i])

                    i += 1
                    state = s_nextValue

                else: # s[i] == prop_sep
                    # This is an entry key, or end of entry.
                    ekey = norm_keystr(s[ip:i])
                    if ekey:
                        # An entry key.
                        ekeys.append(ekey)

                        i += 1
                        state = s_nextKey

                    else:
                        # End of entry.
                        if len(ekeys) < 1:
                            error("no entry key for entry ending "
                                  "at %s:%d" % (fname, count_lines(s, i)))

                        # Put collected properties into global store.
                        pmap.append((ekeys, props))

                        i += 1
                        state = s_nextEntry
                        # This check covers no newline at end of file.
                        if i >= slen: raise END_PROP_PARSE

            elif state == s_nextValue:
                ip = i
                # Proceed up to next property separator.
                while s[i] != prop_sep:
                    i += 1
                    if i >= slen: raise END_PROP_PARSE
                    if s[i] == key_sep:
                        # The offending character is the key separator.
                        error("key separator inside property value "
                              "at %s:%d" % (fname, count_lines(s, i)))

                # Extract the property value and store the property.
                pval = trim_smart(s[ip:i])
                props.append((pkey, pval))

                i += 1
                state = s_nextKey

            else:
                error("internal error 10 "
                      "at %s:%d" % (fname, count_lines(s, i)))

            # To avoid infinite looping and stepping out.
            if i == i_checkpoint or i >= slen:
                error("internal error 20 "
                      "at %s:%d" % (fname, count_lines(s, i)))

    except END_PROP_PARSE:
        # Input may only end between entries, not in the middle of one.
        if state != s_nextEntry:
            error("unexpected end of file in %s" % fname)

    return pmap
0163 
0164 
def int_bin_32 (val):
    """Return val packed as a signed 32-bit big-endian byte sequence."""
    # The ">i" format always yields exactly four bytes.
    packed = struct.pack(">i", val)
    return packed
0168 
0169 
def int_bin_64 (val):
    """Return val packed as a signed 64-bit big-endian byte sequence."""
    # The ">q" format always yields exactly eight bytes.
    packed = struct.pack(">q", val)
    return packed
0173 
0174 
def str_bin_32 (val):
    """Return val as UTF-8 bytes prefixed by their length.

    The prefix is the number of encoded bytes (not characters),
    packed as a 32-bit big-endian integer.
    """
    payload = val.encode("utf8")
    length_prefix = int_bin_32(len(payload))
    return length_prefix + payload
0180 
0181 
def catb (seq):
    """Join an iterable of byte sequences into a single byte sequence."""
    empty = b""
    return empty.join(seq)
0185 
0186 
def write_map_bin_00 (fh, pmap):
    """Write the property map to fh in binary format 00.

    Layout: the magic bytes "TSPMAP00" and the number of entries, then
    for each entry its key count, the keys, its property count and the
    (key, value) pairs of the properties. Counts are 32-bit big-endian
    integers and strings are length-prefixed UTF-8.

    fh   -- writable binary file object
    pmap -- list of (ekeys, props) tuples as returned by read_pmap()
    """
    emit = fh.write

    # Magic bytes identifying the format.
    emit("TSPMAP00".encode("ascii"))

    # Number of entries, then the entries one after another.
    emit(int_bin_32(len(pmap)))
    for entry in pmap:
        entry_keys, entry_props = entry

        # Count of phrase keys followed by the keys.
        emit(int_bin_32(len(entry_keys)))
        for key in entry_keys:
            emit(str_bin_32(key))

        # Count of properties followed by key/value pairs.
        emit(int_bin_32(len(entry_props)))
        for prop_key, prop_val in entry_props:
            emit(str_bin_32(prop_key))
            emit(str_bin_32(prop_val))
0207 
0208 
def write_map_bin_01 (fh, pmap):
    """Write the property map to fh in binary format 01.

    Layout, in order of writing:

    - magic bytes "TSPMAP01";
    - number of entry keys (32-bit) and byte length of the entry key
      section (64-bit), then every entry key, each followed by the 64-bit
      file offset of the property blob of its entry;
    - number of unique property keys (32-bit) and byte length of the
      property key section (64-bit), then the sorted property keys;
    - number of entries (32-bit), then one property blob per entry,
      made of the property count, the byte length of the packed
      properties (both 32-bit) and the packed key/value strings.

    fh   -- writable binary file object
    pmap -- list of (ekeys, props) tuples as returned by read_pmap()
    """
    int32_size = len(int_bin_32(0))
    int64_size = len(int_bin_64(0))
    magic = "TSPMAP01".encode("ascii")

    # Size of the entry key section: each key carries a 64-bit offset.
    ekeys_size = 0
    for entry_keys, _props in pmap:
        for key in entry_keys:
            ekeys_size += len(str_bin_32(key)) + int64_size

    # Section of all unique property keys, in sorted order.
    unique_pkeys = set()
    for _ekeys, entry_props in pmap:
        for prop in entry_props:
            unique_pkeys.add(prop[0])
    pkeys_blob = catb([str_bin_32(key) for key in sorted(unique_pkeys)])

    # Everything that precedes the entry count: the magic bytes and
    # both key sections, each with its count and byte length fields.
    header_size = (len(magic)
                   + int32_size + int64_size + ekeys_size
                   + int32_size + int64_size + len(pkeys_blob))

    # Property blob of each entry, paired with its absolute file offset.
    # The first blob comes right after the entry count.
    blobs = []
    next_offset = header_size + int32_size
    for _ekeys, entry_props in pmap:
        packed = catb([str_bin_32(field)
                       for prop in entry_props for field in prop])
        blob = catb([int_bin_32(len(entry_props)),
                     int_bin_32(len(packed)),
                     packed])
        blobs.append((blob, next_offset))
        next_offset += len(blob)

    # Entry key section: every key of an entry points to the same blob.
    ekey_count = 0
    ekey_parts = []
    for (entry_keys, _props), (_blob, offset) in zip(pmap, blobs):
        packed_offset = int_bin_64(offset)
        for key in entry_keys:
            ekey_parts.append(str_bin_32(key) + packed_offset)
        ekey_count += len(entry_keys)
    ekeys_blob = catb(ekey_parts)
    # The offsets above are only valid if the size computed up front
    # matches the section actually built.
    assert ekeys_size == len(ekeys_blob)

    # Write everything out.
    fh.write(magic)
    fh.write(int_bin_32(ekey_count))
    fh.write(int_bin_64(len(ekeys_blob)))
    fh.write(ekeys_blob)
    fh.write(int_bin_32(len(unique_pkeys)))
    fh.write(int_bin_64(len(pkeys_blob)))
    fh.write(pkeys_blob)
    fh.write(int_bin_32(len(pmap)))
    for blob, _offset in blobs:
        fh.write(blob)
0274 
0275 
def main ():
    """Compile the text property map named on the command line.

    Expects exactly two command line arguments, the input text file and
    the output binary file. The input is parsed with read_pmap() and
    written out with write_map_bin_01(). A wrong number of arguments is
    reported through error(), which terminates the script.
    """

    if len(sys.argv) != 3:
        error("usage: %s INPUT_FILE OUTPUT_FILE" % cmdname)

    # Optional speed-up: enable psyco when it is installed,
    # and carry on without it otherwise.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    ifile = sys.argv[1]
    ofile = sys.argv[2]

    # Parse first, so that a bad input file does not clobber the output.
    pmap = read_pmap(ifile)
    # The context manager closes the output file even if writing fails.
    with open(ofile, "wb") as ofh:
        write_map_bin_01(ofh, pmap)
0294 
0295 
# Run the compiler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()