Warning, /education/step/step/data/extractxml is written in an unsupported language. File is not indexed.

0001 #!/usr/bin/env python3
0002 #
0003 # This file is part of Step.
0004 # SPDX-FileCopyrightText: 2009 Vladimir Kuznetsov <ks.vladimir@gmail.com>
0005 #
0006 # SPDX-License-Identifier: GPL-2.0-or-later
0007 
0008 import xml.parsers.expat
0009 import xml.sax.saxutils
0010 from io import BytesIO
0011 import optparse
0012 import tempfile
0013 import gettext
0014 import locale
0015 import copy
0016 import sys
0017 import re
0018 import os
0019 
0020 # current python gettext module does not
0021 # support contexts, so we implement them ourself
class GNUTranslations(gettext.GNUTranslations):
    """GNU message catalog with an explicit message-context lookup.

    Adds `upgettext`, which looks a message up under a given context using
    the key layout that gettext (>= 0.15) stores in .mo files.
    """

    # In a .mo file a contextualised key is msgctxt + "\x04" + msgid.
    CONTEXT_ENCODING = "%s\x04%s"

    def upgettext(self, context, message):
        """Return the translation of `message` within `context`.

        The catalog is consulted first, then the fallback translation (if one
        is set); when neither knows the message, `str(message)` is returned.
        """
        key = self.CONTEXT_ENCODING % (context, message)
        try:
            return self._catalog[key]
        except KeyError:
            pass

        fallback = self._fallback
        if fallback:
            return fallback.upgettext(context, message)
        return str(message)
0036 
class XmlFileTranslator(object):
    """Stream an XML file and extract or translate the text of selected tags.

    The text found inside tags selected by `opt.tag` / `opt.tag_regex` is
    collected as an i18n string.  In extract mode (`extract`) each string is
    written as an `i18n(...)` / `i18nc(...)` call preceded by location
    comments; in translate mode (`translate`) the document is copied to the
    output with each collected string replaced by its translation.

    `opt` is an options object providing the attributes read below:
    `tag`, `tag_regex`, `recursive`, `strip`, `unquote`, `parse_unquoted`,
    `context`, `ectx`, `cstart` and (when `parse_unquoted` is set) `extract`.
    """

    def __init__(self, opt):
        self.opt = opt
        # Compile the tag patterns once; they are tested for every element.
        self.tag_regex = [re.compile(r) for r in self.opt.tag_regex]

    def init_parser(self):
        """Create a fresh expat parser wired to this object's handlers."""
        self.parser = xml.parsers.expat.ParserCreate()
        # Attributes arrive as a flat [name, value, name, value, ...] list.
        self.parser.ordered_attributes = 1
        # No character-data handler is installed, so text (and everything
        # else without a dedicated handler) reaches default_handler.
        self.parser.DefaultHandler = self.default_handler
        self.parser.StartElementHandler = self.start_element_handler
        self.parser.EndElementHandler = self.end_element_handler

    def parse(self, infile):
        """Parse the binary file object `infile`.

        Raises xml.parsers.expat.ExpatError on malformed input.
        """
        self.parser.ParseFile(infile)

    def _reset_state(self, infile_name, outfile, translation,
                i18n_stack_base, i18n_line_base):
        """Initialise the per-document state shared by translate/extract."""
        self.i18n_file = infile_name
        self.outfile = outfile
        self.translation = translation

        self.i18n_line_base = i18n_line_base
        self.i18n_stack_base = [] if i18n_stack_base is None else i18n_stack_base
        self.i18n_stack = []      # currently open i18n-enabled elements
        self.i18n_save = False    # True while text is being collected
        self.i18n_string = ''     # text collected so far

    def translate(self, infile_name, infile, outfile,
                translation, i18n_stack_base=None, i18n_line_base=0):
        """Copy `infile` to `outfile`, translating the i18n strings.

        `translation` must provide `gettext(message)` and
        `upgettext(context, message)`.  `i18n_line_base` is added to the
        parser's line numbers (used when parsing an embedded fragment).
        """
        self._reset_state(infile_name, outfile, translation,
                          i18n_stack_base, i18n_line_base)
        self.init_parser()
        self.parse(infile)

    def extract(self, infile_name, infile, outfile,
                i18n_stack_base=None, i18n_line_base=0):
        """Write the i18n strings found in `infile` to `outfile`.

        `i18n_line_base` is added to the parser's line numbers (used when
        parsing an embedded fragment).
        """
        self._reset_state(infile_name, outfile, None,
                          i18n_stack_base, i18n_line_base)
        self.init_parser()
        self.parse(infile)

    def parse_unquoted_substring(self, string):
        """Process an unquoted markup fragment with the nested options.

        The fragment is parsed by a second translator configured with
        `opt.parse_unquoted`; line numbers are offset by the line of the
        enclosing i18n element.
        """
        infile = BytesIO(string.encode('UTF-8'))
        nested = XmlFileTranslator(self.opt.parse_unquoted)
        line_base = self.i18n_stack[-1]['line']
        if self.opt.extract:
            nested.extract(self.i18n_file, infile,
                    self.outfile, self.i18n_stack, line_base)
        else:
            outfile = BytesIO()
            nested.translate(self.i18n_file, infile,
                outfile, self.translation, self.i18n_stack, line_base)
            string = outfile.getvalue().decode()
            if self.opt.unquote:
                string = self.quote_str(string)
            self.outfile.write(self.encode_utf8(string))

    def quote_str(self, s):
        """Escape &, ", > and < as XML entities."""
        # '&' must be escaped first so the other entities are not re-escaped.
        return s.replace('&', '&amp;').replace('"', '&quot;') \
                .replace('>', '&gt;').replace('<', '&lt;')

    def unquote_str(self, s):
        """Inverse of quote_str."""
        # '&amp;' must be handled last so '&amp;lt;' becomes '&lt;', not '<'.
        return s.replace('&lt;', '<').replace('&gt;', '>') \
                .replace('&quot;', '"').replace('&amp;', '&')

    def encode_str(self, s):
        """Return `s` as a double-quoted, escaped string literal.

        Each newline ends the current literal and starts a new one on the
        next line.
        """
        return '"' + s.replace('\\', '\\\\').replace('\"', '\\"') \
                      .replace('\r', '\\r').replace('\n', '\\n"\n"') + '"'

    def encode_utf8(self, s):
        """Return `s` as bytes, encoding str values as UTF-8."""
        if isinstance(s, str):
            return s.encode('UTF-8')
        return s

    def select_context(self, patterns, attr):
        """Return the first of `patterns` that formats with `attr`.

        Each pattern is a %-format string applied to the dict `attr`;
        patterns referring to missing keys are skipped.  Returns None when
        no pattern applies.
        """
        for pattern in patterns:
            try:
                return pattern % attr
            except (KeyError, ValueError):
                pass
        return None

    def write_data(self, data):
        """Collect `data` into the current i18n string or pass it through.

        Outside of an i18n element, data is copied to the output only in
        translate mode; in extract mode it is dropped.
        """
        if self.i18n_save:
            self.i18n_string += data
        elif self.translation is not None:
            self.outfile.write(self.encode_utf8(data))

    def write_i18n(self):
        """Emit the collected i18n string (translated or as an i18n() call)."""
        string = self.i18n_string
        if self.opt.unquote:
            string = self.unquote_str(string)

        if self.opt.unquote and self.opt.parse_unquoted \
                and string.lstrip().startswith('<'):
            self.parse_unquoted_substring(string)
            return

        # Work on the (possibly unquoted) string from here on, so that
        # --unquote affects the extracted msgid and quote_str() below
        # restores the original escaping instead of escaping it twice.
        if self.opt.strip:
            string0 = string.lstrip()
            begin_string = string[:len(string) - len(string0)]
            string = string0.rstrip()
            end_string = string0[len(string):]
        else:
            begin_string = end_string = ''

        if not string:
            return

        # Values available to the --context / --ectx patterns: the file,
        # then tag/line/attributes of the innermost i18n element and, under
        # a '../' prefix, of its enclosing i18n element.
        info = {'file': self.i18n_file,
                'filename': os.path.basename(self.i18n_file)}
        for n in range(2):
            try:
                d = self.i18n_stack[-1-n]
            except IndexError:
                break
            p = '../'*n
            info[p+'tag'] = d['name']
            info[p+'line'] = d['line']
            for aname, avalue in d['attr'].items():
                info[p+'attr/'+aname] = avalue

        ectx = self.select_context(self.opt.ectx, info)
        context = self.select_context(self.opt.context, info)

        if self.translation is not None:
            if context:
                string = self.translation.upgettext(context, string)
            else:
                # Python 3 translation objects have no ugettext();
                # gettext() already returns str.
                string = self.translation.gettext(string)

            if self.opt.unquote:
                string = self.quote_str(string)
            self.outfile.write(self.encode_utf8(begin_string + string + end_string))
            return

        self.outfile.write(self.encode_utf8('%s i18n: file: %s:%d\n' % \
                 (self.opt.cstart, self.i18n_file, info['line'])))

        if ectx:
            self.outfile.write(self.encode_utf8('%s i18n: ectx: %s\n' % \
                 (self.opt.cstart, ectx)))

        if context:
            self.outfile.write(self.encode_utf8('i18nc(%s, %s)\n' % \
                 (self.encode_str(context), self.encode_str(string))))
        else:
            self.outfile.write(self.encode_utf8('i18n(%s)\n' % \
                (self.encode_str(string),)))

    def default_handler(self, data):
        """expat default handler: forward raw document text."""
        self.write_data(data)

    def start_element_handler(self, name, attr):
        """expat start-tag handler: re-serialise the tag and track i18n tags."""
        data = '<' + name
        attr_dict = {}
        for n in range(0, len(attr), 2):
            attr_dict[attr[n]] = attr[n+1]
            data += ' %s=%s' % (attr[n], xml.sax.saxutils.quoteattr(attr[n+1]))
        data += '>'

        match = name in self.opt.tag or \
                any(regex.search(name) for regex in self.tag_regex)

        # In recursive mode a nested i18n tag ends the parent's current
        # string, so parent and child text are emitted separately.
        if self.i18n_stack and self.opt.recursive and match:
            self.write_i18n()
            self.i18n_string = ''
            self.i18n_save = False

        self.write_data(data)

        if match:
            self.i18n_stack.append(dict(name=name, attr=attr_dict,
                    line=self.i18n_line_base+self.parser.CurrentLineNumber))
            self.i18n_save = True

    def end_element_handler(self, name):
        """expat end-tag handler: flush the i18n string when its tag closes."""
        if self.i18n_stack and self.i18n_stack[-1]['name'] == name:
            # Without --recursive only the outermost i18n tag is emitted.
            if self.opt.recursive or len(self.i18n_stack) == 1:
                self.write_i18n()
                self.i18n_string = ''
                self.i18n_save = False
            self.i18n_stack.pop()

        self.write_data('</%s>' % (name,))

        if self.i18n_stack:
            # Still inside an outer i18n element: resume collecting text and
            # record where the continued string starts.
            self.i18n_stack[-1]['line'] = self.i18n_line_base + \
                                    self.parser.CurrentLineNumber
            self.i18n_save = True
0243 
def safe_remove(fname):
    """Delete the file `fname`, silently ignoring any OS-level failure."""
    try:
        os.remove(fname)
    except OSError:
        # Best effort only (IOError is the same class as OSError on Python 3).
        return
0249 
def open_mo_file(opt, mo_file_name, remove=False):
    """Load the compiled catalog `mo_file_name` and return its translations.

    `opt` is accepted for interface compatibility and is not used.
    When `remove` is true the file is deleted afterwards, whether or not
    loading succeeded.  If the file cannot be opened or parsed, a message is
    written to stderr and the process exits with status 1.
    """
    try:
        try:
            mo_file = open(mo_file_name, 'rb')
        except OSError as e:
            # Nothing was opened, so there is no file object to close here.
            sys.stderr.write('Cannot open .mo file: %s\n' % (str(e),))
            sys.exit(1)

        # The file is closed before the `finally` clause removes it.
        with mo_file:
            try:
                return GNUTranslations(mo_file)
            except OSError as e:
                sys.stderr.write('Cannot parse .mo file: %s\n' % (str(e),))
                sys.exit(1)
    finally:
        if remove:
            safe_remove(mo_file_name)
0274 
def compile_po_file(opt, po_file_name):
    """Compile `po_file_name` with msgfmt and return the loaded translations.

    The catalog is compiled into a temporary .mo file, which is removed once
    it has been loaded (or when compilation fails).  If msgfmt cannot be run
    or reports an error, a message is written to stderr and the process
    exits with status 1.
    """
    import subprocess

    (mo_file_id, mo_file_name) = tempfile.mkstemp(suffix='.mo')
    os.close(mo_file_id)

    # Pass an argument list instead of a shell string so that file names
    # containing quotes or other shell metacharacters are handled correctly.
    try:
        failed = subprocess.call(
            ['msgfmt', po_file_name, '-o', mo_file_name]) != 0
    except OSError:
        # msgfmt is missing or could not be executed.
        failed = True

    if failed:
        safe_remove(mo_file_name)
        sys.stderr.write('Error running msgfmt\n')
        sys.exit(1)

    return open_mo_file(opt, mo_file_name, remove=True)
0285 
def decode_options(options, str_options):
    """Replace each list-valued option named in `str_options` by a copy.

    On Python 3 option values are already `str`, so no decoding is needed:
    string (and any other non-list) values are left untouched and list
    values are replaced by a shallow copy.  The locale is deliberately not
    queried: the result was unused, and `locale.getdefaultlocale()` is
    deprecated and can raise ValueError for an unrecognised locale.
    """
    for name in str_options:
        value = getattr(options, name)
        if isinstance(value, list):
            setattr(options, name, list(value))
0295 
def main():
    """Command-line entry point: extract or translate the given XML files.

    With --extract the i18n strings of all input files are written to one
    output (a file, stdout, or a temporary file that is then passed to
    xgettext).  With --translate each input file is written, translated, to
    a file of the same base name inside --output-dir.  Errors are reported
    on stderr and terminate the process with a non-zero status.
    """
    format_options = [
        optparse.make_option('-n', '--tag', action='append', default=[],
                help='Extract TAG constants as i18n string. ' + \
                     'Repeat this option to specify multiple tags'),
        optparse.make_option('-x', '--tag-regex', action='append', default=[],
                help='Extract contents of all tags matching TAG_REGEX as i18n string. ' + \
                     'Repeat this option to specify multiple regex'),
        optparse.make_option('-r', '--recursive', action='store_true', default=False,
                help='Recursively pass i18n tags. This means that children tags ' + \
                     'will be extracted separately even if parent is also i18n-enabled'),
        optparse.make_option('-s', '--strip', action='store_true', default=False,
                help='Strip leading and trailing whitespaces of i18n strings'),
        optparse.make_option('-q', '--unquote', action='store_true', default=False,
                help='Unquote XML-quoted entities on extraction ' + \
                     'and quote them back when translating'),
        optparse.make_option('--parse-unquoted', default=None, metavar='PARSE_UNQUOTED_OPTIONS',
                help='Parse unquoted strings using PARSE_UNQUOTED_OPTIONS as options. '
                     'This option is useful when XML file contains quoted HTML fragments')
    ]
    context_options = [
        optparse.make_option('--context', action='append', default=[],
                help='Pattern to generate context. ' + \
                     'TODO: pattern syntax. ' + \
                     'If specified multiple times, the first matching pattern will be used'),
        optparse.make_option('--ectx', action='append', default=[],
                help='Pattern to generate ectx. Format is the same as in --context')
    ]

    optparser = optparse.OptionParser(usage='\n\t%prog --extract [options] XML_FILE...\n' + \
                                     '\t%prog --translate [options] XML_FILE...')

    optparser.add_option('-e', '--extract', action='store_true', default=False,
                help='Extract i18n strings from xml files')
    optparser.add_option('-t', '--translate', action='store_true', default=False,
                help='Translate i18n strings in xml files')

    # The option objects are deep-copied because the same definitions are
    # registered again on the nested --parse-unquoted parser below.
    optgroup_format = optparse.OptionGroup(optparser, 'Formatting options')
    for option in copy.deepcopy(format_options):
        optgroup_format.add_option(option)
    for option in copy.deepcopy(context_options):
        optgroup_format.add_option(option)
    optparser.add_option_group(optgroup_format)

    optgroup_extract = optparse.OptionGroup(optparser, 'Options for extracting messages')
    optgroup_extract.add_option('--cstart', default='//',
                help='A string to used to start the comment')
    optgroup_extract.add_option('--output', help='Output file for extracted messages')
    optgroup_extract.add_option('--xgettext', action='store_true', help='Execute xgettext after extracting messages')
    optgroup_extract.add_option('--xgettext-args',
                default='-ki18n -ki18nc:1c,2 -ci18n --no-location --from-code=UTF-8',
                help='Arguments for xgettext (overrides the defaults)')
    optgroup_extract.add_option('--xgettext-extra-args', default='',
                help='Additional arguments for xgettext (appends to the defaults)')
    optparser.add_option_group(optgroup_extract)

    optgroup_translate = optparse.OptionGroup(optparser, 'Options for translating messages')
    optgroup_translate.add_option('--po-file', help='A file with translations')
    optgroup_translate.add_option('--mo-file', help='A file with translations')
    optgroup_translate.add_option('--output-dir', default='./i18n',
                help='A directory to output translated files')
    optparser.add_option_group(optgroup_translate)

    opt, args = optparser.parse_args()
    decode_options(opt, ('tag', 'tag_regex', 'context', 'ectx'))

    if not args:
        optparser.error('no xml files was specified')

    if opt.extract and opt.translate:
        optparser.error('options --extract and --translate are mutually exclusive')

    if not opt.extract and not opt.translate:
        optparser.error('please specify either --extract or --translate option')

    if opt.parse_unquoted is not None:
        # Parse the nested option string with its own parser, then build the
        # nested options as a copy of the main ones with the formatting and
        # context options overridden.
        optparser1 = optparse.OptionParser(usage='%prog --parse-unquoted="[options]"')
        options = copy.deepcopy(format_options+context_options)
        for option in options:
            optparser1.add_option(option)
        opt1, args1 = optparser1.parse_args(opt.parse_unquoted.split(' '))
        decode_options(opt1, ('tag', 'tag_regex', 'context', 'ectx'))
        if args1:
            optparser1.error('unexpected argument')
        opt.parse_unquoted = copy.deepcopy(opt)
        for option in options:
            setattr(opt.parse_unquoted, option.dest,
                    getattr(opt1, option.dest))
        # Unquoted fragments are parsed one level deep only.
        opt.parse_unquoted.parse_unquoted = None

    tmp_fname = None      # temporary xgettext input, removed before returning
    outfile = None        # shared output in extract mode; unused when translating
    gnutranslation = None

    if opt.extract:
        if opt.xgettext:
            (tmp_id, tmp_fname) = tempfile.mkstemp(suffix='.cc')
            os.close(tmp_id)
            outfile = open(tmp_fname, 'wb')
        elif opt.output:
            try:
                outfile = open(opt.output, 'wb')
            except IOError as e:
                optparser.error('can not open output file: ' + str(e))
        else:
            outfile = sys.stdout.buffer
    else:
        if not opt.po_file and not opt.mo_file:
            optparser.error('please specify either --po-file or --mo-file option for translation')

        if opt.po_file:
            gnutranslation = compile_po_file(opt, opt.po_file)
        else:
            gnutranslation = open_mo_file(opt, opt.mo_file)

        if not os.path.isdir(opt.output_dir):
            try:
                os.mkdir(opt.output_dir)
            except IOError as e:
                sys.stderr.write('Cannot create output directory: %s\n' % (str(e),))
                sys.exit(1)

    try:
        translator = XmlFileTranslator(opt)
        for fname in args:
            try:
                infile = open(fname, 'rb')
            except IOError as e:
                sys.stderr.write('can not open input file: %s\n' % (str(e),))
                sys.exit(1)

            # `with` closes the files even when sys.exit() is raised below.
            with infile:
                if opt.extract:
                    try:
                        translator.extract(fname, infile, outfile)
                    except xml.parsers.expat.ExpatError as e:
                        sys.stderr.write('cannot parse file %s: %s\n' % (fname, str(e)))
                        sys.exit(1)
                else:
                    outfile_name = os.path.join(opt.output_dir, os.path.basename(fname))
                    try:
                        translated = open(outfile_name, 'wb')
                    except IOError as e:
                        sys.stderr.write('cannot open output file: %s\n' % (str(e),))
                        sys.exit(1)

                    # Every translated file is closed, not only the last one.
                    with translated:
                        try:
                            translator.translate(fname, infile, translated, gnutranslation)
                        except xml.parsers.expat.ExpatError as e:
                            sys.stderr.write('can not parse file %s: %s\n' % (fname, str(e)))
                            sys.exit(1)

        if outfile:
            # Must be closed (flushed) before xgettext reads the temp file.
            outfile.close()

        if opt.extract and opt.xgettext:
            xgettext_cmd = 'xgettext ' + opt.xgettext_args
            xgettext_cmd += ' ' + opt.xgettext_extra_args
            if opt.output:
                xgettext_cmd += ' --output="' + opt.output + '"'
            else:
                xgettext_cmd += ' --output=-'
            xgettext_cmd += ' "' + tmp_fname + '"'
            ret = os.system(xgettext_cmd)
            if ret != 0:
                sys.stderr.write('error running xgettext: exit code = %d\n' % (ret,))
                sys.exit(1)
    finally:
        if tmp_fname is not None:
            # Do not leave the temporary xgettext input behind.
            outfile.close()
            safe_remove(tmp_fname)
0461 if __name__ == '__main__':
0462     main()
0463