kde-dev-scripts/kf5/resolve_kuit.py

0001 #!/usr/bin/env python
0002 # -*- coding: UTF-8 -*-
0003
0004 # Resolve KUIT markup in i18n strings into plain or rich text,
0005 # or switch them to xi18n calls.
0006 #
0007 # Usage:
0008 #   resolve_kuit.py [OPTIONS] FILE_OR_DIRECTORY...
0009 #
0010 # By default, KUIT markup is resolved into plain or rich text.
0011 # To switch strings containing any KUIT markup to xi18n calls instead,
0012 # use -x option; to switch all strings to xi18n calls, use -X option.
# For non-code files (.ui, .rc, etc.) -x behaves the same as -X,
0014 # since there is no way to specify by string whether it is to be
0015 # passed through i18n or xi18n call at runtime. Instead this is specified
0016 # on the top level (per file, but normally for all such files in a project),
0017 # as described in the "Connecting Calls to Catalogs" section
0018 # of the ki18n Programmer's Guide.
0019 #
0020 # Files are modified in-place. Modified file paths are written to stdout.
# If an argument is a directory, files from it are recursively collected.
0022 # Only files with known extensions are processed (even if file with unknown
0023 # extension is given directly in the command line, it will be ignored).
0024 # The list of known extensions by resolution type can be listed with
0025 # -k option. Option -s RESTYPE:EXT1[,EXT2...] can be used to register
0026 # additional extensions (without leading dot, case ignored) for given
0027 # resolution type. One extension may have several resolution types.
0028 # Files in version control bookkeeping directories are skipped.
0029 #
0030 # In C-like function call files (resolution type 'ccall'),
0031 # i18n strings are detected as arguments in calls with
0032 # *i18n, *i18nc, *i18np, and *i18ncp function names.
0033 # By default detection considers string arguments to be either single or
0034 # double quoted, call arguments can be split into several lines, and
0035 # strings are concatenated when separated only by whitespace.
0036 # Default set of quotes can be replaced by repeating the -q QUOTE option.
0037 #
0038 # In XML-like markup files (resolution type 'xml'),
0039 # i18n strings are detected as element texts, for a certain set of tags.
0040 # i18n contexts are detected as attributes to those elements, for a certain
0041 # set of attributes. These sets can be expanded using -T TAG1[,TAG2...]
0042 # and -A ATTR1[,ATTR2...] options. Case is ignored for both.
0043 # Markup inside the element text is expected to be XML-escaped (&lt;, etc.),
0044 # i.e. the element text is first unescaped before resolution.
0045 #
0046 # In PO files (resolution type 'po'), i18n strings are detected
0047 # according to PO format.
0048 # To process PO files, the Pology library must be ready for use.
0049 # In msgstr fields, KUIT markup transformations for given language
0050 # are looked up in its kdelibs4.po. The pattern path to kdelibs4.po files,
0051 # which contains @lang@ placeholder, is given with -t PATTERN option.
0052 # This can be a local path or a HTTP URL (e.g.
0053 # https://websvn.kde.org/*checkout*/trunk/l10n-kde4/@lang@/messages/kdelibs/kdelibs4.po ).
0054 # Language of processed PO file is determined from its Language: header field.
0055 # If only PO files of one language are processed and they do not reliably
0056 # contain this field, the language can be forced with -l LANG option.
0057 # By default both the original and the translation fields are resolved,
0058 # which is appropriate when the PO file is being resolved before
0059 # it has been merged with new template resulting from the resolved code.
0060 # If an unresolved PO file has been merged with new template first,
0061 # then option -m should be issued to resolve only the translation fields.
0062 # In this case, on fuzzy messages, if previous original fields (which are
0063 # also resolved) and current original fields match after resolution,
0064 # the message is unfuzzied.
0065 #
0066 # For a given i18n string, the decision of whether to resolve KUIT markup
0067 # into plain or Qt rich text is made based on the context marker,
0068 # as described in KUIT documentation at
0069 # https://techbase.kde.org/Development/Tutorials/Localization/i18n_Semantics .
0070 # Target formats can also be manually specified for certain context markers
0071 # by repeating the -f option. E.g. -f @info:progress=rich would override
0072 # the default resolution into plain text for @info:progress i18n strings.
0073 #
0074 # NOTE: [INTERNAL]
# If <html> tags are added on rich text (see top_tag_res variable),
0076 # then resolution must not be run over already resolved files.
0077 # Context markers will remain but format modifiers will be removed from them,
0078 # which may cause further modification in the second run.
0079 #
0080 # NOTE: [INTERNAL]
0081 # If <numid> tags are simply removed (see numid_tag_res variable),
0082 # a warning is issued on each removal to do something manually with
0083 # its associated argument, e.g. wrap it in QString::number().
0084 # It is probably best to look for <numid> tags and handle their arguments
0085 # before running the resolution.
0086
0087 import locale
0088 import optparse
0089 import os
0090 import re
0091 import sys
0092
0093
def main ():
    """Run the resolver from the command line.

    Parses the options, stores them into the module-level option holders
    (_kuit_spec, _ccall_options, _xml_options, _po_options), collects
    candidate files from the arguments (walking directories and dropping
    anything below a VCS bookkeeping directory), and runs every resolver
    that get_resolvers_for_file() returns for each file. Paths of files
    which got modified are reported to standard output.

    Raises StandardError on invalid command line specifications.
    With -k, known resolution types are listed and the process exits
    with status 1.
    """

    opars = optparse.OptionParser(
        usage="%prog FILE_OR_DIRECTORY...",
        description="Resolve KUIT markup in i18n strings. "
                    "Files are recursively searched for, "
                    "and modified in place. "
                    "C-like i18n calls are looked for in all files, "
                    "except in PO files which are specially treated. "
                    "WARNING: Do not run twice over same files.")
    opars.add_option(
        "-x",
        dest="switch_to_xi18n", action="store_const", default=0, const=1,
        help="Instead of resolving markup, switch i18n calls having "
             "some markup to xi18n calls.")
    opars.add_option(
        "-X",
        dest="switch_to_xi18n", action="store_const", default=0, const=2,
        help="Instead of resolving markup, switch all i18n calls "
             "to xi18n calls.")
    opars.add_option(
        "-f",
        dest="formats", action="append", default=[],
        metavar="MARKER=FORMAT",
        help="Set resolution into given target format for "
             "strings with this context marker. "
             "Target format can be one of: plain, rich. "
             "Option can be repeated.")
    opars.add_option(
        "-q",
        dest="quotes", action="append", default=[],
        metavar="QUOTE",
        help="Set opening and closing quote for string arguments "
             "in '%s' resolution type. "
             "Default is single and double quote. "
             "Option can be repeated." % "ccall")
    opars.add_option(
        "-s",
        dest="add_restype_exts", action="append", default=[],
        metavar="RESTYPE:EXT1[,EXT2...]",
        help="Set additional file name extension for given resolution type. "
             "Option can be repeated.")
    opars.add_option(
        "-T",
        dest="add_xml_texttags", action="store", default=None,
        metavar="TAG1[,TAG2...]",
        help="Set additional tags from which to collect text "
             "in '%s' resolution type." % "xml")
    opars.add_option(
        "-A",
        dest="add_xml_ctxtattrs", action="store", default=None,
        metavar="ATTR1[,ATTR2...]",
        help="Set additional attributes to consider as containing "
             "context in '%s' resolution type." % "xml")
    opars.add_option(
        "-t",
        dest="kdelibs4_path_pattern", action="store", default=None,
        metavar="PATH_PATTERN",
        help="The path pattern to kdelibs4.po files, "
             "which contains @lang@ placeholder. "
             "It can be a local path or HTTP URL. "
             "Needed only when processing PO files.")
    opars.add_option(
        "-l",
        dest="kdelibs4_lang", action="store", default=None,
        metavar="LANG",
        help="The language code of translated text in processed PO files, "
             "if it cannot be determined reliably from PO headers. "
             "When this option is in effect, PO files of exactly "
             "one language of translation must be proceesed.")
    opars.add_option(
        "-w",
        dest="msgfmt_wrap", action="store_true", default=False,
        help="Apply Gettext tools wrapping to PO files after resolving them.")
    opars.add_option(
        "-m",
        dest="post_merge", action="store_true", default=False,
        help="Resolve only translation fields in PO files. "
             "This is to be used when PO file is being resolved "
             "after it has been merged with template resulting "
             "from resolved code.")
    opars.add_option(
        "-I",
        dest="interface_wrap", action="store", default=None,
        metavar="HEAD_SEP",
        help="[undocumented]",
    )
    opars.add_option(
        "-k",
        dest="list_restypes", action="store_true", default=False,
        help="List known resolution types and associated file extensions. "
             "It will include additions by '%s' option." % "-s",
    )

    options, args = opars.parse_args()

    # Set additional resolution types.
    for rtextspec in options.add_restype_exts:
        lst = rtextspec.split(":", 1)
        if len(lst) != 2:
            raise StandardError(
                "Resolution specification '%s' given in command line "
                "is not valid." % rtextspec)
        rt, extspec = lst
        if rt not in _map_restype_ext:
            raise StandardError(
                "Unknown resolution type '%s' in resolution specification '%s' "
                "given in command line." % (rt, rtextspec))
        exts = [e.lower() for e in extspec.split(",")]
        _map_restype_ext[rt][0].update(exts)
    if options.list_restypes:
        for rt, (exts, rf, ons) in _map_restype_ext.items():
            report("%s: %s" % (rt, " ".join(sorted(exts))))
        # sys.exit() rather than the site-provided exit() helper, which
        # is meant for interactive use and is missing under "python -S".
        # The exit status is kept as it was.
        sys.exit(1)

    # Update target format by context marker specification.
    for fmtspec in options.formats:
        try:
            cmk, fmt = fmtspec.split("=", 1)
        except ValueError:
            # No "=" in the specification, so there is nothing to unpack.
            raise StandardError(
                "Invalid target format specification '%s' "
                "given in command line." % fmtspec)
        if fmt not in _known_formats.values():
            raise StandardError(
                "Unknown target format '%s' given in command line." % fmt)
        _cmarker_to_format[cmk] = fmt

    # Set KUIT resolving options.
    if options.kdelibs4_path_pattern:
        if "@lang@" not in options.kdelibs4_path_pattern:
            raise StandardError(
                "Path pattern for kdelibs4.po files given in command line "
                "does not contain %s placeholder." % "@lang@")
    _kuit_spec.kdelibs4_path_pattern = options.kdelibs4_path_pattern
    _kuit_spec.force_lang = options.kdelibs4_lang
    _kuit_spec.interface_wrap = options.interface_wrap

    # Set C-call resolving options.
    _ccall_options.switch_to_xi18n = options.switch_to_xi18n
    if options.quotes:
        squotes = list(reversed(sorted(options.quotes))) # longest first
        _ccall_options.quotes[:] = squotes

    # Set XML resolving options.
    _xml_options.switch_to_xi18n = options.switch_to_xi18n
    if options.add_xml_texttags:
        tags = options.add_xml_texttags.split(",")
        _xml_options.text_tags.update(tags)
    if options.add_xml_ctxtattrs:
        attrs = options.add_xml_ctxtattrs.split(",")
        _xml_options.ctxt_attrs[:0] = attrs # higher priority

    # Set PO resolving options.
    _po_options.switch_to_xi18n = options.switch_to_xi18n
    _po_options.msgfmt_wrap = options.msgfmt_wrap
    _po_options.post_merge = options.post_merge

    # Collect all files.
    file_paths = []
    for path in args:
        if os.path.isdir(path):
            for root, dirns, filens in os.walk(path):
                for filen in filens:
                    file_paths.append(os.path.join(root, filen))
        elif os.path.isfile(path):
            file_paths.append(path)
        else:
            raise StandardError(
                "Command line argument '%s' is neither a file "
                "nor a directory." % path)

    # Filter out VCS bookkeeping.
    vcs_dirns = set(["CVS", ".svn", ".git"])
    file_paths = [fp for fp in file_paths
                  if not vcs_dirns.intersection(fp.split(os.path.sep))]

    # Resolve files.
    file_paths.sort()
    test_encs = ["utf8", "iso8859-1", "iso8859-15", "cp1252"]
    for fp in file_paths:
        rspecs = get_resolvers_for_file(fp)
        modified = False
        for restype, resolvef, onstring in rspecs:
            if onstring:
                # The resolver works on decoded file content.
                # Read raw bytes and release the handle right away.
                with open(fp, "rb") as fh:
                    fstr = fh.read()
                # Try test encodings in turn; badpos remembers where
                # the first (UTF-8) attempt failed, for the warning.
                badpos = -1
                for fenc in test_encs:
                    try:
                        fstr = fstr.decode(fenc)
                    except UnicodeDecodeError as e:
                        if badpos < 0:
                            badpos = e.start
                    else:
                        badpos = -1
                        break
                if badpos < 0:
                    res_fstr = resolvef(fstr, fp)
                    if res_fstr != fstr:
                        # Write to a temporary file and rename it over
                        # the original; output is always UTF-8.
                        tmpfp = fp + "~tmp"
                        with open(tmpfp, "wb") as fh:
                            fh.write(res_fstr.encode("utf8"))
                        os.rename(tmpfp, fp)
                        modified = True
                else:
                    warning("%s: Cannot decode file using any of "
                            "test encodings (UTF-8 try produces problem "
                            "in line %d, column %d), skipping it."
                            % (fp, lno_to(fstr, badpos), cno_to(fstr, badpos)))
            else:
                # The resolver works on the file path itself and
                # tells whether it modified the file.
                if resolvef(fp):
                    modified = True
        if modified:
            report(fp)
0313
0314
def report (msg):
    """Write a message, followed by a newline, to standard output.

    Unicode text is encoded into the locale's preferred encoding, with
    unencodable characters replaced instead of raising an error, so that
    a message cannot abort the run just because of the terminal locale.
    Text which is already a byte string (e.g. a message built around
    a file path) is written out as it is, since encoding it would first
    decode it as ASCII and fail on any non-ASCII byte.
    """

    text = "%s\n" % msg
    if not isinstance(text, bytes):
        lenc = locale.getpreferredencoding()
        text = text.encode(lenc, "replace")
    sys.stdout.write(text)
0320
0321
def warning (msg):
    """Write a message, prefixed with "[warning] ", to standard error.

    Unicode text is encoded into the locale's preferred encoding, with
    unencodable characters replaced instead of raising an error, so that
    a warning cannot abort the run just because of the terminal locale.
    Text which is already a byte string (e.g. a message built around
    a file path) is written out as it is, since encoding it would first
    decode it as ASCII and fail on any non-ASCII byte.
    """

    text = "[warning] %s\n" % msg
    if not isinstance(text, bytes):
        lenc = locale.getpreferredencoding()
        text = text.encode(lenc, "replace")
    sys.stderr.write(text)
0327
0328
class Data:
    """Empty attribute holder; users attach whatever fields they need."""
    pass


# Global KUIT resolution settings. The first three are overwritten
# by main() from the -t, -l and -I options; langdata caches
# per-language data built by get_language_data().
_kuit_spec = Data()
_kuit_spec.langdata = {}
_kuit_spec.interface_wrap = None
_kuit_spec.force_lang = None
_kuit_spec.kdelibs4_path_pattern = None


# Matches the end of an in-place closed tag that has no space before "/>".
_space_in_place_tag_rx = re.compile(r"(<[^>]*\S)(/\s*>)", re.U | re.S)
0338
def get_language_data (lang):
    """Return KUIT resolution data for the given language.

    The data is built on first request and cached in _kuit_spec.langdata.
    For any language other than "en_US", patterns, delimiters and key
    names are looked up in that language's kdelibs4.po, which is fetched
    from the path given by _kuit_spec.kdelibs4_path_pattern (with @lang@
    replaced by the language code); where there is no translation,
    the original text is used.

    The returned object has the attributes: transform, shcdelim,
    keyname, guidelim (dictionaries), and ifacewrap.

    Raises StandardError if kdelibs4.po is needed
    but the path pattern has not been set.
    """

    langdata = _kuit_spec.langdata.get(lang)
    if langdata:
        return langdata

    kl4cat = None
    if lang != "en_US":
        # Fetch kdelibs4.po for this catalog's language.
        if not _kuit_spec.kdelibs4_path_pattern:
            raise StandardError(
                "Path pattern for kdelibs4.po not set (-t option).")
        kl4path = _kuit_spec.kdelibs4_path_pattern.replace("@lang@", lang)
        from urllib import urlopen
        from pology.catalog import Catalog
        kl4fh = urlopen(kl4path)
        try:
            kl4cat = Catalog("kdelibs4.po", readfh=kl4fh)
        finally:
            # The handle is of no use once the catalog has been
            # constructed (NOTE(review): relies on Catalog reading
            # the file eagerly in its constructor -- confirm).
            kl4fh.close()

    def translated (msgctxt, msgid):
        # Translation of the message from kdelibs4.po if there is one,
        # otherwise the original text.
        if kl4cat is not None:
            msgs = kl4cat.select_by_key(msgctxt, msgid)
            if msgs and msgs[0].translated:
                return msgs[0].msgstr[0]
        return msgid

    langdata = Data()

    langdata.transform = {}
    for ktrkey, trspec in _kuit_transforms.items():
        msgctxt, msgid, subsmap, prepend, postpend, textmodf = trspec
        pattern = translated(msgctxt, msgid)
        fmt = ktrkey[2]
        if fmt == "rich":
            # Add space before /> in in-place closed rich-text tags,
            # as Qt may fail to guess format as rich-text otherwise.
            pattern = _space_in_place_tag_rx.sub(r"\1 \2", pattern)
        tr = Data()
        tr.pattern = pattern
        tr.subsmap = subsmap
        tr.prepend = prepend
        tr.postpend = postpend
        tr.textmodf = textmodf
        langdata.transform[ktrkey] = tr

    langdata.shcdelim = {}
    for fmt, (msgctxt, msgid) in _kuit_shortcut_delimiters.items():
        langdata.shcdelim[fmt] = translated(msgctxt, msgid)

    # Key names are stored by their original (untranslated) name.
    langdata.keyname = {}
    for msgctxt, msgid in _kuit_key_names:
        langdata.keyname[msgid] = translated(msgctxt, msgid)

    langdata.guidelim = {}
    for fmt, (msgctxt, msgid) in _kuit_guipath_delimiters.items():
        langdata.guidelim[fmt] = translated(msgctxt, msgid)

    langdata.ifacewrap = None
    if _kuit_spec.interface_wrap:
        langdata.ifacewrap = _kuit_spec.interface_wrap

    _kuit_spec.langdata[lang] = langdata
    return langdata
0416
0417
def lno_to (fstr, p):
    """Return the 1-based number of the line holding position p in fstr."""
    newlines_before = fstr.count("\n", 0, p)
    return newlines_before + 1
0421
0422
def cno_to (fstr, p):
    """Return the 1-based column number of position p within its line."""
    # rfind() gives -1 when no newline precedes p, which makes the
    # difference come out 1-based on the first line as well.
    return p - fstr.rfind("\n", 0, p)
0428
0429
# Options for resolution of C-like call files.
_ccall_options = Data()

# Call specification, as
#   "callname": (ctxt_pos, text_pos, plural_pos)
# where a negative position means the call takes no such argument.
_ccall_options.calls = {
    "i18n": (-1, 0, -1),
    "i18nc": (0, 1, -1),
    "i18np": (-1, 0, 1),
    "i18ncp": (0, 1, 2),
    "ki18n": (-1, 0, -1),
    "ki18nc": (0, 1, -1),
    "ki18np": (-1, 0, 1),
    "ki18ncp": (0, 1, 2),
    "I18N_NOOP": (-1, 0, -1),
    "I18N_NOOP2": (0, 1, -1),
    "I18N_NOOP2_NOSTRIP": (0, 1, -1),
}
# Append to each specification the total number of string arguments,
# i.e. the count of non-negative positions.
_ccall_options.calls = dict(
    (cn, inds + (sum(1 for i in inds if i >= 0),))
    for cn, inds in _ccall_options.calls.items())

# Default string quotes (must be sorted from longest to shortest).
_ccall_options.quotes = sorted(["\"", "'"], reverse=True)

# Comments which may mingle with concatenated string literals:
# those running to end of line, and those with both delimiters.
_ccall_options.midcstr_eolcmnts = set(("//", "#"))
_ccall_options.midcstr_delimcmnts = set([("/*", "*/")])

# Head of a call: the name and the opening parenthesis.
_ccall_head_rx = re.compile(r"([\w\d_]+)\s*\(", re.U | re.S)
# Placeholder for the non-literal chunks of parsed call arguments,
# and its stand-in for debugging output.
_mask_chr = "\x04"
_print_mask_chr = u"¬"
0469
def resolve_ccall (fstr, path):
    """Resolve KUIT markup in i18n calls within C-like source text.

    The text is scanned for call heads (name followed by opening
    parenthesis). For names listed in _ccall_options.calls, as many
    leading string literal arguments as the call specification demands
    are parsed with _parse_cstr() and every non-context argument is
    passed through resolve_kuit(). If resolve_kuit() signals a switch
    to xi18n, the call name is changed accordingly ("i..." to "xi...",
    "k..." to "kx..."). Calls whose leading arguments are not all
    string literals are left as they are.

    fstr is the file content and path the file path, used in messages.
    Returns the resolved file content.
    """

    showparse = False
    if showparse:
        report("%s: >>>>> start >>>>>" % path)

    langdata = get_language_data("en_US")
    toxi18n = _ccall_options.switch_to_xi18n

    # Output is assembled from segments; p1 is the position up to which
    # the input has been consumed.
    segs = []
    p1 = 0
    while True:
        m = _ccall_head_rx.search(fstr, p1)
        if not m:
            segs.append(fstr[p1:])
            break
        p2, p3 = m.span()
        callname = m.group(1)
        callspec = _ccall_options.calls.get(callname)
        if callspec:
            ictxt, itext, iplural, total = callspec
            # p1a advances over the arguments as they are parsed.
            p1a = p3
            argspecs = []
            all_strings = True
            end_call = False
            for k in range(total):
                if showparse:
                    report("%s:%d: iarg=%d  spos=%d"
                           % (path, lno_to(fstr, p1a), k, p1a))
                ret = _parse_cstr(fstr, p1a, (",", ")"), path,
                                  _ccall_options.midcstr_eolcmnts,
                                  _ccall_options.midcstr_delimcmnts)
                if not ret:
                    all_strings = False
                    break
                p2a, msarg, quote, outs = ret
                argspecs.append((msarg, quote, outs))
                p1a = p2a
                if outs[-1].endswith(")"):
                    end_call = True
                    break
            if len(argspecs) == total:
                if showparse:
                    report("%s:%d: call=[%s]%s"
                           % (path, lno_to(fstr, p3), callname,
                              "".join("{%s||%s}" % (_ppmasked(s[0]), s[1])
                                      for s in argspecs)))
                csegs = []
                lno = lno_to(fstr, p3)
                mctxt = argspecs[ictxt][0] if ictxt >= 0 else None
                res_callname = None
                for iarg, (msarg, quote, outs) in enumerate(argspecs):
                    if iarg != ictxt:
                        ret = resolve_kuit(mctxt, msarg, quote,
                                           langdata, path, lno,
                                           toxi18n=toxi18n)
                        res_mctxt, res_msarg, xi18n = ret[:3]
                        if xi18n and not res_callname:
                            if callname.startswith("i"):
                                res_callname = "x" + callname
                            elif callname.startswith("k"):
                                res_callname = "kx" + callname[1:]
                        res_sarg = _unmask(res_msarg, outs)
                        csegs.append(res_sarg)
                    else:
                        # Placeholder, context is filled in below.
                        csegs.append("")
                if not res_callname:
                    res_callname = callname
                if ictxt >= 0:
                    outs_ctxt = argspecs[ictxt][2]
                    res_ctxt = _unmask(res_mctxt, outs_ctxt)
                    csegs[ictxt] = res_ctxt
                if showparse:
                    report("%s:%d: res-segs=%s"
                           % (path, lno_to(fstr, p3),
                              "".join("{%s}" % s for s in csegs)))
                segs.append(fstr[p1:p2])
                segs.append(res_callname)
                segs.append(fstr[p2 + len(callname):p3])
                segs.append("".join(csegs))
                p3 = p1a
            elif all_strings and end_call:
                if showparse:
                    report("%s:%d: bad-call" % (path, lno_to(fstr, p3)))
                warning("%s:%d: Too little string arguments to call "
                        "(expected %d, got %d)."
                        % (path, lno_to(fstr, p3), total, len(argspecs)))
                # Scanning continues after the parsed arguments, so they
                # must be copied to the output too; copying only up to
                # the call head would delete them from the file.
                segs.append(fstr[p1:p1a])
                p3 = p1a
            else:
                if showparse:
                    report("%s:%d: not-literal-call" % (path, lno_to(fstr, p3)))
                segs.append(fstr[p1:p3])
        else:
            segs.append(fstr[p1:p3])
        p1 = p3
    res_fstr = "".join(segs)
    if showparse:
        report("%s: <<<<< end <<<<<" % path)
    return res_fstr
0570
0571
def _ppmasked (s):
    """Return s with every mask character shown as its printable stand-in."""

    printable = s.replace(_mask_chr, _print_mask_chr)
    return printable
0575
0576
def _unmask (ms, outs):
    """Return ms with mask characters replaced by elements of outs.

    The n-th mask character in ms is replaced by the n-th element
    of outs; elements beyond the number of mask characters are unused.
    """

    pieces = ms.split(_mask_chr)
    # Text before the first mask character, then alternately
    # a masked-out chunk and the text following its mask character.
    segs = [pieces[0]]
    for io, piece in enumerate(pieces[1:]):
        segs.append(outs[io])
        segs.append(piece)
    return "".join(segs)
0593
0594
def _parse_cstr (fstr, spos, ends, path=None, eolcmnts=(), delimcmnts=()):
    """Parse a, possibly concatenated, string literal out of the text.

    Parsing starts at position spos in fstr. Quoted literals are
    collected for as long as they are separated only by whitespace,
    to-end-of-line comments (starting with one of eolcmnts) and
    delimited comments (pairs of opening and closing strings in
    delimcmnts). Parsing stops after one of the ends strings, at the
    end of text, or in front of anything else. The quote of the first
    literal, one of _ccall_options.quotes, is the only quote recognized
    for the literals that follow.

    In the resulting masked string the literals appear verbatim, with
    quotes, while each other chunk (whitespace run, comment, end string)
    is replaced by one mask character. The replaced chunks are collected
    in order, so that _unmask() can put them back.

    If path is given, a warning is issued on unterminated literal
    or comment.

    Returns None if no literal was found or if a literal or a delimited
    comment is not terminated; otherwise the tuple of the position
    at which parsing stopped, the masked string, the quote string,
    and the list of masked-out chunks.
    """

    showparse = False

    l = len(fstr)
    p = spos
    if showparse:
        report("parse-cstr-start %d" % p)
    segs = []
    outs = []
    quote = None
    while True:
        # Mask out the whitespace run (possibly empty) at this position.
        pp = p
        while p < l and fstr[p].isspace():
            p += 1
        segs.append(_mask_chr)
        outs.append(fstr[pp:p])
        if p == l:
            break

        # Quoted literal?
        at_quote = False
        if quote is None:
            for q in _ccall_options.quotes:
                if fstr[p:p + len(q)] == q:
                    at_quote = True
                    quote = q
                    lq = len(quote)
                    break
        else:
            if fstr[p:p + lq] == quote:
                at_quote = True
        if at_quote:
            pp = p
            p += lq
            # NOTE(review): find_esc() is defined elsewhere; it is taken
            # here to return the position of the closing quote, or
            # a negative value if there is none -- confirm.
            p = find_esc(fstr, quote, "\\", p)
            if p < 0:
                if path:
                    warning("%s:%d: Unterminated string literal."
                            % (path, lno_to(fstr, pp)))
                return None
            p += lq
            segs.append(fstr[pp:p])
            if showparse:
                report("parse-cstr-quote-end %d" % p)
            continue

        # End string?
        at_end = False
        for end in ends:
            if fstr[p:p + len(end)] == end:
                pp = p
                p += len(end)
                at_end = True
                segs.append(_mask_chr)
                outs.append(fstr[pp:p])
                if showparse:
                    report("parse-cstr-end-end %d" % p)
                break
        if at_end:
            break

        # To-end-of-line comment?
        cmnt_end = False
        for ec in eolcmnts:
            if fstr[p:p + len(ec)] == ec:
                pp = p
                # The comment includes the newline, if there is one.
                pnl = fstr.find("\n", p + len(ec))
                p = pnl + 1 if pnl >= 0 else l
                cmnt_end = True
                segs.append(_mask_chr)
                outs.append(fstr[pp:p])
                if showparse:
                    report("parse-cstr-eol-cmnt-end %d" % p)
                break
        if cmnt_end:
            continue

        # Delimited comment?
        for dc1, dc2 in delimcmnts:
            if fstr[p:p + len(dc1)] == dc1:
                pp = p
                pce = fstr.find(dc2, p + len(dc1))
                if pce < 0:
                    # Warn only when the file is known, the same as
                    # for unterminated string literals.
                    if path:
                        warning("%s:%d: Unterminated comment."
                                % (path, lno_to(fstr, pp)))
                    return None
                p = pce + len(dc2)
                cmnt_end = True
                segs.append(_mask_chr)
                outs.append(fstr[pp:p])
                if showparse:
                    report("parse-cstr-delim-cmnt-end %d" % p)
                break
        if cmnt_end:
            continue

        # Something else, not part of the literal.
        break
    if quote is None:
        return None

    mstr = "".join(segs)
    return p, mstr, quote, outs
0694
0695
# Options for resolution of XML-like markup files.
_xml_options = Data()

# Default tags and attributes to extract from,
# according to extractrc from kdesdk/scripts/.
# Attributes are kept in a list because their order is meant to express
# priority (main() puts command line additions to the front).
_xml_options.text_tags = set((
    "text",
    "title",
    "string",
    "whatsthis",
    "tooltip",
    "label",
))
_xml_options.ctxt_attrs = ["context", "comment"]
0707
# Regexes for XML resolution, compiled on first use.
_xml_rx = Data()
_xml_rx.inited = False
def _init_xml_regexes ():
    """Compile the regexes for XML resolution, unless already compiled.

    Compilation is delayed until first use so that tags and attributes
    present in _xml_options at that time are taken into account.
    _xml_rx.i18n_el matches an element with one of the text tags and
    no child elements, capturing tag, attribute string and text.
    _xml_rx.ctxt_attr splits an attribute string around the value
    of a context attribute.
    """
    if _xml_rx.inited:
        return
    # Tag and attribute names can come from the command line; escape
    # them so that they are matched literally (e.g. a dot in a name
    # would otherwise match any character).
    tagins = "|".join(re.escape(t) for t in sorted(_xml_options.text_tags))
    rx = re.compile(r"<\s*(%s)\b([^>]*)>([^<]*)<\s*/\s*\1\s*>" % tagins,
                    re.U | re.S | re.I)
    _xml_rx.i18n_el = rx
    attrins = "|".join(re.escape(a) for a in _xml_options.ctxt_attrs)
    rx = re.compile(r"""^(.*\b(?:%s)\s*=\s*['"])(.*?)(['"].*)$""" % attrins,
                    re.U | re.S | re.I)
    _xml_rx.ctxt_attr = rx
    _xml_rx.inited = True
0722
0723
def resolve_xml (fstr, path):
    """Resolve KUIT markup in element texts of XML-like markup.

    Every element matched by _xml_rx.i18n_el is rebuilt from its tag,
    attributes and text, after the unescaped text (and context, if
    a context attribute is present) has gone through resolve_kuit()
    and has been escaped again.

    fstr is the file content and path the file path, used in messages.
    Returns the resolved file content.
    """

    showparse = False
    if showparse:
        report("%s: >>>>> start >>>>>" % path)

    _init_xml_regexes()
    langdata = get_language_data("en_US")
    toxi18n = _xml_options.switch_to_xi18n

    def resolve_element (el_match):
        # Build the replacement for one matched element.
        lno = lno_to(fstr, el_match.start())
        tag, attr_str, etext = el_match.groups()

        ctxt = None
        attr_match = _xml_rx.ctxt_attr.search(attr_str)
        if attr_match:
            attr_head, ectxt, attr_tail = attr_match.groups()
            ctxt, noesc_ctxt = unescape_xml(ectxt, testnoesc=True)
        text, noesc_text = unescape_xml(etext, testnoesc=True)
        if showparse:
            if ctxt is None:
                report("%s:%d: text={%s}" % (path, lno, etext))
            else:
                report("%s:%d: ctxt-text={%s}{%s}" % (path, lno, ectxt, etext))

        ret = resolve_kuit(ctxt, text, None, langdata, path, lno,
                           toxi18n=toxi18n)
        res_ctxt, res_text = ret[:2]
        res_etext = escape_xml(res_text, noesc=noesc_text)

        if ctxt is None:
            if showparse:
                report("%s:%d: res-text={%s}" % (path, lno, res_etext))
            return "<%s%s>%s</%s>" % (tag, attr_str, res_etext, tag)

        res_ectxt = escape_xml(res_ctxt, noesc=noesc_ctxt)
        if showparse:
            report("%s:%d: res-ctxt-text={%s}{%s}"
                   % (path, lno, res_ectxt, res_etext))
        return ("<%s%s%s%s>%s</%s>"
                % (tag, attr_head, res_ectxt, attr_tail, res_etext, tag))

    # Text between the matches is carried over as it is.
    res_fstr = _xml_rx.i18n_el.sub(resolve_element, fstr)

    if showparse:
        report("%s: <<<<< end <<<<<" % path)
    return res_fstr
0779
0780
# Settings used by resolve_po(); further attributes read there
# (switch_to_xi18n, post_merge) are not set at this place.
_po_options = Data()
_po_options.msgfmt_wrap = False

def resolve_po (path):
    """Resolve KUIT markup in all messages of the PO file at path.

    The catalog is opened through Pology, messages are modified in place
    and the catalog is synced at the end. Original fields are resolved
    using en_US language data, translations using the language data of
    the catalog language (or of the forced language, if that is set).
    Returns what the catalog sync returned (true if file was modified).

    Raises StandardError if the language cannot be determined.
    """

    from pology.catalog import Catalog
    from pology.gtxtools import msgfilter

    cat = Catalog(path)

    langdata_src = get_language_data("en_US")
    lang = _kuit_spec.force_lang or cat.language()
    if not lang:
        raise StandardError(
            "%s: Cannot determine language of PO file." % path)
    langdata_trn = get_language_data(lang)
    toxi18n_global = _po_options.switch_to_xi18n

    seen_keys = set()
    for ind, msg in enumerate(cat):
        # Message xi18n flag overrides the global resolution setting.
        if "kde-kuit-format" in msg.flag:
            toxi18n = 2
        else:
            toxi18n = toxi18n_global

        # Original fields.
        ctxt = msg.msgctxt
        forcerich = False
        if _po_options.post_merge:
            # Original fields are not modified in this mode. They are only
            # examined to decide whether to leave existing KUIT alone or
            # to force rich text in non-original fields.
            probe = resolve_kuit(ctxt, msg.msgid, None,
                                 langdata_src, path, msg.refline,
                                 toxi18n=toxi18n)
            has_any_html_tag, has_any_kuit_tag = probe[3:5]
            if has_any_kuit_tag:
                toxi18n = 2
            else:
                forcerich = has_any_html_tag
            if not forcerich:
                probe = resolve_entities(msg.msgid, path, msg.refline)
                any_entity_resolved = probe[1]
                forcerich = any_entity_resolved
        else:
            ret = resolve_kuit(ctxt, msg.msgid, None,
                               langdata_src, path, msg.refline,
                               toxi18n=toxi18n)
            msg.msgid = ret[1]
            if ctxt is not None:
                msg.msgctxt = ret[0]
            if msg.msgid_plural is not None:
                ret = resolve_kuit(ctxt, msg.msgid_plural, None,
                                   langdata_src, path, msg.refline,
                                   toxi18n=toxi18n)
                msg.msgid_plural = ret[1]

        # Previous original fields.
        ctxt_prev = msg.msgctxt_previous
        has_previous = msg.msgid_previous is not None
        if has_previous:
            ret = resolve_kuit(ctxt_prev, msg.msgid_previous, None,
                               langdata_src, path, msg.refline,
                               toxi18n=toxi18n, forcerich=forcerich)
            msg.msgid_previous = ret[1]
            if ctxt_prev is not None:
                msg.msgctxt_previous = ret[0]
            if msg.msgid_plural_previous is not None:
                ret = resolve_kuit(ctxt_prev, msg.msgid_plural_previous, None,
                                   langdata_src, path, msg.refline,
                                   toxi18n=toxi18n, forcerich=forcerich)
                msg.msgid_plural_previous = ret[1]

        # Translation fields. For a fuzzy message with previous fields,
        # the previous context is the one the translation belongs to.
        ctxt_trn = ctxt_prev if (msg.fuzzy and has_previous) else ctxt
        for i in range(len(msg.msgstr)):
            ret = resolve_kuit(ctxt_trn, msg.msgstr[i], None,
                               langdata_trn, path, msg.refline,
                               toxi18n=toxi18n, forcerich=forcerich)
            msg.msgstr[i] = ret[1]
            if msg.translated:
                # Keep trailing newline of translation same as in original.
                orig_ends_nl = msg.msgid.endswith("\n")
                trn_ends_nl = msg.msgstr[i].endswith("\n")
                if orig_ends_nl and not trn_ends_nl:
                    msg.msgstr[i] += "\n"
                elif not orig_ends_nl and trn_ends_nl:
                    msg.msgstr[i] = msg.msgstr[i][:-1]

        # In post-merge mode, maybe it can be unfuzzied now.
        if (    _po_options.post_merge and msg.fuzzy
            and all(list(msg.msgstr))
            and msg.msgctxt == msg.msgctxt_previous
            and msg.msgid == msg.msgid_previous
            and msg.msgid_plural == msg.msgid_plural_previous
        ):
            msg.unfuzzy()

        # Conversion may make a message with same key as a previous one,
        # remove the current message in that case.
        if msg.key not in seen_keys:
            seen_keys.add(msg.key)
        else:
            cat.remove_on_sync(ind)

    modified = cat.sync()
    if modified and _po_options.msgfmt_wrap:
        msgfilter(["cat"])(cat.filename)

    return modified
0883
0884
# Resolution types, each as
# restype: (set of file extensions, resolver function, boolean setting).
# NOTE(review): meaning of the boolean is not visible at this place,
# check where the inverted map is consumed.
_map_restype_ext = dict(
    ccall=(
        set([
            "c", "cc", "cpp", "cxx",
            "h", "hh", "hpp", "hxx",
            "js", "py", "qml", "rb",
            # "kcfg" is not listed, it won't work due to XML escaping;
            # but there is no existing case of embedded i18n() with KUIT
            # in KDE repos.
        ]),
        resolve_ccall,
        True,
    ),
    xml=(
        set(["kcfg", "rc", "ui"]),
        resolve_xml,
        True,
    ),
    po=(
        set(["pot", "po"]),
        resolve_po,
        False,
    ),
)
# Inverted resolution types by extension, as
# extension: [(restype, resolver function, boolean setting), ...].
# Filled on first call to _init_map_ext_restype().
_map_ext_restype = {}
def _init_map_ext_restype ():
    """Fill the extension-to-resolvers map from _map_restype_ext.

    Does nothing if the map already has some entries.
    """
    if _map_ext_restype:
        return
    for restype, rspec in _map_restype_ext.items():
        extensions, resolver, setting = rspec
        for extension in extensions:
            # One extension may belong to several resolution types.
            bucket = _map_ext_restype.setdefault(extension, [])
            bucket.append((restype, resolver, setting))
0912
0913
def get_resolvers_for_file (path):
    """Return resolver specifications registered for extension of path.

    The extension is whatever follows the last dot in path (empty string
    if there is no dot). Returns the list stored for that extension in
    the inverted extension map, or an empty list if there is none.
    """

    _init_map_ext_restype()
    head, dot, tail = path.rpartition(".")
    # Without any dot, rpartition puts the whole path into the tail.
    ext = tail if dot else ""
    return _map_ext_restype.get(ext, [])
0924
0925
# Delimiters between keys in raw KUIT keyboard shortcuts, and the lookup
# keys for localized delimiters in PO files, as format: (msgctxt, msgid).
# According to kuitsemantics.cpp from kdecore.
_kuit_raw_shortcut_delimiter_rx = re.compile(r"\+|-", flags=re.U)
_kuit_shortcut_delimiters = dict(
    plain=(u"shortcut-key-delimiter/plain", u"+"),
    rich=(u"shortcut-key-delimiter/rich", u"+"),
)
# Term format uses the same delimiters as plain format.
_kuit_shortcut_delimiters["term"] = _kuit_shortcut_delimiters["plain"]

# Raw KUIT keyboard key names. F%1 is special.
_kuit_key_names_raw = set([
    # Modifier keys.
    u"Alt", u"AltGr", u"Control", u"Ctrl", u"Hyper", u"Meta", u"Shift",
    u"Super", u"Win",
    # Lock keys.
    u"CapsLock", u"NumLock", u"ScrollLock",
    # Movement keys.
    u"Down", u"End", u"Home", u"Left", u"PageDown", u"PageUp",
    u"PgDown", u"PgUp", u"Right", u"Up",
    # Editing keys.
    u"Backspace", u"Del", u"Delete", u"Enter", u"Ins", u"Insert",
    u"Return", u"Space", u"Tab",
    # Other keys.
    u"Esc", u"Escape", u"Menu", u"PauseBreak", u"PrintScreen", u"PrtScr",
    u"SysReq",
    # Function keys.
    u"F%1",
])
# Lookup keys for key names in PO files, as set((msgctxt, msgid)).
_kuit_key_names = set(
    (u"keyboard-key-name", rawname) for rawname in _kuit_key_names_raw
)
0948
def textmod_shortcut (text, quote, fmt, langdata):
    """Localize key names and delimiters in a raw keyboard shortcut.

    text is split on raw shortcut delimiters; each key name is stripped
    of surrounding whitespace and replaced by its entry in
    langdata.keyname if there is one (function keys F<digits> go through
    the F%1 pattern). Keys are joined back with langdata.shcdelim[fmt].
    If quote is non-empty, the result is passed through escape_c().
    Returns the modified text.
    """

    pieces = []
    for idx, rawkey in enumerate(_kuit_raw_shortcut_delimiter_rx.split(text)):
        if idx > 0:
            # Delimiter is looked up only if there is more than one key.
            pieces.append(langdata.shcdelim[fmt])
        keyname = rawkey.strip()
        fnumber = keyname[1:]
        if keyname.startswith("F") and fnumber.isdigit():
            fpattern = langdata.keyname.get(u"F%1", u"F%1")
            pieces.append(fpattern.replace("%1", fnumber))
        else:
            pieces.append(langdata.keyname.get(keyname, keyname))
    res_text = "".join(pieces)
    return escape_c(res_text, quote) if quote else res_text
0974
0975
# Delimiter between elements in raw KUIT UI paths, and the lookup keys
# for localized delimiters in PO files, as format: (msgctxt, msgid).
# According to kuitsemantics.cpp from kdecore.
_kuit_raw_guipath_delimiter_rx = re.compile(r"->", flags=re.U)
_kuit_guipath_delimiters = dict(
    plain=(u"gui-path-delimiter/plain", u"→"),
    rich=(u"gui-path-delimiter/rich", u"→"),
)
# Term format uses the same delimiters as plain format.
_kuit_guipath_delimiters["term"] = _kuit_guipath_delimiters["plain"]
0986
def textmod_interface (text, quote, fmt, langdata):
    """Localize delimiters and wrapping in a raw UI path.

    text is split on raw UI path delimiters and each path element is
    stripped of surrounding whitespace. If langdata.ifacewrap is
    non-empty, every element is wrapped: all but the last character of
    ifacewrap go before the element, the last character goes after it.
    Elements are joined back with langdata.guidelim[fmt].
    If quote is non-empty, the result is passed through escape_c().
    Returns the modified text.
    """

    pieces = []
    for idx, rawel in enumerate(_kuit_raw_guipath_delimiter_rx.split(text)):
        if idx > 0:
            # Delimiter is looked up only if there is more than one element.
            pieces.append(langdata.guidelim[fmt])
        pathel = rawel.strip()
        wrap = langdata.ifacewrap
        if wrap:
            pathel = "%s%s%s" % (wrap[:-1], pathel, wrap[-1:])
        pieces.append(pathel)
    res_text = "".join(pieces)
    return escape_c(res_text, quote) if quote else res_text
1010
1011
# KUIT transformation patterns and lookup keys in PO files, as
# (tag, attributes, format):
#     (msgctxt, msgid, subsmap, prepend, postpend, textmodf).
# According to kuitsemantics.cpp from kdecore.
# Each row of the table below lists first the three key fields
# (attributes as list of names), then the six value fields.
_kuit_transforms = dict(
    ((tag, frozenset(attrs), fmt),
     (ctxt, pattern, subsmap, prepend, postpend, textmodf))
    for (tag, attrs, fmt,
         ctxt, pattern, subsmap, prepend, postpend, textmodf) in [
        (u"title", [], "plain",
         u"@title/plain", u"== %1 ==", {"%1": "title"}, "", "\n", None),
        (u"title", [], "rich",
         u"@title/rich", u"<h2>%1</h2>", {"%1": "title"}, "", "", None),
        (u"subtitle", [], "plain",
         u"@subtitle/plain", u"~ %1 ~", {"%1": "subtitle"}, "", "\n", None),
        (u"subtitle", [], "rich",
         u"@subtitle/rich", u"<h3>%1</h3>", {"%1": "subtitle"}, "", "", None),
        (u"para", [], "plain",
         u"@para/plain", u"%1", {"%1": "para"}, "", "\n", None),
        (u"para", [], "rich",
         u"@para/rich", u"<p>%1</p>", {"%1": "para"}, "", "", None),
        (u"list", [], "plain",
         u"@list/plain", u"%1", {"%1": "list"}, "\n", "", None),
        (u"list", [], "rich",
         u"@list/rich", u"<ul>%1</ul>", {"%1": "list"}, "", "", None),
        (u"item", [], "plain",
         u"@item/plain", u"  * %1", {"%1": "item"}, "", "\n", None),
        (u"item", [], "rich",
         u"@item/rich", u"<li>%1</li>", {"%1": "item"}, "", "", None),
        (u"note", [], "plain",
         u"@note/plain", u"Note: %1", {"%1": "note"}, "", "", None),
        (u"note", [], "rich",
         u"@note/rich", u"<i>Note</i>: %1", {"%1": "note"}, "", "", None),
        (u"note", [u"label"], "plain",
         u"@note-with-label/plain\n"
         u"%1 is the note label, %2 is the text",
         u"%1: %2", {"%1": "label", "%2": "note"}, "", "", None),
        (u"note", [u"label"], "rich",
         u"@note-with-label/rich\n"
         u"%1 is the note label, %2 is the text",
         u"<i>%1</i>: %2", {"%1": "label", "%2": "note"}, "", "", None),
        (u"warning", [], "plain",
         u"@warning/plain", u"WARNING: %1", {"%1": "warning"}, "", "", None),
        (u"warning", [], "rich",
         u"@warning/rich", u"<b>Warning</b>: %1", {"%1": "warning"},
         "", "", None),
        (u"warning", [u"label"], "plain",
         u"@warning-with-label/plain\n"
         u"%1 is the warning label, %2 is the text",
         u"%1: %2", {"%1": "label", "%2": "warning"}, "", "", None),
        (u"warning", [u"label"], "rich",
         u"@warning-with-label/rich\n"
         u"%1 is the warning label, %2 is the text",
         u"<b>%1</b>: %2", {"%1": "label", "%2": "warning"}, "", "", None),
        (u"link", [], "plain",
         u"@link/plain", u"%1", {"%1": "link"}, "", "", None),
        (u"link", [], "rich",
         u"@link/rich", u"<a href=\"%1\">%1</a>", {"%1": "link"},
         "", "", None),
        (u"link", [u"url"], "plain",
         u"@link-with-description/plain\n"
         u"%1 is the URL, %2 is the descriptive text",
         u"%2 (%1)", {"%2": "link", "%1": "url"}, "", "", None),
        (u"link", [u"url"], "rich",
         u"@link-with-description/rich\n"
         u"%1 is the URL, %2 is the descriptive text",
         u"<a href=\"%1\">%2</a>", {"%2": "link", "%1": "url"},
         "", "", None),
        (u"filename", [], "plain",
         u"@filename/plain", u"‘%1’", {"%1": "filename"}, "", "", None),
        (u"filename", [], "rich",
         u"@filename/rich", u"<tt>%1</tt>", {"%1": "filename"},
         "", "", None),
        (u"application", [], "plain",
         u"@application/plain", u"%1", {"%1": "application"}, "", "", None),
        (u"application", [], "rich",
         u"@application/rich", u"%1", {"%1": "application"}, "", "", None),
        (u"command", [], "plain",
         u"@command/plain", u"%1", {"%1": "command"}, "", "", None),
        (u"command", [], "rich",
         u"@command/rich", u"<tt>%1</tt>", {"%1": "command"}, "", "", None),
        (u"command", [u"section"], "plain",
         u"@command-with-section/plain\n"
         u"%1 is the command name, %2 is its man section",
         u"%1(%2)", {"%1": "command", "%2": "section"}, "", "", None),
        (u"command", [u"section"], "rich",
         u"@command-with-section/rich\n"
         u"%1 is the command name, %2 is its man section",
         u"<tt>%1(%2)</tt>", {"%1": "command", "%2": "section"},
         "", "", None),
        (u"resource", [], "plain",
         u"@resource/plain", u"“%1”", {"%1": "resource"}, "", "", None),
        (u"resource", [], "rich",
         u"@resource/rich", u"“%1”", {"%1": "resource"}, "", "", None),
        (u"icode", [], "plain",
         u"@icode/plain", u"“%1”", {"%1": "icode"}, "", "", None),
        (u"icode", [], "rich",
         u"@icode/rich", u"<tt>%1</tt>", {"%1": "icode"}, "", "", None),
        (u"bcode", [], "plain",
         u"@bcode/plain", u"\n%1\n", {"%1": "bcode"}, "", "", None),
        (u"bcode", [], "rich",
         u"@bcode/rich", u"<pre>%1</pre>", {"%1": "bcode"}, "", "", None),
        (u"shortcut", [], "plain",
         u"@shortcut/plain", u"%1", {"%1": "shortcut"},
         "", "", textmod_shortcut),
        (u"shortcut", [], "rich",
         u"@shortcut/rich", u"<b>%1</b>", {"%1": "shortcut"},
         "", "", textmod_shortcut),
        (u"interface", [], "plain",
         u"@interface/plain", u"|%1|", {"%1": "interface"},
         "", "", textmod_interface),
        (u"interface", [], "rich",
         u"@interface/rich", u"<i>%1</i>", {"%1": "interface"},
         "", "", textmod_interface),
        (u"emphasis", [], "plain",
         u"@emphasis/plain", u"*%1*", {"%1": "emphasis"}, "", "", None),
        (u"emphasis", [], "rich",
         u"@emphasis/rich", u"<i>%1</i>", {"%1": "emphasis"}, "", "", None),
        (u"emphasis", [u"strong"], "plain",
         u"@emphasis-strong/plain", u"**%1**", {"%1": "emphasis"},
         "", "", None),
        (u"emphasis", [u"strong"], "rich",
         u"@emphasis-strong/rich", u"<b>%1</b>", {"%1": "emphasis"},
         "", "", None),
        (u"placeholder", [], "plain",
         u"@placeholder/plain", u"&lt;%1&gt;", {"%1": "placeholder"},
         "", "", None),
        (u"placeholder", [], "rich",
         u"@placeholder/rich", u"&lt;<i>%1</i>&gt;", {"%1": "placeholder"},
         "", "", None),
        (u"email", [], "plain",
         u"@email/plain", u"&lt;%1&gt;", {"%1": "email"}, "", "", None),
        (u"email", [], "rich",
         u"@email/rich", u"&lt;<a href=\"mailto:%1\">%1</a>&gt;",
         {"%1": "email"}, "", "", None),
        (u"email", [u"address"], "plain",
         u"@email-with-name/plain\n"
         u"%1 is name, %2 is address",
         u"%1 &lt;%2&gt;", {"%1": "email", "%2": "address"}, "", "", None),
        (u"email", [u"address"], "rich",
         u"@email-with-name/rich\n"
         u"%1 is name, %2 is address",
         u"<a href=\"mailto:%2\">%1</a>", {"%1": "email", "%2": "address"},
         "", "", None),
        (u"envar", [], "plain",
         u"@envar/plain", u"$%1", {"%1": "envar"}, "", "", None),
        (u"envar", [], "rich",
         u"@envar/rich", u"<tt>$%1</tt>", {"%1": "envar"}, "", "", None),
        (u"message", [], "plain",
         u"@message/plain", u"/%1/", {"%1": "message"}, "", "", None),
        (u"message", [], "rich",
         u"@message/rich", u"<i>%1</i>", {"%1": "message"}, "", "", None),
        (u"nl", [], "plain",
         u"@nl/plain", u"%1\n", {"%1": "nl"}, "", "", None),
        (u"nl", [], "rich",
         u"@nl/rich", u"%1<br/>", {"%1": "nl"}, "", "", None),
    ]
)
1363
# Add patterns for term format, same as plain.
# Iteration goes over a snapshot of the items, because new entries are
# added to the dictionary inside the loop (iterating over a live view
# of a dictionary while it grows raises RuntimeError).
for (tag, attrs, fmt), trspec in list(_kuit_transforms.items()):
    if fmt == "plain":
        _kuit_transforms[(tag, attrs, "term")] = trspec
1368
# Collect all known tags and formats; keys of the transformation
# dictionary are (tag, attributes, format).
_kuit_tags = set(trkey[0] for trkey in _kuit_transforms)
_known_formats = set(trkey[2] for trkey in _kuit_transforms)
1375
# Qt rich text tags (used for implicit determination of rich format).
# The list follows the HTML subset supported by Qt rich text;
# the citation tag there is "cite".
_html_tags = set([
    "a", "address", "b", "big", "blockquote", "body", "br",
    "center", "cite", "code", "dd", "dfn", "div", "dl", "dt", "em",
    "font", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html",
    "i", "img", "kbd", "meta", "li", "nobr", "ol", "p", "pre",
    "qt", "s", "samp", "small", "span", "strong", "sup", "sub",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "tt",
    "u", "ul", "var",
])
1386
# Default target formats by context marker.
# According to kuitsemantics.cpp from kdecore.
# Everything is plain by default, except for bare @info.
_cmarker_to_format = dict.fromkeys([
    "@action",
    "@title",
    "@label",
    "@option",
    "@item",
    "@info:progress",
    "@info:status",
    "@info:credit",
    "@info:shell",
], "plain")
_cmarker_to_format["@info"] = "rich"
1401
# Matches text wrapped in a top-level <qt> or <html> tag; group 1 is
# the tag name, group 2 the wrapped content. The closing tag must be
# the same as the opening one (backreference), so that <html>...</html>
# is matched as well as <qt>...</qt>.
_top_tag_rx = re.compile(r"<\s*(qt|html)\b[^>]*>(.*)<\s*/\s*\1\s*>",
                         re.U | re.S | re.I)
1404
def resolve_kuit (ctxt, text, quote, langdata, path, lno,
                  toxi18n=0, forcerich=False):
    """Resolve KUIT markup in one i18n string.

    ctxt is the context of the string (may be None) and text the string
    itself. quote is the quote string around the text when it comes from
    code, false value otherwise. langdata supplies the transformation
    patterns; path and lno are used in warnings only (path also to
    detect PO files). toxi18n selects switching to xi18n: with 1
    the string is left unresolved if it contains any KUIT tag, with 2
    it is always left unresolved. forcerich forces rich target format.

    Returns tuple (ctxt, text, xi18n, has_any_html_tag, has_any_kuit_tag),
    where ctxt and text are resolved or original values, and xi18n
    is True if the string is to be switched to xi18n call.
    """

    xi18n = False

    fmt_cm, fmt_rc, res_ctxt, has_cmarker = format_from_cmarker(ctxt, quote)
    if forcerich:
        fmt_cm = "rich"
        fmt_rc = "rich"
    if fmt_cm and fmt_cm not in _known_formats:
        warning("%s:%d: Unknown format modifier '%s' in context marker. "
                "The string will not be resolved until this is fixed."
                % (path, lno, fmt_cm))
        has_any_html_tag = False
        has_any_kuit_tag = False
        return ctxt, text, xi18n, has_any_html_tag, has_any_kuit_tag
    # Both PO files and PO templates are exempt from the warning below,
    # same as in the PO check done in _resolve_kuit_r().
    if (    toxi18n in (1, 2) and fmt_cm != fmt_rc
        and not path.endswith((".po", ".pot"))
    ):
        warning("%s:%d: Manual format modifier '%s' does not match "
                "the implicit format modifier '%s' based on context marker. "
                "Manual format modifiers are no longer supported, "
                "replace them with another format selection method."
                % (path, lno, fmt_cm, fmt_rc))
        # Recover original context with modifier still inside.
        res_ctxt = ctxt
    fmt = fmt_cm or format_from_tags(text, quote) or "plain"

    ret = _resolve_kuit_r(text, quote, fmt, langdata, path, lno)
    res_text, has_any_kuit_tag, has_any_html_tag, has_top_tag = ret

    if (toxi18n == 1 and has_any_kuit_tag) or toxi18n == 2:
        if has_any_html_tag:
            warning("%s:%d: Mixed KUIT and HTML tags. "
                    "This should be changed to all-KUIT tags."
                    % (path, lno))
        xi18n = True
        # Text is returned unresolved, only the context may be changed.
        return res_ctxt, text, xi18n, has_any_html_tag, has_any_kuit_tag

    if fmt_cm != "rich" and not has_any_html_tag:
        ret = resolve_entities(res_text, path, lno)
        res_text, any_entity_resolved = ret
    else:
        any_entity_resolved = False

    if not has_cmarker and not has_any_kuit_tag and not any_entity_resolved:
        # In this case the resolution should have been no-op,
        # so return the original input just in case.
        return ctxt, text, xi18n, has_any_html_tag, has_any_kuit_tag

    if has_top_tag or fmt_cm == "rich":
        # What to do with top tag in rich text.
        # 0 - As in KUIT processing in kdecore. But this would cause
        #     <html> tags to appear in otherwise plain text which happens
        #     to be sent to rich-text capable output. People may not like it.
        #     (It would also cause that running resolution over already
        #     resolved files leads to spurious addition of <html> tags,
        #     e.g. 1st resolution @info/plain -> @info and no <html> tag,
        #     2nd resolution @info -> @info and <html> tag.)
        # 1 - Original top tag is removed and then <html> tag added only if
        #     there is another tag or entity in the text.
        # 2 - Top tag is neither added nor removed, but left as it is
        #     in the literal text.
        top_tag_res = 2
        if top_tag_res in (0, 1):
            if has_top_tag:
                res_text = _top_tag_rx.sub(r"\2", res_text)
            if top_tag_res == 0 or ("<" in res_text or "&" in res_text):
                # The new top tag goes inside the quotes, if there are any.
                p1 = 0
                p2 = len(res_text)
                if quote:
                    p1 = res_text.find(quote) + len(quote)
                    p2 = res_text.rfind(quote)
                res_text = ("%s<html>%s</html>%s"
                            % (res_text[:p1], res_text[p1:p2], res_text[p2:]))
        elif top_tag_res == 2:
            pass
        else:
            raise StandardError(
                "Unknown top tag resolution choice '%d'." % top_tag_res)

    return res_ctxt, res_text, xi18n, has_any_html_tag, has_any_kuit_tag
1485
1486
# An element with opening and closing tag (groups: tag, attribute string,
# content) or an in-place element (only the tag group is set).
_element_rx = re.compile(r"<\s*(\w+)(?:([^>]*)>(.*?)<\s*/\s*\1|\s*/)\s*>",
                         re.U | re.S)
# A quoted attribute (groups: name, value).
_attribute_rx = re.compile(r"""\b(\w+)\s*=\s*["'](.*?)["']""")

def _resolve_kuit_r (text, quote, fmt, langdata, path, lno):
    """Recursively resolve markup elements in text for target format fmt.

    Elements are looked for one after another; content of each element
    is resolved by a recursive call before the element itself.
    KUIT elements are replaced according to the transformation found in
    langdata.transform, numid elements are removed leaving their content,
    known HTML elements are reassembled with resolved content, and all
    other elements are left as they are in the text.
    path and lno are used in warnings (path also to detect PO files).

    Returns tuple (res_text, has_any_kuit_tag, has_any_html_tag,
    has_top_tag).
    """

    segs = []
    p1 = 0
    has_any_kuit_tag = False
    has_any_html_tag = False
    has_top_tag = False
    while True:
        m = _element_rx.search(text, p1)
        if not m:
            segs.append(text[p1:])
            break
        p2, p3 = m.span()
        segs.append(text[p1:p2])
        tag, attrstr, etext = m.groups()
        if etext is None:
            # In-place element, it has neither attributes nor content.
            in_place = True
            attrstr, etext = "", ""
        else:
            in_place = False
        ret = _resolve_kuit_r(etext, quote, fmt, langdata, path, lno)
        res_etext, has_any_kuit_tag_1, has_any_html_tag_1, has_top_tag_1 = ret
        has_any_html_tag = has_any_html_tag or has_any_html_tag_1
        has_any_kuit_tag = has_any_kuit_tag or has_any_kuit_tag_1
        res_span = text[p2:p3] # in case no other resolution
        if tag in _kuit_tags:
            has_any_kuit_tag = True
            attrmap = dict(_attribute_rx.findall(attrstr))
            has_top_tag = has_top_tag or has_top_tag_1
            trkey = (tag, frozenset(attrmap.keys()), fmt)
            tr = langdata.transform.get(trkey)
            if tr is not None:
                if tr.textmodf:
                    res_etext = tr.textmodf(res_etext, quote, fmt, langdata)
                res_span = tr.pattern
                if quote:
                    res_span = escape_c(res_span, quote)
                # Replacements are attribute values by attribute name,
                # and the resolved content by tag name.
                replmap = attrmap
                replmap[tag] = res_etext
                # Replace in one pass, because replacement might contain %N.
                p1a = 0
                csegs = []
                seen_pls = set()
                while True:
                    p2a = res_span.find("%", p1a)
                    if p2a < 0:
                        csegs.append(res_span[p1a:])
                        break
                    csegs.append(res_span[p1a:p2a])
                    if res_span[p2a + 1:p2a + 2].isdigit():
                        pl = res_span[p2a:p2a + 2]
                        nm = tr.subsmap[pl]
                        cseg = replmap[nm] # cannot fail
                        if quote and pl in seen_pls:
                            # If placeholder was already replaced once,
                            # further replacements have to eliminate
                            # masking chars and quotes, because
                            # total number of masking chars must not change.
                            cseg = join_quoted(cseg, quote,
                                               invert=True, strip=True)
                        seen_pls.add(pl)
                        csegs.append(cseg)
                        p1a = p2a + 2
                    else:
                        csegs.append("%")
                        p1a = p2a + 1
                res_span = "".join(csegs)
                res_span = tr.prepend + res_span + tr.postpend
            else:
                warning("%s:%d: No transformation for tag '%s' and format '%s'."
                        % (path, lno, tag, fmt))
        elif tag == "numid":
            has_any_kuit_tag = True
            # What to do with numid tag.
            # 0 - Simply remove numid tag, with a warning to manually convert
            #     associated argument into digit string.
            # 1 - Modify all placeholders in the text wrapped with numid
            #     to %I<N> form, which indicates numeric identifier formatting.
            numid_tag_res = 0
            if numid_tag_res == 0:
                if not path.endswith((".po", ".pot")):
                    warning("%s:%d: A '%s' tag has been removed, do something "
                            "manually with the affected argument "
                            "(e.g. wrap it in QString::number())."
                            % (path, lno, tag))
                res_span = res_etext
            elif numid_tag_res == 1:
                nisegs = []
                p1b = 0
                while True:
                    p2b = res_etext.find("%", p1b)
                    if p2b < 0:
                        nisegs.append(res_etext[p1b:])
                        break
                    nisegs.append(res_etext[p1b:p2b])
                    if res_etext[p2b + 1:p2b + 2].isdigit():
                        p3b = p2b + 1
                        while p3b < len(res_etext) and res_etext[p3b].isdigit():
                            p3b += 1
                        nisegs.append("%I" + res_etext[p2b + 1:p3b])
                        p1b = p3b
                    else:
                        # Continue right after this percent character;
                        # text before it has already been collected.
                        nisegs.append("%")
                        p1b = p2b + 1
                res_span = "".join(nisegs)
            else:
                raise StandardError(
                    "Unknown '%s' tag resolution choice '%d'."
                    % ("numid", numid_tag_res))
        elif tag in _html_tags:
            has_any_html_tag = True
            if tag.lower() in ("qt", "html"):
                has_top_tag = True
            if not in_place:
                res_span = "<%s%s>%s</%s>" % (tag, attrstr, res_etext, tag)
        segs.append(res_span)
        p1 = p3
    res_text = "".join(segs)
    return res_text, has_any_kuit_tag, has_any_html_tag, has_top_tag
1610
1611
# Matches a named (&name;), decimal (&#NNN;) or hexadecimal (&#xHHH;) entity.
_entity_rx = re.compile(r"&([a-z]+|#[0-9]+|#x[0-9a-fA-F]+);", re.U | re.S)

# The predefined XML entities and the characters they stand for.
_xml_entities = {
    "lt": "<",
    "gt": ">",
    "amp": "&",
    "apos": "'",
    "quot": "\"",
}

def resolve_entities (text, path, lno):
    """
    Replace XML entities in the text with the characters they represent.

    Numeric character references and the five predefined XML entities
    are resolved. Any other named entity is left untouched (it may be
    an HTML entity), as is a numeric reference which cannot be converted
    into a character; for the latter a warning is issued as well.

    @param text: the text in which to resolve entities
    @param path: path of the file the text comes from (for warnings)
    @param lno: line number of the text in the file (for warnings)
    @return: resolved text and whether at least one entity was resolved
    """

    # Pick the code point to character converter available
    # in the running interpreter.
    try:
        to_char = unichr # Python 2
    except NameError:
        to_char = chr # Python 3

    any_entity_resolved = False
    segs = []
    p1 = 0
    for m in _entity_rx.finditer(text):
        p2, p3 = m.span()
        segs.append(text[p1:p2])
        span = m.group(0)
        ent = m.group(1)
        if ent.startswith("#"): # numeric character
            try:
                if ent[1] == "x":
                    c = to_char(int(ent[2:], 16))
                else:
                    c = to_char(int(ent[1:], 10))
            except (ValueError, OverflowError):
                # Code point out of range: keep the entity as it is,
                # rather than inserting an undefined or stale character.
                warning("%s:%d: Invalid numeric XML entity '%s'."
                        % (path, lno, ent))
                segs.append(span)
            else:
                segs.append(c)
                any_entity_resolved = True
        elif ent in _xml_entities:
            segs.append(_xml_entities[ent])
            any_entity_resolved = True
        else:
            # Don't warn, may be some HTML entity.
            segs.append(span)
        p1 = p3
    segs.append(text[p1:])
    res_text = "".join(segs)
    return res_text, any_entity_resolved
1656
1657
# Context marker of the form @role[:cue][/format].
_cmarker_rx = re.compile(r"@(\w+):?(\w+)?/?(\w+)?", re.U | re.S)

def format_from_cmarker (ctxt, quote):
    """
    Determine the target format from the context marker in the context.

    The marker is expected at the very start of the context or, when
    a quote is given, right after the first occurrence of the quote.

    @param ctxt: the context string, or None if there is no context
    @param quote: the quote sequence wrapping the context, or a false value
    @return: tuple of
        - the format: explicit modifier if present, otherwise the format
          implied by role and cue (None if neither is available),
        - the format implied by role and cue alone,
        - the context with the explicit format modifier removed,
        - whether a context marker was found
    """

    if ctxt is None:
        return None, None, ctxt, False

    start = ctxt.find(quote) + len(quote) if quote else 0
    m = _cmarker_rx.match(ctxt, start)
    if not m:
        return None, None, ctxt, False

    role, cue, fmt = m.groups()

    # Implicit format: first by role and cue, then by role alone.
    fmt_rc = None
    if role and cue:
        fmt_rc = _cmarker_to_format.get("@%s:%s" % (role, cue))
    if not fmt_rc:
        fmt_rc = _cmarker_to_format.get("@%s" % role)

    if fmt:
        # Explicit format modifier, cut it (with the slash) out of context.
        slash = ctxt.find("/", start)
        res_ctxt = ctxt[:slash] + ctxt[slash + 1 + len(fmt):]
    else:
        fmt = fmt_rc
        res_ctxt = ctxt
    return fmt, fmt_rc, res_ctxt, True
1684
1685
# Opening tag; the tag name is captured.
_opentag_rx = re.compile(r"<\s*(\w+)[^>]*>", re.U | re.S)

def format_from_tags (text, quote):
    """
    Determine the target format from tags present in the text.

    @param text: the text to examine
    @param quote: not used by this function
    @return: "rich" if any opening tag in the text has its name
        in _html_tags, None otherwise
    """

    tag_names = _opentag_rx.findall(text)
    if any(name in _html_tags for name in tag_names):
        return "rich"
    return None
1696
1697
def escape_c (text, quote):
    """
    Escape the text for placing it inside a C-like string literal.

    Backslashes, tabs and newlines are always escaped; the quote
    sequence is escaped only if a non-empty quote is given.

    @param text: the text to escape
    @param quote: the quote sequence of the literal, or a false value
    @return: the escaped text
    """

    replacements = [("\\", "\\\\")] # must be first
    if quote:
        replacements.append((quote, "\\" + quote))
    replacements.extend([("\t", "\\t"), ("\n", "\\n")])
    for raw, escaped in replacements:
        text = text.replace(raw, escaped)
    return text
1706
1707
def join_quoted (s, quote, invert=False, strip=False):
    """
    Join the string literals of a concatenated string literal into one.

    The input is split into the contents of quoted segments and
    the text outside of them; one of these two groups is joined
    and returned.

    @param s: the text containing one or more quoted segments
    @param quote: the quote sequence delimiting the segments
    @param invert: join the text outside of quoted segments instead
        of the contents of quoted segments
    @param strip: do not wrap the joined text in quotes
    @return: the joined text
    @raise StandardError: if the closing quote of a segment is not found
    """

    quoted = [] # contents of quoted segments
    unquoted = [] # text outside of quoted segments
    qlen = len(quote)
    pos = 0
    while True:
        qstart = find_esc(s, quote, "\\", pos)
        if qstart < 0:
            # No more opening quotes, the rest is outside text.
            unquoted.append(s[pos:])
            break
        unquoted.append(s[pos:qstart])
        body_start = qstart + qlen
        body_end = find_skip_esc(s, quote, "\\", body_start)
        if body_end < 0:
            raise StandardError(
                "Malformed concatenated string literal '%s'." % s)
        quoted.append(s[body_start:body_end])
        pos = body_end + qlen

    joined = "".join(unquoted if invert else quoted)
    if strip:
        return joined
    return quote + joined + quote
1734
1735
def find_esc (s, f, e, p=0):
    """
    Find the first unescaped occurrence of a substring.

    The search advances through the string from the start position;
    whenever the escape sequence is encountered, it is skipped
    together with the one character following it.

    @param s: the string to search in
    @param f: the substring to find
    @param e: the escape sequence
    @param p: the position to start searching from
    @return: position of the substring, or -1 if it was not found
    """

    end = len(s)
    skip = len(e) + 1 # the escape sequence and the escaped character
    pos = p
    while pos < end:
        if s.startswith(e, pos):
            pos += skip
            continue
        if s.startswith(f, pos):
            return pos
        pos += 1
    return -1
1750
1751
# Characters and the entities which escape them, in the order
# in which escaping must be performed.
_xml_entities_escape_ordered = [
    ("&", "&amp;"), # must be first
    ("<", "&lt;"),
    (">", "&gt;"),
    ("\"", "&quot;"),
    ("'", "&apos;"),
]
# Entities and the characters they stand for, in the order
# in which unescaping must be performed (ampersand comes last).
_xml_entities_unescape_ordered = [
    tuple(reversed(x)) for x in reversed(_xml_entities_escape_ordered)]

def unescape_xml (es, testnoesc=False):
    """
    Replace the predefined XML entities in the text with their characters.

    If requested, it is also reported which characters appear to have
    been left unescaped in the input: an entity is reported when
    the first occurrence of its character in the text is a bare one.

    @param es: the text with XML entities
    @param testnoesc: whether to also report entities found not escaped
    @return: the unescaped text, or the tuple of the unescaped text
        and the set of entities found not escaped if testnoesc is set
    """

    s = es
    noesc = set()
    for ent, val in _xml_entities_unescape_ordered:
        if testnoesc:
            first = s.find(val)
            # The start check matters only for & -> &amp;
            if first >= 0 and not s.startswith(ent, first):
                noesc.add(ent)
        s = s.replace(ent, val)
    return (s, noesc) if testnoesc else s
1777
def escape_xml (s, noesc=None):
    """
    Replace XML-special characters in the text with predefined entities.

    @param s: the text to escape
    @param noesc: collection of entities (e.g. "&lt;") which should not
        be produced, i.e. whose characters are to be left bare
    @return: the escaped text
    """

    es = s
    for val, ent in _xml_entities_escape_ordered:
        if noesc and ent in noesc:
            continue
        es = es.replace(val, ent)
    return es
1785
1786
# Run the command-line entry point only when the file is executed
# as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()
1789