# -*- coding: UTF-8 -*-

"""
Replace value-defining segments in text with their values.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import difflib
import os
import re

from pology import PologyError, _, n_
from pology.report import warning, format_item_list


# Default starting string of alternatives directives.
DEFAULT_ALTHEAD = "~@"


_entity_ref_rx = re.compile(r"&([\w:][\w\d.:-]*);", re.U)

def resolve_entities (text, entities, ignored=set(), srcname=None,
                      vfilter=None, undefrepl=None):
    """
    Replace XML entities in the text with their values.

    Entity values are defined by the supplied dictionary of name-value pairs.
    Not all entities need to be replaced; some can be explicitly ignored.
    If an entity is neither defined nor ignored, a warning will be reported
    to standard output if C{srcname} is given.

    An undefined entity is by default left untouched in the resulting text.
    Instead, the parameter C{undefrepl} can be used to supply a string to
    substitute for every undefined entity, or a function which takes
    the undefined entity name and returns the string to substitute.
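
    For illustration, a hypothetical call (entity names and values invented
    for this example)::

        resolve_entities("Welcome to &appname;.", {"appname": "Foobar"})
        # -> ("Welcome to Foobar.", ["appname"], [])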

    @param text: the text to transform
    @type text: string
    @param entities: entity name-value pairs
    @type entities: has .get() with dict.get() semantics
    @param ignored: entities to ignore; a sequence of entity names,
        or a function taking the entity name and returning C{True} if ignored
    @type ignored: a sequence or (string)->bool
    @param srcname: if not None, report unknown entities to standard output,
        with this parameter as source identifier
    @type srcname: None or string
    @param vfilter: format string (with single C{%s} directive) or function
        to apply to every resolved entity value
    @type vfilter: string or (string)->string
    @param undefrepl: string or function to use in case of undefined entity
    @type undefrepl: string or (string)->string

    @returns: the resulting text, resolved entity names,
        and unknown entity names
    @rtype: (string, [string...], [string...])
    """

    ignoredf = ignored if callable(ignored) else lambda x: x in ignored

    unknown = []
    resolved = []
    segs = []
    p = 0
    while True:
        pp = p
        p = text.find("&", p)
        if p < 0:
            segs.append(text[pp:])
            break

        segs.append(text[pp:p])
        m = _entity_ref_rx.match(text, p)
        if m:
            entref = m.group(0)
            entname = m.group(1)
            if not ignoredf(entname):
                entval = entities.get(entname)
                entvalr = entval
                if entval is not None:
                    resolved.append(entname)
                else:
                    unknown.append(entname)
                    if undefrepl is not None:
                        if isinstance(undefrepl, str):
                            entvalr = undefrepl
                        else:
                            entvalr = undefrepl(entname)

                if entvalr is not None:
                    if vfilter is not None:
                        if isinstance(vfilter, str):
                            entvalr = vfilter % entvalr
                        else:
                            entvalr = vfilter(entvalr)
                    # Recurse in case the entity resolves into new entities.
                    res = resolve_entities(entvalr, entities, ignoredf,
                                           srcname, vfilter, undefrepl)
                    entvalr, resolved_extra, unknown_extra = res
                    resolved.extend(resolved_extra)
                    unknown.extend(unknown_extra)
                    segs.append(entvalr)
                else:
                    segs.append(entref)

                if entval is None and srcname is not None:
                    # Try to suggest some near matches.
                    #nears = difflib.get_close_matches(entname, entities)
                    # FIXME: Too slow for a lot of entities.
                    nears = []
                    if nears:
                        warning(_("@info",
                                  "%(file)s: Unknown entity '%(ent)s' "
                                  "(near matches: %(entlist)s).",
                                  file=srcname, ent=entname,
                                  entlist=format_item_list(nears)))
                    else:
                        warning(_("@info",
                                  "%(file)s: Unknown entity '%(ent)s'.",
                                  file=srcname, ent=entname))
            else:
                segs.append(entref)

            p += len(entref)
        else:
            segs.append("&")
            p += 1

    new_text = type(text)("").join(segs)

    return new_text, resolved, unknown


def resolve_entities_simple (text, entities, ignored=set(),
                             srcname=None, vfilter=None):
    """
    As L{resolve_entities}, but returns only the resolved text.
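
    For illustration (hypothetical values)::

        resolve_entities_simple("Welcome to &appname;.", {"appname": "Foobar"})
        # -> "Welcome to Foobar."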

    @returns: the resulting text
    @rtype: string

    @see: L{resolve_entities}
    """

    return resolve_entities(text, entities, ignored,
                            srcname=srcname, vfilter=vfilter)[0]


def resolve_alternatives (text, select, total, althead=DEFAULT_ALTHEAD,
                          altfilter=None, outfilter=None, condf=None,
                          srcname=None):
    """
    Replace alternatives directives in the text with the selected alternative.

    Alternatives directives are of the form C{~@/.../.../...}, for example::

        I see a ~@/pink/white/ elephant.

    where C{~@} is the directive head, followed by a character that
    defines the delimiter of alternatives (as in the C{sed} command).
    The number of alternatives per directive is not defined by the directive
    itself, but is provided as an external parameter.

    An alternatives directive is resolved into one of its alternative
    substrings by the given index of the alternative (one-based).
    Before substituting the directive, the selected alternative can be
    filtered through the function given by the C{altfilter} parameter.
    Text outside of directives can be filtered as well, piece by piece,
    through the function given by the C{outfilter} parameter.

    If an alternatives directive is malformed (e.g. too few alternatives),
    it may be reported to standard output. Unless all encountered directives
    are well-formed, the original text is returned instead of the partially
    resolved one.
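
    For illustration, a hypothetical call selecting the second of two
    alternatives::

        resolve_alternatives("I see a ~@/pink/white/ elephant.", 2, 2)
        # -> ("I see a white elephant.", 1, True)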

    @param text: the text to transform
    @type text: string
    @param select: index of the alternative to select (one-based)
    @type select: int > 0
    @param total: number of alternatives per directive
    @type total: int > 0
    @param althead: directive head to use instead of the default one
    @type althead: string
    @param altfilter: filter to apply to chosen alternatives
    @type altfilter: (string) -> string
    @param outfilter: filter to apply to text outside of directives
    @type outfilter: (string) -> string
    @param condf:
        resolve current alternative directive only when this function
        returns C{True} on call with each alternative as argument
    @type condf: None or C{(x_1, ..., x_n) -> True/False}
    @param srcname:
        if not None, report malformed directives to standard output,
        with this string as source identifier
    @type srcname: None or string
    @returns:
        resulting text, number of resolved alternatives, and an indicator
        of well-formedness (C{True} if all directives well-formed)
    @rtype:
        string, int, bool
    """

    alt_head = althead
    alt_hlen = len(althead)

    if outfilter is None:
        outfilter = lambda x: x
    if altfilter is None:
        altfilter = lambda x: x

    original_text = text
    new_text = ""
    nresolved = 0
    malformed = False
    p = -1
    while True:
        pp = p + 1
        p = text.find(alt_head, pp)
        if p < 0:
            new_text += outfilter(text[pp:])
            break
        ps = p

        # Append segment prior to alternatives directive to the result.
        new_text += outfilter(text[pp:p])
        rep_text = text[p:] # text segment for error reporting

        # Must have at least 2 characters after the head.
        if len(text) < p + alt_hlen + 2:
            malformed = True
            if srcname is not None:
                warning(_("@info",
                          "%(file)s: Malformed alternatives directive "
                          "'...%(snippet)s'.",
                          file=srcname, snippet=rep_text))
            break

        # Read the separating character.
        p += alt_hlen
        sep = text[p]

        # Parse requested number of inserts,
        # choose the one with matching index for the result.
        alts = []
        for i in range(total):
            pp = p + 1
            p = text.find(sep, pp)
            # Must have exactly the given total number of alternatives.
            if p < 0:
                malformed = True
                if srcname is not None:
                    warning(_("@info",
                              "%(file)s: Too few alternatives in "
                              "the alternatives directive '...%(snippet)s'.",
                              file=srcname, snippet=rep_text))
                break
            alts.append(text[pp:p])
        if malformed:
            break

        # Replace the alternative if admissible, or leave directive untouched.
        isel = select - 1
        if isel < len(alts) and (not condf or condf(*alts)):
            new_text += altfilter(alts[isel])
            nresolved += 1
        else:
            new_text += text[ps:p+1]

    if malformed:
        new_text = original_text
        nresolved = 0

    return new_text, nresolved, not malformed


def resolve_alternatives_simple (text, select, total, althead=DEFAULT_ALTHEAD,
                                 altfilter=None, outfilter=None, condf=None,
                                 srcname=None):
    """
    As L{resolve_alternatives}, but returns only the resolved text.
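
    For illustration (hypothetical values)::

        resolve_alternatives_simple("I see a ~@/pink/white/ elephant.", 1, 2)
        # -> "I see a pink elephant."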

    @returns: the resulting text
    @rtype: string

    @see: L{resolve_alternatives}
    """

    res = resolve_alternatives(text, select, total, althead,
                               altfilter, outfilter, condf,
                               srcname)
    ntext, d1, valid = res
    if not valid:
        return text
    return ntext


def first_to_case (text, upper=True, nalts=0, althead=DEFAULT_ALTHEAD):
    """
    Change the case of the first letter in the text.

    The text may also contain alternatives directives
    (see L{resolve_alternatives}). In that case, if the first letter is
    found within an alternative, the case of the first letter in the other
    alternatives of the same directive is changed as well.

    If lowercasing is requested, it is not done if both the first and
    the second letter are uppercase (e.g. acronyms, all-caps writing).
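
    For illustration, hypothetical calls::

        first_to_case("hello world")          # -> "Hello world"
        first_to_case("HELLO", upper=False)   # -> "HELLO" (acronym kept)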

    @param text: the text to transform
    @type text: string
    @param upper: whether to transform to uppercase (lowercase otherwise)
    @type upper: bool
    @param nalts: if non-zero, the number of alternatives per directive
    @type nalts: int
    @param althead: alternatives directive head instead of the default one
    @type althead: string

    @returns: the resulting text
    @rtype: string

    @see: L{resolve_alternatives}
    """

    alt_head = althead
    alt_hlen = len(althead)

    tlen = len(text)
    remalts = 0
    checkcase = True
    intag = False
    ncchanged = 0
    textcc = ""
    i0 = 0
    i = 0
    while i < tlen:
        i0 = i
        c = text[i]
        cchange = False

        if c == "<":
            # A markup tag is just starting.
            intag = True

        elif c == ">":
            # A markup tag is just ending.
            intag = False

        elif (    not intag
              and nalts and not remalts and text[i:i+alt_hlen] == alt_head):
            # An alternatives directive is just starting.
            i += alt_hlen
            if i >= tlen: # malformed directive, bail out
                textcc = text
                break
            # Record alternatives separator, set number of remaining
            # alternatives, reactivate case checking.
            altsep = text[i]
            remalts = nalts
            checkcase = True

        elif not intag and remalts and c == altsep:
            # Alternative separator found, reduce number of remaining
            # alternatives and reactivate case checking.
            remalts -= 1
            checkcase = True

        elif not intag and checkcase and c.isalpha():
            # Case check is active and the character is a letter;
            # request case change.
            cchange = True
            # No more case checks until next alternatives separator.
            checkcase = False

        # Go to next character.
        i += 1

        # Check if previous segment should be added with case change, or as is.
        cseg = text[i0:i]
        if cchange:
            ncchanged += 1
            if upper:
                textcc += cseg.upper()
            else:
                # Find first next letter, for two-uppercase check.
                i1 = i
                while i1 < tlen and not text[i1].isalpha():
                    i1 += 1
                if i1 == tlen or not cseg.isupper() or not text[i1].isupper():
                    textcc += cseg.lower()
                else:
                    textcc += cseg
        else:
            textcc += cseg

        # If any letter has changed case and there are no more alternatives
        # to be processed, we're done.
        if ncchanged > 0 and remalts == 0:
            textcc += text[i:]
            break

    return textcc


def first_to_upper (text, nalts=0, althead=DEFAULT_ALTHEAD):
    """
    Uppercase the first letter in the text.

    A shortcut for L{first_to_case} for uppercasing.

    @see: L{first_to_case}
    """

    return first_to_case(text, upper=True, nalts=nalts, althead=althead)


def first_to_lower (text, nalts=0, althead=DEFAULT_ALTHEAD):
    """
    Lowercase the first letter in the text.

    A shortcut for L{first_to_case} for lowercasing.

    @see: L{first_to_case}
    """

    return first_to_case(text, upper=False, nalts=nalts, althead=althead)


def expand_vars (text, varmap, head="%"):
    """
    Expand variables in the text.

    Expansion directives start with a directive head (the C{head} parameter),
    followed by a variable name consisting of alphanumeric characters and
    underscores, and terminated by any other character.
    The variable name may also be explicitly delimited within braces.
    Variable values for substitution are looked up by name in
    the C{varmap} dictionary; if a variable is not found, C{PologyError}
    is raised.

    Some examples::

        expand_vars("Mary had a little %mammal.", {"mammal":"lamb"})
        expand_vars("Quite a %{critic}esque play.", {"critic":"burl"})
        expand_vars("Lost in single ~A.", {"A":"parenthesis"}, "~")

    Dictionary values are filtered as C{"%s" % value} prior to substitution.
    The directive head may be escaped by repeating it twice in a row.
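
    For example (a hypothetical call, showing the escaping rule above)::

        expand_vars("Mary is 100%% sure about %this.", {"this":"that"})
        # -> "Mary is 100% sure about that."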

    @param text: string to expand
    @type text: string

    @param varmap: mapping of variable names to values
    @type varmap: (name, value) dictionary

    @param head: opening sequence for expansion directive
    @type head: string
    """

    p = 0
    hlen = len(head)
    tlen = len(text)
    ntext = []
    while p < tlen:
        pp = p
        p = text.find(head, pp)
        if p < 0:
            ntext.append(text[pp:])
            break
        ntext.append(text[pp:p])
        p += hlen
        if p < tlen and text[p:p+hlen] == head: # escaped
            ntext.append(head)
            p += hlen
            continue
        if p == tlen:
            raise PologyError(
                _("@info",
                  "Empty variable expansion directive "
                  "at column %(col)d in string '%(str)s'.",
                  col=(p - hlen), str=text))
        braced = False
        if text[p] == "{":
            braced = True
            p += 1
        pp = p
        while p < tlen:
            c = text[p]
            if (   (not braced and not (c.isalnum() or c == "_"))
                or (braced and c == "}")
            ):
                break
            p += 1
        if braced and p == tlen:
            raise PologyError(
                _("@info",
                  "Unclosed variable expansion directive "
                  "at column %(col)d in string '%(str)s'.",
                  col=(pp - 1 - hlen), str=text))
        varname = text[pp:p]
        if braced:
            p += 1

        varvalue = varmap.get(varname)
        if varvalue is None:
            raise PologyError(
                _("@info",
                  "Unknown variable '%(var)s' in variable expansion directive "
                  "at column %(col)d in string '%(str)s'.",
                  var=varname, col=pp, str=text))
        ntext.append("%s" % varvalue)

    return type(text)("").join(ntext)


_usual_accels = list("_&~^")

def remove_accelerator (text, accels=None, greedy=False):
    """
    Remove the accelerator marker from the text.

    Accelerator markers are characters which determine which letter in
    the text will be used as the keyboard accelerator in the user interface.
    They are usually a single non-alphanumeric character,
    inserted before the letter which should be the accelerator,
    e.g. C{"Foo &Bar"}, C{"Foo _Bar"}, etc.
    Sometimes, especially in CJK texts, the accelerator letter is separated
    out in parentheses, at the start or end of the text,
    such as C{"Foo Bar (&B)"}.

    This function will try to remove the accelerator in a smart way.
    E.g. it will ignore the ampersand in C{"Foo & Bar"}, and completely
    remove a CJK-style accelerator.

    If C{accels} is C{None}, the behavior depends on the value of C{greedy}.
    If it is C{False}, the text is returned as is. If it is C{True}, some
    usual accelerator markers are considered: C{_}, C{&}, C{~}, and C{^}.
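
    A few illustrative calls (hypothetical strings)::

        remove_accelerator("Foo &Bar", ["&"])     # -> "Foo Bar"
        remove_accelerator("Foo Bar (&B)", ["&"]) # -> "Foo Bar"
        remove_accelerator("Foo & Bar", ["&"])    # -> "Foo & Bar" (kept)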

    @param text: text to clear of the accelerator
    @type text: string
    @param accels: possible accelerator markers
    @type accels: sequence of strings or C{None}
    @param greedy: whether to try known markers if C{accels} is C{None}
    @type greedy: bool

    @returns: text without the accelerator
    @rtype: string
    """

    if accels is None:
        if not greedy:
            return text
        else:
            accels = _usual_accels

    for accel in accels:
        alen = len(accel)
        p = 0
        while True:
            p = text.find(accel, p)
            if p < 0:
                break

            if text[p + alen:p + alen + 1].isalnum():
                # If the accelerator marker is &, do not remove it if it
                # looks like an XML entity (less damage than otherwise).
                if accel == "&":
                    m = _entity_ref_rx.match(text, p)
                    if m:
                        p = m.span()[1]
                        continue

                # Valid accelerator.
                text = text[:p] + text[p + alen:]

                # May have been an accelerator in style of
                # "(<marker><alnum>)" at the start or end of text.
                if (text[p - 1:p] == "(" and text[p + 1:p + 2] == ")"):
                    # Check if at start or end, ignoring non-alphanumerics.
                    tlen = len(text)
                    p1 = p - 2
                    while p1 >= 0 and not text[p1].isalnum():
                        p1 -= 1
                    p1 += 1
                    p2 = p + 2
                    while p2 < tlen and not text[p2].isalnum():
                        p2 += 1
                    p2 -= 1
                    if p1 == 0:
                        text = text[:p - 1].lstrip() + text[p2 + 1:]
                    elif p2 + 1 == tlen:
                        text = text[:p1] + text[p + 2:].rstrip()

                # Do not break; remove all accelerator markers,
                # as it is indeterminate which one is the real one.

            if text[p + alen:p + 2 * alen] == accel:
                # Escaped accelerator marker.
                text = text[:p] + text[p + alen:]

            p += alen

    return text


def remove_fmtdirs (text, format, subs=""):
    """
    Remove format directives from the text.

    Format directives are used to substitute values in the text.
    An example text with directives in several formats::

        "%d men on a %s man's chest."  # C
        "%(num)d men on a %(attrib)s man's chest."  # Python
        "%1 men on a %2 man's chest." # KDE/Qt

    Format is specified by a string keyword. The following formats are
    known at the moment: C{c}, C{qt}, C{kde}, C{python}.
    The format keyword may also have C{-format} appended to it, for
    compatibility with Gettext format flags.
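
    For illustration, hypothetical calls and their expected results::

        remove_fmtdirs("%d men on a %s man's chest.", "c")
        # -> " men on a  man's chest."
        remove_fmtdirs("%1 men on a %2 man's chest.", "kde", "NUM")
        # -> "NUM men on a NUM man's chest."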

    @param text: text from which to remove format directives
    @type text: string
    @param format: format keyword
    @type format: string
    @param subs: text to substitute for format directives,
        instead of just removing them
    @type subs: string

    @returns: text without format directives
    @rtype: string
    """

    format = format.lower()
    if format.endswith("-format"):
        format = format[:format.rfind("-")]

    if format == "c":
        text = _remove_fmtdirs_c(text, subs)
    elif format in ("kde", "qt"):
        # FIXME: Actually, there are some differences between the two.
        text = _remove_fmtdirs_qt(text, subs)
    elif format == "python":
        text = _remove_fmtdirs_python(text, subs) # must be first
        text = _remove_fmtdirs_c(text, subs)

    return text


#_fmtdir_tail_c = r"[ +-]?\d*\.?\d*[a-z]"
# A conversion specifier begins with the % character. After the % character
# come the following, in this order:
# [flags]        Control the conversion (optional).
# [width]        Defines the number of characters to print (optional).
# [.precision]   Defines the amount of precision to print for a number type
#                (optional).
# [modifier]     Overrides the size (type) of the argument (optional).
# [type]         The type of conversion to be applied (required).
# from http://www.acm.uiuc.edu/webmonkeys/book/c_guide/2.12.html#printf

_fmtdir_tail_c = r"[-+ 0]?(\d+|\*)?(\.(\d+|\*))?[hlL]?[cdieEfgGosuxXpn%]"
_fmtdir_tail_c_rx = re.compile(_fmtdir_tail_c)

def _remove_fmtdirs_c (text, subs=""):

    p = 0
    nsegs = []
    while True:
        pp = p
        p = text.find("%", p)
        if p < 0:
            nsegs.append(text[pp:])
            break
        nsegs.append(text[pp:p])
        p += 1
        if text[p:p+1] == "%":
            nsegs.append("%")
            p += 1
            continue
        m = _fmtdir_tail_c_rx.match(text, p)
        if m:
            p = m.span()[1]
            if subs:
                nsegs.append(subs)

    return type(text)("").join(nsegs)


_fmtdir_tail_python_rx = re.compile(r"(\(.*?\))?" + _fmtdir_tail_c)

def _remove_fmtdirs_python (text, subs=""):

    p = 0
    nsegs = []
    while True:
        pp = p
        p = text.find("%", p)
        if p < 0:
            nsegs.append(text[pp:])
            break
        nsegs.append(text[pp:p])
        p += 1
        if text[p:p+1] == "%":
            nsegs.append("%")
            p += 1
            continue
        m = _fmtdir_tail_python_rx.match(text, p)
        if m:
            p = m.span()[1]
            if subs:
                nsegs.append(subs)

    return type(text)("").join(nsegs)


_fmtdir_tail_qt_rx = re.compile(r"L?\d{1,2}")

def _remove_fmtdirs_qt (text, subs=""):

    p = 0
    nsegs = []
    while True:
        pp = p
        p = text.find("%", p)
        if p < 0:
            nsegs.append(text[pp:])
            break
        nsegs.append(text[pp:p])
        p += 1
        m = _fmtdir_tail_qt_rx.match(text, p)
        if m:
            p = m.span()[1]
            if subs:
                nsegs.append(subs)
        else:
            nsegs.append("%")

    return type(text)("").join(nsegs)


def remove_literals (text, subs="", substrs=[], regexes=[], heuristic=True):
    """
    Remove literal substrings from the text.

    Literal substrings are URLs, email addresses, web site names,
    command options, etc. This function will heuristically try to
    remove such substrings from the text.

    Additional literals to remove may be specified as verbatim substrings
    (C{substrs} parameter) and regular expressions (C{regexes}).
    These are applied before the internal heuristic matchers.
    Heuristic removal may be entirely disabled by setting C{heuristic}
    to C{False}.
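
    For illustration, a hypothetical call relying on the built-in heuristics::

        remove_literals("Visit http://kde.org or run foo --help.")
        # -> "Visit  or run foo ."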

    @param text: text from which to remove literals
    @type text: string
    @param subs: text to replace literals instead of just removing them
    @type subs: string
    @param substrs: additional substrings to remove by direct string match
    @type substrs: sequence of strings
    @param regexes: additional substrings to remove by regex match
    @type regexes: sequence of compiled regular expressions
    @param heuristic: whether to apply heuristic at all
    @type heuristic: bool

    @returns: text without literals
    @rtype: string
    """

    # Apply explicit literals before heuristics.
    for substr in substrs:
        text = text.replace(substr, subs)
    for regex in regexes:
        text = regex.sub(subs, text)

    if heuristic:
        text = _remove_literals_url(text, subs)
        text = _remove_literals_email(text, subs)
        text = _remove_literals_web(text, subs) # after URLs and email
        text = _remove_literals_cmd(text, subs)
        text = _remove_literals_file(text, subs)

    return text


def _remove_by_rx (text, rx, subs=""):

    p = 0
    nsegs = []
    while True:
        m = rx.search(text, p)
        if not m:
            nsegs.append(text[p:])
            break
        p1, p2 = m.span()
        nsegs.append(text[p:p1])
        if subs:
            nsegs.append(subs)
        p = p2

    return type(text)("").join(nsegs)


_literal_url_rx = re.compile(r"\S+://\S*[\w\d&=]", re.U)

def _remove_literals_url (text, subs=""):

    return _remove_by_rx(text, _literal_url_rx, subs)


_literal_web_rx = re.compile(r"\w[\w-]{2,}(\.[\w-]{2,})+", re.U)

def _remove_literals_web (text, subs=""):

    return _remove_by_rx(text, _literal_web_rx, subs)


_literal_email_rx = re.compile(r"\w[\w.-]*@\w+\.[\w.-]*\w")

def _remove_literals_email (text, subs=""):

    return _remove_by_rx(text, _literal_email_rx, subs)


_literal_cmd_rx = re.compile(r"[a-z\d_-]+\(\d\)", re.I)
_literal_cmdopt_rx = re.compile(r"(?<!\S)-[a-z\d]+", re.I)
_literal_cmdoptlong_rx = re.compile(r"(?<!\S)--[a-z\d-]+", re.I)

def _remove_literals_cmd (text, subs=""):

    text = _remove_by_rx(text, _literal_cmd_rx, subs)
    text = _remove_by_rx(text, _literal_cmdopt_rx, subs)
    text = _remove_by_rx(text, _literal_cmdoptlong_rx, subs)
    return text


_literal_filehome_rx = re.compile(r"~(/[\w.-]+)+/?", re.I|re.U)
_literal_fileext_rx = re.compile(r"\*(\.[a-z\d]+){1,2}", re.I)

def _remove_literals_file (text, subs=""):

    text = _remove_by_rx(text, _literal_filehome_rx, subs)
    text = _remove_by_rx(text, _literal_fileext_rx, subs)
    return text


def convert_plurals (mapping, plhead):
    """
    Convert plural forms in the catalog [hook factory].
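
    For example, a hypothetical conversion from a three-form to a two-form
    plural setup, keeping source forms 0 and 2 (the header value here is
    only an example)::

        hook = convert_plurals([(0, 0), (2, 1)],
                               "nplurals=2; plural=(n != 1);")
        # hook(cat) reorders msgstr entries and sets the Plural-Forms header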

    @param mapping: The source to destination mapping of form indices.
        This is a list of tuples of source (before modification)
        to destination (after modification) indices.
        There must be no gaps in the destination indices,
        i.e. all indices from 0 up to the maximum given destination index
        must exist in the mapping.
    @type mapping: [(int, int)*]

    @param plhead: The plural header value.
    @type plhead: string

    @return: type F5A hook
    @rtype: C{(cat) -> numerr}
    """

    dst_inds = list(map(set, list(zip(*mapping))))[1]
    num_plurals = max(dst_inds) + 1
    if sorted(dst_inds) != list(range(num_plurals)):
        raise PologyError(
            _("@info",
              "Gaps in destination indices for conversion of plural forms "
              "(expected (%(list1)s), got (%(list2)s)).",
              list1=format_item_list(list(range(num_plurals))),
              list2=format_item_list(sorted(dst_inds))))

    ord_src_inds = list(zip(*sorted(mapping, key=lambda x: x[1])))[0]
    def hook (cat):
        cat.header.set_field("Plural-Forms", str(plhead),
                             after="Content-Transfer-Encoding")
        for msg in cat:
            if msg.msgid_plural is not None:
                msg.msgstr[:] = [msg.msgstr[i] for i in ord_src_inds]

        return 0

    return hook