# -*- coding: UTF-8 -*-

"""
Replace value-defining segments in text with their values.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import difflib
import os
import re

from pology import PologyError, _, n_
from pology.report import warning, format_item_list


# Default starting string of alternatives directives.
DEFAULT_ALTHEAD = "~@"


_entity_ref_rx = re.compile(r"&([\w:][\w\d.:-]*);", re.U)

def resolve_entities (text, entities, ignored=set(), srcname=None,
                      vfilter=None, undefrepl=None):
    """
    Replace XML entities in the text with their values.

    Entity values are defined by the supplied dictionary of name-value pairs.
    Not all entities need to be replaced, some can be explicitly ignored.
    If an entity is neither defined nor ignored, a warning will be reported
    to standard output if C{srcname} is given.

    An undefined entity is by default left untouched in the resulting text.
    Instead, the parameter C{undefrepl} can be used to supply a string to
    substitute for every undefined entity, or a function which takes
    the undefined entity name and returns the string to substitute.

    @param text: the text to transform
    @type text: string
    @param entities: entity name-value pairs
    @type entities: has .get() with dict.get() semantics
    @param ignored: entities to ignore; a sequence of entity names,
        or a function taking the entity name and returning C{True} if ignored
    @type ignored: a sequence or (string)->bool
    @param srcname: if not None, report unknown entities to standard output,
        with this parameter as source identifier
    @type srcname: None or string
    @param vfilter: format string (with single C{%s} directive) or function
        to apply to every resolved entity value
    @type vfilter: string or (string)->string
    @param undefrepl: string or function to use in case of undefined entity
    @type undefrepl: string or (string)->string

    @returns: the resulting text, resolved entity names,
        and unknown entity names
    @rtype: (string, [string...], [string...])
    """

    ignoredf = ignored if callable(ignored) else lambda x: x in ignored

    unknown = []
    resolved = []
    segs = []
    p = 0
    while True:
        pp = p
        p = text.find("&", p)
        if p < 0:
            segs.append(text[pp:])
            break

        segs.append(text[pp:p])
        m = _entity_ref_rx.match(text, p)
        if m:
            entref = m.group(0)
            entname = m.group(1)
            if not ignoredf(entname):
                entval = entities.get(entname)
                entvalr = entval
                if entval is not None:
                    resolved.append(entname)
                else:
                    unknown.append(entname)
                    if undefrepl is not None:
                        if isinstance(undefrepl, str):
                            entvalr = undefrepl
                        else:
                            entvalr = undefrepl(entname)

                if entvalr is not None:
                    if vfilter is not None:
                        if isinstance(vfilter, str):
                            entvalr = vfilter % entvalr
                        else:
                            entvalr = vfilter(entvalr)
                    # Recurse in case entity resolves into new entities.
                    res = resolve_entities(entvalr, entities, ignoredf,
                                           srcname, vfilter, undefrepl)
                    entvalr, resolved_extra, unknown_extra = res
                    resolved.extend(resolved_extra)
                    unknown.extend(unknown_extra)
                    segs.append(entvalr)
                else:
                    segs.append(entref)

                if entval is None and srcname is not None:
                    # Try to suggest some near matches.
                    #nears = difflib.get_close_matches(entname, entities)
                    # FIXME: Too slow for a lot of entities.
                    nears = []
                    if nears:
                        warning(_("@info",
                                  "%(file)s: Unknown entity '%(ent)s' "
                                  "(near matches: %(entlist)s).",
                                  file=srcname, ent=entname,
                                  entlist=format_item_list(nears)))
                    else:
                        warning(_("@info",
                                  "%(file)s: Unknown entity '%(ent)s'.",
                                  file=srcname, ent=entname))
            else:
                segs.append(entref)

            p += len(entref)
        else:
            segs.append("&")
            p += 1

    new_text = type(text)("").join(segs)

    return new_text, resolved, unknown
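

# Usage sketch for resolve_entities() (illustrative only; the entity
# names and values below are hypothetical):
#
#   >>> entities = {"myapp": "MyApp", "version": "1.0"}
#   >>> resolve_entities("Welcome to &myapp; &version; (&unknownent;).",
#   ...                  entities)
#   ('Welcome to MyApp 1.0 (&unknownent;).', ['myapp', 'version'], ['unknownent'])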


def resolve_entities_simple (text, entities, ignored=set(),
                             srcname=None, vfilter=None):
    """
    As L{resolve_entities}, but returns only the resolved text.

    @returns: the resulting text
    @rtype: string

    @see: L{resolve_entities}
    """

    return resolve_entities(text, entities, ignored,
                            srcname=srcname, vfilter=vfilter)[0]


def resolve_alternatives (text, select, total, althead=DEFAULT_ALTHEAD,
                          altfilter=None, outfilter=None, condf=None,
                          srcname=None):
    """
    Replace alternatives directives in the text with the selected alternative.

    Alternatives directives are of the form C{~@/.../.../...}, for example::

        I see a ~@/pink/white/ elephant.

    where C{~@} is the directive head, followed by a character that
    defines the delimiter of alternatives (as in a C{sed} command).
    The number of alternatives per directive is not defined by the directive
    itself, but is provided as an external parameter.

    An alternatives directive is resolved into one of its alternative
    substrings by the given index of the alternative (one-based).
    Before substituting the directive, the selected alternative can be
    filtered through the function given by the C{altfilter} parameter.
    Text outside of directives can be filtered as well, piece by piece,
    through the function given by the C{outfilter} parameter.

    If an alternatives directive is malformed (e.g. too few alternatives),
    it may be reported to standard output. Unless all encountered directives
    were well-formed, the original text is returned instead of the partially
    resolved one.

    @param text: the text to transform
    @type text: string
    @param select: index of the alternative to select (one-based)
    @type select: int > 0
    @param total: number of alternatives per directive
    @type total: int > 0
    @param althead: directive head to use instead of the default one
    @type althead: string
    @param altfilter: filter to apply to chosen alternatives
    @type altfilter: (string) -> string
    @param outfilter: filter to apply to text outside of directives
    @type outfilter: (string) -> string
    @param condf:
        resolve current alternative directive only when this function
        returns C{True} on call with each alternative as argument
    @type condf: None or C{(x_1, ..., x_n) -> True/False}
    @param srcname:
        if not None, report malformed directives to standard output,
        with this string as source identifier
    @type srcname: None or string
    @returns:
        resulting text, number of resolved alternatives, and an indicator
        of well-formedness (C{True} if all directives well-formed)
    @rtype:
        string, int, bool
    """

    alt_head = althead
    alt_hlen = len(althead)

    if outfilter is None:
        outfilter = lambda x: x
    if altfilter is None:
        altfilter = lambda x: x

    original_text = text
    new_text = ""
    nresolved = 0
    malformed = False
    p = -1
    while True:
        pp = p + 1
        p = text.find(alt_head, pp)
        if p < 0:
            new_text += outfilter(text[pp:])
            break
        ps = p

        # Append segment prior to alternatives directive to the result.
        new_text += outfilter(text[pp:p])
        rep_text = text[p:] # text segment for error reporting

        # Must have at least 2 characters after the head.
        if len(text) < p + alt_hlen + 2:
            malformed = True
            if srcname is not None:
                warning(_("@info",
                          "%(file)s: Malformed alternatives directive "
                          "'...%(snippet)s'.",
                          file=srcname, snippet=rep_text))
            break

        # Read the separating character.
        p += alt_hlen
        sep = text[p]

        # Parse requested number of inserts,
        # choose the one with matching index for the result.
        alts = []
        for i in range(total):
            pp = p + 1
            p = text.find(sep, pp)
            # Must have exactly the given total number of alternatives.
            if p < 0:
                malformed = True
                if srcname is not None:
                    warning(_("@info",
                              "%(file)s: Too few alternatives in "
                              "the alternatives directive '...%(snippet)s'.",
                              file=srcname, snippet=rep_text))
                break
            alts.append(text[pp:p])
        if malformed:
            break

        # Replace the alternative if admissible, or leave directive untouched.
        isel = select - 1
        if isel < len(alts) and (not condf or condf(*alts)):
            new_text += altfilter(alts[isel])
            nresolved += 1
        else:
            new_text += text[ps:p+1]

    if malformed:
        new_text = original_text
        nresolved = 0

    return new_text, nresolved, not malformed
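

# Usage sketch for resolve_alternatives() (illustrative only, reusing the
# directive from the docstring above):
#
#   >>> resolve_alternatives("I see a ~@/pink/white/ elephant.", 2, 2)
#   ('I see a white elephant.', 1, True)
#   >>> resolve_alternatives("I see a ~@/pink/white/ elephant.", 1, 2)
#   ('I see a pink elephant.', 1, True)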


def resolve_alternatives_simple (text, select, total, althead=DEFAULT_ALTHEAD,
                                 altfilter=None, outfilter=None, condf=None,
                                 srcname=None):
    """
    As L{resolve_alternatives}, but returns only the resolved text.

    @returns: the resulting text
    @rtype: string

    @see: L{resolve_alternatives}
    """

    res = resolve_alternatives(text, select, total, althead,
                               altfilter, outfilter, condf,
                               srcname)
    ntext, d1, valid = res
    if not valid:
        return text
    return ntext


def first_to_case (text, upper=True, nalts=0, althead=DEFAULT_ALTHEAD):
    """
    Change case of the first letter in the text.

    Text may also have alternatives directives (see L{resolve_alternatives}).
    In that case, if the first letter is found within an alternative, change
    case of the first letters in the other alternatives of the same
    directive too.

    If lowercasing is requested, it is not done if both the first and
    the second letter are uppercase (e.g. acronyms, all-caps writing).

    @param text: the text to transform
    @type text: string
    @param upper: whether to transform to uppercase (lowercase otherwise)
    @type upper: bool
    @param nalts: if non-zero, the number of alternatives per directive
    @type nalts: int
    @param althead: alternatives directive head instead of the default one
    @type althead: string

    @returns: the resulting text
    @rtype: string

    @see: L{resolve_alternatives}
    """

    alt_head = althead
    alt_hlen = len(althead)

    tlen = len(text)
    remalts = 0
    checkcase = True
    intag = False
    ncchanged = 0
    textcc = ""
    i0 = 0
    i = 0
    while i < tlen:
        i0 = i
        c = text[i]
        cchange = False

        if c == "<":
            # A markup tag is just starting.
            intag = True

        elif c == ">":
            # A markup tag is just ending.
            intag = False

        elif (    not intag
              and nalts and not remalts and text[i:i+alt_hlen] == alt_head):
            # An alternatives directive is just starting.
            i += 2
            if i >= tlen: # malformed directive, bail out
                textcc = text
                break
            # Record alternatives separator, set number of remaining
            # alternatives, reactivate case checking.
            altsep = text[i]
            remalts = nalts
            checkcase = True

        elif not intag and remalts and c == altsep:
            # Alternative separator found, reduce number of remaining
            # alternatives and reactivate case checking.
            remalts -= 1
            checkcase = True

        elif not intag and checkcase and c.isalpha():
            # Case check is active and the character is a letter;
            # request case change.
            cchange = True
            # No more case checks until next alternatives separator.
            checkcase = False

        # Go to next character.
        i += 1

        # Check if previous segment should be added with case change, or as is.
        cseg = text[i0:i]
        if cchange:
            ncchanged += 1
            if upper:
                textcc += cseg.upper()
            else:
                # Find first next letter, for two-uppercase check.
                i1 = i
                while i1 < tlen and not text[i1].isalpha():
                    i1 += 1
                if i1 == tlen or not cseg.isupper() or not text[i1].isupper():
                    textcc += cseg.lower()
                else:
                    textcc += cseg
        else:
            textcc += cseg

        # If any letter has been case-changed and there are no more
        # alternatives to be processed, we're done.
        if ncchanged > 0 and remalts == 0:
            textcc += text[i:]
            break

    return textcc
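

# Usage sketch for first_to_case() (illustrative only). The last call is
# left unchanged because of the two-uppercase (acronym) check on lowercasing.
#
#   >>> first_to_case("hello world", upper=True)
#   'Hello world'
#   >>> first_to_case("~@/pink/white/ elephant", upper=True, nalts=2)
#   '~@/Pink/White/ elephant'
#   >>> first_to_case("DBus connection", upper=False)
#   'DBus connection'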


def first_to_upper (text, nalts=0, althead=DEFAULT_ALTHEAD):
    """
    Uppercase the first letter in the text.

    A shortcut for L{first_to_case} for uppercasing.

    @see: L{first_to_case}
    """

    return first_to_case(text, upper=True, nalts=nalts, althead=althead)


def first_to_lower (text, nalts=0, althead=DEFAULT_ALTHEAD):
    """
    Lowercase the first letter in the text.

    A shortcut for L{first_to_case} for lowercasing.

    @see: L{first_to_case}
    """

    return first_to_case(text, upper=False, nalts=nalts, althead=althead)


def expand_vars (text, varmap, head="%"):
    """
    Expand variables in the text.

    Expansion directives start with a directive head (C{head} parameter),
    followed by a variable name consisting of alphanumeric characters and
    underscores, and terminated by any other character.
    The variable name may also be explicitly delimited within braces.
    Variable values for substitution are looked up by name in
    the C{varmap} dictionary; if not found, C{PologyError} is raised.

    Some examples::

        expand_vars("Mary had a little %mammal.", {"mammal":"lamb"})
        expand_vars("Quite a %{critic}esque play.", {"critic":"burl"})
        expand_vars("Lost in single ~A.", {"A":"parenthesis"}, "~")

    Dictionary values are filtered as C{"%s" % value} prior to substitution.
    Directive head may be escaped by repeating it twice in a row.

    @param text: string to expand
    @type text: string

    @param varmap: mapping of variable names to values
    @type varmap: (name, value) dictionary

    @param head: opening sequence for expansion directive
    @type head: string
    """

    p = 0
    hlen = len(head)
    tlen = len(text)
    ntext = []
    while p < tlen:
        pp = p
        p = text.find(head, pp)
        if p < 0:
            ntext.append(text[pp:])
            break
        ntext.append(text[pp:p])
        p += hlen
        if p < tlen and text[p:p+hlen] == head: # escaped
            ntext.append(head)
            p += hlen
            continue
        if p == tlen:
            raise PologyError(
                _("@info",
                  "Empty variable expansion directive "
                  "at column %(col)d in string '%(str)s'.",
                  col=(p - hlen), str=text))
        braced = False
        if text[p] == "{":
            braced = True
            p += 1
        pp = p
        while p < tlen:
            c = text[p]
            if (   (not braced and not (c.isalnum() or c == "_"))
                or (braced and c == "}")
            ):
                break
            p += 1
        if braced and p == tlen:
            raise PologyError(
                _("@info",
                  "Unclosed variable expansion directive "
                  "at column %(col)d in string '%(str)s'.",
                  col=(pp - 1 - hlen), str=text))
        varname = text[pp:p]
        if braced:
            p += 1

        varvalue = varmap.get(varname)
        if varvalue is None:
            raise PologyError(
                _("@info",
                  "Unknown variable '%(var)s' in variable expansion directive "
                  "at column %(col)d in string '%(str)s'.",
                  var=varname, col=pp, str=text))
        ntext.append("%s" % varvalue)

    return type(text)("").join(ntext)
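

# Usage sketch for expand_vars(), with expected results for the docstring
# examples and for head escaping (illustrative only):
#
#   >>> expand_vars("Mary had a little %mammal.", {"mammal": "lamb"})
#   'Mary had a little lamb.'
#   >>> expand_vars("Quite a %{critic}esque play.", {"critic": "burl"})
#   'Quite a burlesque play.'
#   >>> expand_vars("Done 100%%.", {})
#   'Done 100%.'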


_usual_accels = list("_&~^")

def remove_accelerator (text, accels=None, greedy=False):
    """
    Remove accelerator from the text.

    Accelerator markers are characters which determine which letter in
    the text will be used as keyboard accelerator in user interface.
    They are usually a single non-alphanumeric character,
    and inserted before the letter which should be the accelerator,
    e.g. C{"Foo &Bar"}, C{"Foo _Bar"}, etc.
    Sometimes, especially in CJK texts, the accelerator letter is separated
    out in parentheses, at the start or end of the text,
    such as C{"Foo Bar (&B)"}.

    This function will try to remove the accelerator in a smart way.
    E.g. it will ignore the ampersand in C{"Foo & Bar"}, and completely
    remove a CJK-style accelerator.

    If C{accels} is C{None}, the behavior depends on the value of C{greedy}.
    If it is C{False}, the text is returned unchanged. If it is C{True},
    some usual accelerator markers are considered: C{_}, C{&}, C{~}, and C{^}.

    @param text: text to clear of the accelerator
    @type text: string
    @param accels: possible accelerator markers
    @type accels: sequence of strings or C{None}
    @param greedy: whether to try known markers if C{accels} is C{None}
    @type greedy: bool

    @returns: text without the accelerator
    @rtype: string
    """

    if accels is None:
        if not greedy:
            return text
        else:
            accels = _usual_accels

    for accel in accels:
        alen = len(accel)
        p = 0
        while True:
            p = text.find(accel, p)
            if p < 0:
                break

            if text[p + alen:p + alen + 1].isalnum():
                # If the accelerator marker is &, do not remove it if it
                # looks like an XML entity (less damage than otherwise).
                if accel == "&":
                    m = _entity_ref_rx.match(text, p)
                    if m:
                        p = m.span()[1]
                        continue

                # Valid accelerator.
                text = text[:p] + text[p + alen:]

                # May have been an accelerator in style of
                # "(<marker><alnum>)" at the start or end of text.
                if (text[p - 1:p] == "(" and text[p + 1:p + 2] == ")"):
                    # Check if at start or end, ignoring non-alphanumerics.
                    tlen = len(text)
                    p1 = p - 2
                    while p1 >= 0 and not text[p1].isalnum():
                        p1 -= 1
                    p1 += 1
                    p2 = p + 2
                    while p2 < tlen and not text[p2].isalnum():
                        p2 += 1
                    p2 -= 1
                    if p1 == 0:
                        text = text[:p - 1].lstrip() + text[p2 + 1:]
                    elif p2 + 1 == tlen:
                        text = text[:p1] + text[p + 2:].rstrip()

                # Do not break, remove all accelerator markers,
                # as it is indeterminate which one is the real one.

            if text[p + alen:p + 2 * alen] == accel:
                # Escaped accelerator marker.
                text = text[:p] + text[p + alen:]

            p += alen

    return text
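

# Usage sketch for remove_accelerator() (illustrative only):
#
#   >>> remove_accelerator("Foo &Bar", ["&"])
#   'Foo Bar'
#   >>> remove_accelerator("Foo Bar (&B)", ["&"])     # CJK-style accelerator
#   'Foo Bar'
#   >>> remove_accelerator("Press &amp; hold", ["&"])  # looks like an entity
#   'Press &amp; hold'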


def remove_fmtdirs (text, format, subs=""):
    """
    Remove format directives from the text.

    Format directives are used to substitute values in the text.
    An example text with directives in several formats::

        "%d men on a %s man's chest." # C
        "%(num)d men on a %(attrib)s man's chest." # Python
        "%1 men on a %2 man's chest." # KDE/Qt

    Format is specified by a string keyword. The following formats are
    known at the moment: C{c}, C{qt}, C{kde}, C{python}.
    The format keyword may also have C{-format} appended, for
    compatibility with Gettext format flags.

    @param text: text from which to remove format directives
    @type text: string
    @param format: format keyword
    @type format: string
    @param subs: text to substitute for format directives instead of
        just removing them
    @type subs: string

    @returns: text without format directives
    @rtype: string
    """

    format = format.lower()
    if format.endswith("-format"):
        format = format[:format.rfind("-")]

    if 0: pass
    elif format == "c":
        text = _remove_fmtdirs_c(text, subs)
    elif format in ("kde", "qt"):
        # FIXME: Actually, there are some differences between the two.
        text = _remove_fmtdirs_qt(text, subs)
    elif format == "python":
        text = _remove_fmtdirs_python(text, subs) # must be first
        text = _remove_fmtdirs_c(text, subs)

    return text
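

# Usage sketch for remove_fmtdirs() (illustrative only), using the example
# texts from the docstring above:
#
#   >>> remove_fmtdirs("%d men on a %s man's chest.", "c", "_")
#   "_ men on a _ man's chest."
#   >>> remove_fmtdirs("%(num)d men on a %(attrib)s man's chest.",
#   ...                "python-format", "_")
#   "_ men on a _ man's chest."
#   >>> remove_fmtdirs("%1 men on a %2 man's chest.", "kde", "_")
#   "_ men on a _ man's chest."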


#_fmtdir_tail_c = r"[ +-]?\d*\.?\d*[a-z]"
# A conversion specifier begins with the % character. After the % character
# come the following, in this order:
# [flags]      Control the conversion (optional).
# [width]      Defines the number of characters to print (optional).
# [.precision] Defines the amount of precision to print for a number type
#              (optional).
# [modifier]   Overrides the size (type) of the argument (optional).
# [type]       The type of conversion to be applied (required).
# From http://www.acm.uiuc.edu/webmonkeys/book/c_guide/2.12.html#printf

_fmtdir_tail_c = r"[ +-0]?(\d+|\*)?(\.(\d+|\*))?[hlL]?[cdieEfgGosuxXpn%]"
_fmtdir_tail_c_rx = re.compile(_fmtdir_tail_c)

def _remove_fmtdirs_c (text, subs=""):

    p = 0
    nsegs = []
    while True:
        pp = p
        p = text.find("%", p)
        if p < 0:
            nsegs.append(text[pp:])
            break
        nsegs.append(text[pp:p])
        p += 1
        if text[p:p+1] == "%":
            nsegs.append("%")
            p += 1
            continue
        m = _fmtdir_tail_c_rx.match(text, p)
        if m:
            p = m.span()[1]
            if subs:
                nsegs.append(subs)

    return type(text)("").join(nsegs)


_fmtdir_tail_python_rx = re.compile(r"(\(.*?\))?" + _fmtdir_tail_c)

def _remove_fmtdirs_python (text, subs=""):

    p = 0
    nsegs = []
    while True:
        pp = p
        p = text.find("%", p)
        if p < 0:
            nsegs.append(text[pp:])
            break
        nsegs.append(text[pp:p])
        p += 1
        if text[p:p+1] == "%":
            nsegs.append("%")
            p += 1
            continue
        m = _fmtdir_tail_python_rx.match(text, p)
        if m:
            p = m.span()[1]
            if subs:
                nsegs.append(subs)

    return type(text)("").join(nsegs)


_fmtdir_tail_qt_rx = re.compile(r"L?\d{1,2}")

def _remove_fmtdirs_qt (text, subs=""):

    p = 0
    nsegs = []
    while True:
        pp = p
        p = text.find("%", p)
        if p < 0:
            nsegs.append(text[pp:])
            break
        nsegs.append(text[pp:p])
        p += 1
        m = _fmtdir_tail_qt_rx.match(text, p)
        if m:
            p = m.span()[1]
            if subs:
                nsegs.append(subs)
        else:
            nsegs.append("%")

    return type(text)("").join(nsegs)
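

# Behavior sketch for the C-format matcher (illustrative only): width and
# precision are consumed together with the conversion type, while an escaped
# %% is kept as a literal percent sign.
#
#   >>> _remove_fmtdirs_c("Completed %5.2f%% of %d tasks.", "_")
#   'Completed _% of _ tasks.'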


def remove_literals (text, subs="", substrs=[], regexes=[], heuristic=True):
    """
    Remove literal substrings from the text.

    Literal substrings are URLs, email addresses, web site names,
    command options, etc. This function will heuristically try to
    remove such substrings from the text.

    Additional literals to remove may be specified as verbatim substrings
    (C{substrs} parameter) and regular expressions (C{regexes}).
    These are applied before the internal heuristic matchers.
    Heuristic removal may be entirely disabled by setting C{heuristic}
    to C{False}.

    @param text: text from which to remove literals
    @type text: string
    @param subs: text to replace literals instead of just removing them
    @type subs: string
    @param substrs: additional substrings to remove by direct string match
    @type substrs: sequence of strings
    @param regexes: additional substrings to remove by regex match
    @type regexes: sequence of compiled regular expressions
    @param heuristic: whether to apply heuristic at all
    @type heuristic: bool

    @returns: text without literals
    @rtype: string
    """

    # Apply explicit literals before heuristics.
    for substr in substrs:
        text = text.replace(substr, subs)
    for regex in regexes:
        text = regex.sub(subs, text)

    if heuristic:
        text = _remove_literals_url(text, subs)
        text = _remove_literals_email(text, subs)
        text = _remove_literals_web(text, subs) # after URLs and email
        text = _remove_literals_cmd(text, subs)
        text = _remove_literals_file(text, subs)

    return text
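

# Usage sketch for remove_literals() (illustrative only; the URL and
# address below are made up):
#
#   >>> remove_literals("See http://example.org/doc or write to me@example.com.",
#   ...                 subs="LIT")
#   'See LIT or write to LIT.'
#   >>> remove_literals("Run with --verbose or -v.", subs="OPT")
#   'Run with OPT or OPT.'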


def _remove_by_rx (text, rx, subs=""):

    p = 0
    nsegs = []
    while True:
        m = rx.search(text, p)
        if not m:
            nsegs.append(text[p:])
            break
        p1, p2 = m.span()
        nsegs.append(text[p:p1])
        if subs:
            nsegs.append(subs)
        p = p2

    return type(text)("").join(nsegs)


_literal_url_rx = re.compile(r"\S+://\S*[\w\d&=]", re.U)

def _remove_literals_url (text, subs=""):

    return _remove_by_rx(text, _literal_url_rx, subs)


_literal_web_rx = re.compile(r"\w[\w-]{2,}(\.[\w-]{2,})+", re.U)

def _remove_literals_web (text, subs=""):

    return _remove_by_rx(text, _literal_web_rx, subs)


_literal_email_rx = re.compile(r"\w[\w.-]*@\w+\.[\w.-]*\w")

def _remove_literals_email (text, subs=""):

    return _remove_by_rx(text, _literal_email_rx, subs)


_literal_cmd_rx = re.compile(r"[a-z\d_-]+\(\d\)", re.I)
_literal_cmdopt_rx = re.compile(r"(?<!\S)-[a-z\d]+", re.I)
_literal_cmdoptlong_rx = re.compile(r"(?<!\S)--[a-z\d-]+", re.I)

def _remove_literals_cmd (text, subs=""):

    text = _remove_by_rx(text, _literal_cmd_rx, subs)
    text = _remove_by_rx(text, _literal_cmdopt_rx, subs)
    text = _remove_by_rx(text, _literal_cmdoptlong_rx, subs)
    return text


_literal_filehome_rx = re.compile(r"~(/[\w.-]+)+/?", re.I|re.U)
_literal_fileext_rx = re.compile(r"\*(\.[a-z\d]+){1,2}", re.I)

def _remove_literals_file (text, subs=""):

    text = _remove_by_rx(text, _literal_filehome_rx, subs)
    text = _remove_by_rx(text, _literal_fileext_rx, subs)
    return text


def convert_plurals (mapping, plhead):
    """
    Convert plural forms in the catalog [hook factory].

    @param mapping: The source to destination mapping of form indices.
        This is a list of tuples of source (before modification)
        to destination (after modification) indices.
        There must be no gaps in the destination indices,
        i.e. all indices from 0 up to maximum given destination index
        must exist in the mapping.
    @type mapping: [(int, int)*]

    @param plhead: The plural header value.
    @type plhead: string

    @return: type F5A hook
    @rtype: C{(cat) -> numerr}
    """

    dst_inds = list(map(set, list(zip(*mapping))))[1]
    num_plurals = max(dst_inds) + 1
    if sorted(dst_inds) != list(range(num_plurals)):
        raise PologyError(
            _("@info",
              "Gaps in destination indices for conversion of plural forms "
              "(expected (%(list1)s), got (%(list2)s)).",
              list1=format_item_list(list(range(num_plurals))),
              list2=format_item_list(sorted(dst_inds))))

    ord_src_inds = list(zip(*sorted(mapping, key=lambda x: x[1])))[0]

    def hook (cat):

        cat.header.set_field("Plural-Forms", str(plhead),
                             after="Content-Transfer-Encoding")
        for msg in cat:
            if msg.msgid_plural is not None:
                msg.msgstr[:] = [msg.msgstr[i] for i in ord_src_inds]

        return 0

    return hook
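

# Usage sketch for convert_plurals() (illustrative only): collapse a
# three-form catalog into two forms by keeping source forms 0 and 2.
# The plural header value is an assumed example; the returned hook is
# then applied to a loaded catalog object, e.g. by a sieve.
#
#   hook = convert_plurals([(0, 0), (2, 1)],
#                          "nplurals=2; plural=(n != 1);")
#   # hook(cat) reorders msg.msgstr for plural messages and sets
#   # the Plural-Forms header field.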