File indexing completed on 2025-04-20 08:14:28
0001 # -*- coding: UTF-8 -*- 0002 0003 """ 0004 Convert and validate markup in text. 0005 0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net> 0007 @license: GPLv3 0008 """ 0009 0010 import os 0011 import re 0012 import codecs 0013 import xml.parsers.expat 0014 import difflib 0015 0016 from pology import PologyError, datadir, _, n_ 0017 from pology.comments import manc_parse_flag_list 0018 from pology.diff import adapt_spans 0019 from pology.entities import read_entities 0020 from pology.getfunc import get_result_ireq 0021 from pology.msgreport import report_on_msg 0022 from pology.multi import Multidict 0023 from pology.report import format_item_list 0024 0025 0026 # Pipe flag used to manually prevent check for a particular message. 0027 flag_no_check_markup = "no-check-markup" 0028 0029 0030 _nlgr_rx = re.compile(r"\n{2,}") 0031 _wsgr_rx = re.compile(r"\s+", re.ASCII) 0032 0033 def plain_to_unwrapped (text): 0034 """ 0035 Convert wrapped plain text to unwrapped. 0036 0037 Two or more newlines are considered as paragraph boundaries and left in, 0038 while all other newlines are removed. 0039 Whitespace in the text is simplified throughout. 0040 0041 @param text: text to unwrap 0042 @type text: string 0043 0044 @returns: unwrapped text 0045 @rtype: string 0046 """ 0047 0048 # Strip leading and trailing whitespace. 0049 text = text.strip() 0050 0051 # Strip leading and trailing whitespace in all lines. 0052 text = "\n".join([x.strip() for x in text.split("\n")]) 0053 0054 # Mask all paragraph breaks. 0055 pbmask = "\x04\x04" 0056 text = _nlgr_rx.sub(pbmask, text) 0057 0058 # Replace all whitespace groups with single space. 0059 text = _wsgr_rx.sub(" ", text) 0060 0061 # Unmask paragraph breaks. 0062 text = text.replace(pbmask, "\n\n") 0063 0064 return text 0065 0066 0067 xml_entities = { 0068 "lt": "<", 0069 "gt": ">", 0070 "apos": "'", 0071 "quot": "\"", 0072 "amp": "&", 0073 } 0074 0075 WS_SPACE = "\x04~sp" 0076 WS_TAB = "\x04~tb" 0077 WS_NEWLINE = "\x04~nl" 0078 _ws_masks = { 0079 WS_SPACE: " ", 0080 WS_TAB: "\t", 0081 WS_NEWLINE: "\n", 0082 } 0083 _ws_unmasks = dict([(y, x) for x, y in list(_ws_masks.items())]) 0084 0085 def xml_to_plain (text, tags=None, subs={}, ents={}, keepws=set(), 0086 ignels=set()): 0087 """ 0088 Convert any XML-like markup to plain text. 0089 0090 By default, all tags in the text are replaced with a single space; 0091 entities, unless one of the XML default (C{<}, C{>}, C{&}, 0092 C{"}, C{'}), are left untouched; 0093 all whitespace groups are simplified to single space and leading and 0094 trailing removed. 0095 0096 If only a particular subset of tags should be taken into account, it can 0097 be specified by the C{tags} parameter, as a sequence of tag names 0098 (the sequence is internally converted to set before processing). 0099 0100 If a tag should be replaced with a special sequence of characters 0101 (either opening or closing tag), or the text wrapped by it replaced too, 0102 this can be specified by the C{subs} parameter. It is a dictionary of 0103 3-tuples by tag name, which tells what to replace with the opening tag, 0104 the closing tag, and the wrapped text. For example, to replace 0105 C{<i>foobar</i>} with C{/foobar/}, the dictionary entry would be 0106 C{{"i": ("/", "/", None)}} (where final C{None} states not to touch 0107 the wrapped text); to replace C{<code>...</code>} with C{@@@} 0108 (i.e. remove code segment completely but leave in a marker that there 0109 was something), the entry is C{{"code": ("", "", "@@@")}}. 0110 The replacement for the wrapped text can also be a function, 0111 taking a string and returning a string. 0112 Note that whitespace is automatically simplified, so if whitespace 0113 given by the replacements should be exactly preserved, use C{WS_*} 0114 string constants in place of corresponding whitespace characters. 0115 0116 To have some entities other than the XML default replaced with proper 0117 values, a dictionary of known entities with values may be provided using 0118 the C{ents} parameter. 0119 0120 Whitespace can be preserved within some elements, as given by 0121 their tags in the C{keepws} sequence. 0122 0123 Some elements may be completely removed, as given by the C{ignels} sequence. 0124 Each element of the sequence should either be a tag, or a (tag, type) tuple, 0125 where type is the value of the C{type} argument to element, if any. 0126 0127 It is assumed that the markup is well-formed, and if it is not 0128 the result is undefined; but best attempt at conversion is made. 0129 0130 There are several other functions in this module which deal with well known 0131 markups, such that it is not necessary to use this function with 0132 C{tags}, C{subs}, or C{ents} manually specified. 0133 0134 If you only want to resolve entities from a known set, instead of 0135 calling this function with empty C{tags} and entities given in C{ents}, 0136 consider using the more powerfull L{pology.resolve.resolve_entities}. 0137 0138 @param text: markup text to convert to plain 0139 @type text: string 0140 @param tags: known tags 0141 @type tags: sequence of strings 0142 @param subs: replacement specification 0143 @type subs: dictionary of 3-tuples 0144 @param ents: known entities and their values 0145 @type ents: dictionary 0146 @param keepws: tags of elements in which to preserve whitespace 0147 @type keepws: sequence of strings 0148 @param ignels: tags or tag/types or elements to completely remove 0149 @type ignels: sequence of strings and (string, string) tuples 0150 0151 @returns: plain text version 0152 @rtype: string 0153 """ 0154 0155 # Convert some sequences to sets, for faster membership checks. 0156 if tags is not None and not isinstance(tags, set): 0157 tags = set(tags) 0158 if not isinstance(keepws, set): 0159 keepws = set(keepws) 0160 if not isinstance(ignels, set): 0161 ignels = set(ignels) 0162 0163 # Resolve user-supplied entities before tags, 0164 # as they may contain more markup. 0165 # (Resolve default entities after tags, 0166 # because the default entities can introduce invalid markup.) 0167 text = _resolve_ents(text, ents, xml_entities) 0168 0169 # Build element tree, trying to work around badly formed XML 0170 # (but do note when the closing element is missing). 0171 # Element tree is constructed as list of tuples: 0172 # (tag, opening_tag_literal, closing_tag_literal, atype, content) 0173 # where atype is the value of type attribute (if any), 0174 # and content is a sublist for given element; 0175 # tag may be #text, when the content is string. 0176 eltree = [] 0177 curel = eltree 0178 parent = [] 0179 any_tag = False 0180 p = 0 0181 while True: 0182 pp = p 0183 p = text.find("<", p) 0184 if p < 0: 0185 break 0186 curel.append(("#text", None, None, None, text[pp:p])) 0187 tag_literal, tag, atype, opening, closing, p = _parse_tag(text, p) 0188 if p < 0: 0189 break 0190 if opening: # opening tag 0191 any_tag = True 0192 curel.append([tag, tag_literal, None, atype, []]) 0193 parent.append(curel) 0194 curel = curel[-1][-1] 0195 if closing: # closing tag (can be both opening and closing) 0196 if parent: 0197 curel = parent.pop() 0198 if not opening: 0199 # Record closing tag literal if not opening as well. 0200 curel[-1][2] = tag_literal 0201 else: # faulty markup, move top element 0202 eltree = [[tag, None, tag_literal, None, curel]] 0203 curel = eltree 0204 curel.append(("#text", None, None, None, text[pp:])) 0205 0206 # Replace tags. 0207 text = _resolve_tags(eltree, tags, subs, keepws, ignels) 0208 0209 # Resolve default entities. 0210 text = _resolve_ents(text, xml_entities) 0211 0212 return text 0213 0214 0215 def _parse_tag (text, p): 0216 # text[p] must be "<" 0217 0218 tag = "" 0219 atype = None 0220 opening = True 0221 closing = False 0222 0223 tlen = len(text) 0224 pp = p 0225 in_str = False 0226 in_tag = False 0227 in_attr = False 0228 in_lead = True 0229 in_afterslash = False 0230 in_aftereq = False 0231 in_aftertag = False 0232 in_afterattr = False 0233 ntag = "" 0234 nattr = "" 0235 while True: 0236 p += 1 0237 if p >= tlen: 0238 break 0239 0240 if in_lead and not text[p].isspace(): 0241 in_lead = False 0242 opening = text[p] != "/" 0243 if opening: 0244 in_tag = True 0245 p_tag = p 0246 else: 0247 in_afterslash = True 0248 elif in_afterslash and not text[p].isspace(): 0249 in_afterslash = False 0250 in_tag = True 0251 p_tag = p 0252 elif in_tag and (text[p].isspace() or text[p] in "/>"): 0253 in_tag = False 0254 in_aftertag = True 0255 tag = text[p_tag:p] 0256 ntag = tag.lower() 0257 elif in_aftertag and not (text[p].isspace() or text[p] in "/>"): 0258 in_aftertag = False 0259 in_attr = True 0260 p_attr = p 0261 elif in_attr and (text[p].isspace() or text[p] in "=/>"): 0262 in_attr = False 0263 if text[p] != "=": 0264 in_afterattr = True 0265 else: 0266 in_aftereq = True 0267 attr = text[p_attr:p] 0268 nattr = attr.lower() 0269 elif in_aftereq and text[p] in ('"', "'"): 0270 in_aftereq = False 0271 in_str = True 0272 quote_char = text[p] 0273 p_str = p + 1 0274 elif in_str and text[p] == quote_char: 0275 in_str = False 0276 s = text[p_str:p].strip().replace(" ", "") 0277 if nattr == "type": 0278 atype = s 0279 elif in_afterattr and text[p] == "=": 0280 in_afterattr = False 0281 in_aftereq = True 0282 0283 if not in_str and text[p] == "/": 0284 closing = True 0285 if not in_str and text[p] == ">": 0286 break 0287 0288 p += 1 0289 tag_literal = text[pp:p] 0290 0291 return tag_literal, tag, atype, opening, closing, p 0292 0293 0294 _entity_rx = re.compile(r"&([\w:][\w\d.:-]*);", re.U) 0295 0296 def _resolve_ents (text, ents={}, ignents={}): 0297 """ 0298 Resolve XML entities as described in L{xml_to_plain}, ignoring some. 0299 """ 0300 0301 # There may be entities within entities, so replace entities in each 0302 # entity value too before substituting in the main text. 0303 ntext = [] 0304 p = 0 0305 while True: 0306 pp = p 0307 p = text.find("&", p) 0308 if p < 0: 0309 break 0310 ntext.append(text[pp:p]) 0311 m = _entity_rx.match(text, p) 0312 if m: 0313 name = m.group(1) 0314 if name not in ignents: 0315 value = ents.get(name) 0316 if value is not None: 0317 # FIXME: Endless recursion if the entity repeats itself. 0318 value = _resolve_ents(value, ents, ignents) 0319 ntext.append(value) 0320 else: 0321 # Put entity back as-is. 0322 ntext.append(m.group(0)) 0323 else: # ignored entity, do not touch 0324 ntext.append(text[p:m.span()[1]]) 0325 p = m.span()[1] 0326 else: 0327 ntext.append(text[p]) # the ampersand 0328 p += 1 0329 ntext.append(text[pp:]) 0330 text = "".join(ntext) 0331 0332 return text 0333 0334 0335 # Ordinary around masked whitespace. 0336 _wsgr_premask_rx = re.compile(r"\s+(\x04~\w\w)") 0337 _wsgr_postmask_rx = re.compile(r"(\x04~\w\w)\s+") 0338 0339 def _resolve_tags (elseq, tags=None, subs={}, keepws=set(), ignels=set()): 0340 """ 0341 Replace XML tags as described in L{xml_to_plain}, given the parsed tree. 0342 Split into top and recursive part. 0343 """ 0344 0345 # Text with masked whitespace where significant. 0346 text = _resolve_tags_r(elseq, tags, subs, keepws, ignels) 0347 0348 # Simplify whitespace. 0349 text = _wsgr_rx.sub(" ", text) 0350 text = _wsgr_premask_rx.sub(r"\1", text) 0351 text = _wsgr_postmask_rx.sub(r"\1", text) 0352 text = text.strip() 0353 0354 # Unmask significant whitespace. 0355 text = _unmask_ws(text) 0356 0357 # Remove excess newlines even if supposedly significant. 0358 text = text.strip("\n") 0359 text = _nlgr_rx.sub("\n\n", text) 0360 0361 return text 0362 0363 0364 def _resolve_tags_r (elseq, tags=None, subs={}, keepws=set(), ignels=set()): 0365 0366 segs = [] 0367 for el in elseq: 0368 if el[0] in ignels or (el[0], el[3]) in ignels: 0369 # Complete element is ignored (by tag, or tag/type). 0370 continue 0371 0372 if el[0] == "#text": 0373 segs.append(el[-1]) 0374 elif tags is None or el[0] in tags: 0375 repl_pre, repl_post, repl_cont = subs.get(el[0], [" ", " ", None]) 0376 if repl_pre is None: 0377 repl_pre = "" 0378 if repl_post is None: 0379 repl_post = "" 0380 repl_cont_orig = repl_cont 0381 if not isinstance(repl_cont, str): 0382 repl_cont = _resolve_tags_r(el[-1], tags, subs, keepws, ignels) 0383 if el[0] in keepws: 0384 # Mask whitespace in wrapped text. 0385 repl_cont = _mask_ws(repl_cont) 0386 if callable(repl_cont_orig): 0387 repl_cont = repl_cont_orig(repl_cont) 0388 # If space not significant, 0389 # find first non-whitespace characters in wrapped text 0390 # and shift them before surrounding replacements. 0391 if el[0] not in keepws: 0392 lcont = len(repl_cont) 0393 p1 = 0 0394 while p1 < lcont and repl_cont[p1].isspace(): 0395 p1 += 1 0396 p2 = lcont - 1 0397 while p2 > 0 and repl_cont[p2].isspace(): 0398 p2 -= 1 0399 repl_pre = repl_cont[:p1] + repl_pre 0400 repl_post = repl_post + repl_cont[p2+1:] 0401 repl_cont = repl_cont[p1:p2+1] 0402 segs.append(repl_pre + repl_cont + repl_post) 0403 else: 0404 # Ignored tag, put back verbatim. 0405 repl_pre = el[1] 0406 if repl_pre is None: 0407 repl_pre = "" 0408 repl_post = el[2] 0409 if repl_post is None: 0410 repl_post = "" 0411 repl_cont = _resolve_tags_r(el[-1], tags, subs, keepws, ignels) 0412 segs.append(repl_pre + repl_cont + repl_post) 0413 0414 return "".join(segs) 0415 0416 0417 def _mask_ws (text): 0418 0419 for mask, ws in list(_ws_masks.items()): 0420 text = text.replace(ws, mask) 0421 return text 0422 0423 0424 def _unmask_ws (text): 0425 0426 for mask, ws in list(_ws_masks.items()): 0427 text = text.replace(mask, ws) 0428 return text 0429 0430 _html_tags = set(""" 0431 a address applet area b base basefont big blockquote body br button 0432 caption center cite code col colgroup dd del dfn dir div dl dt 0433 em fieldset font form frame frameset h1 h2 h3 h4 h5 h6 head hr html 0434 i iframe img input ins isindex kbd label legend li link map menu meta 0435 noframes noscript ol option p param pre 0436 s samp script select small span strike strong style sub sup 0437 table tbody td textarea tfoot th thead title tr tt u ul var xmp 0438 """.split()) 0439 _html_subs = { 0440 "_nows" : ("", "", None), 0441 "_parabr": (WS_NEWLINE*2, WS_NEWLINE*2, None), 0442 } 0443 _html_subs.update([(x, _html_subs["_nows"]) for x in _html_tags]) 0444 _html_subs.update([(x, _html_subs["_parabr"]) for x in 0445 "br dd dl dt h1 h2 h3 h4 h5 h6 hr li p pre td th tr" 0446 "".split()]) 0447 _html_ents = { # in addition to default XML entities 0448 "nbsp": "\xa0", 0449 } 0450 _html_keepws = set(""" 0451 code pre xmp 0452 """.split()) 0453 _html_ignels = set([ 0454 ("style", "text/css"), 0455 ]) 0456 0457 def html_to_plain (text): 0458 """ 0459 Convert HTML markup to plain text. 0460 0461 @param text: HTML text to convert to plain 0462 @type text: string 0463 0464 @returns: plain text version 0465 @rtype: string 0466 """ 0467 0468 return xml_to_plain(text, _html_tags, _html_subs, _html_ents, 0469 _html_keepws, _html_ignels) 0470 0471 0472 def html_plain (*args, **kwargs): 0473 """ 0474 Deprecated name for L{html_to_plain}. 0475 """ 0476 return html_to_plain(*args, **kwargs) 0477 0478 0479 _qtrich_tags = set(""" 0480 qt html 0481 a b big blockquote body br center cite code dd dl dt em font 0482 h1 h2 h3 h4 h5 h6 head hr i img li meta nobr ol p pre 0483 s span strong style sub sup table td th tr tt u ul var 0484 """.split()) 0485 _qtrich_subs = { 0486 "_nows" : ("", "", None), 0487 "_parabr": (WS_NEWLINE*2, WS_NEWLINE*2, None), 0488 } 0489 _qtrich_subs.update([(x, _qtrich_subs["_nows"]) for x in _qtrich_tags]) 0490 _qtrich_subs.update([(x, _qtrich_subs["_parabr"]) for x in 0491 "br dd dl dt h1 h2 h3 h4 h5 h6 hr li p pre td th tr" 0492 "".split()]) 0493 _qtrich_ents = { # in addition to default XML entities 0494 "nbsp": "\xa0", 0495 } 0496 _qtrich_keepws = set(""" 0497 code pre 0498 """.split()) 0499 _qtrich_ignels = set([ 0500 ("style", "text/css"), 0501 ]) 0502 0503 def qtrich_to_plain (text): 0504 """ 0505 Convert Qt rich-text markup to plain text. 0506 0507 @param text: Qt rich text to convert to plain 0508 @type text: string 0509 0510 @returns: plain text version 0511 @rtype: string 0512 """ 0513 0514 return xml_to_plain(text, _qtrich_tags, _qtrich_subs, _qtrich_ents, 0515 _qtrich_keepws, _qtrich_ignels) 0516 0517 0518 _kuit_tags = set(""" 0519 kuit kuil title subtitle para list item note warning 0520 filename link application command resource icode bcode shortcut interface 0521 emphasis placeholder email envar message numid nl 0522 """.split()) 0523 _kuit_subs = { 0524 "_nows" : ("", "", None), 0525 "_parabr" : ("", WS_NEWLINE*2, None), 0526 "_ws" : (" ", " ", None), 0527 "_ui" : ("[", "]", None), 0528 } 0529 _kuit_subs.update([(x, _kuit_subs["_nows"]) for x in _kuit_tags]) 0530 _kuit_subs.update([(x, _kuit_subs["_ws"]) for x in 0531 "placeholder".split()]) 0532 _kuit_subs.update([(x, _kuit_subs["_parabr"]) for x in 0533 "title subtitle para item nl" 0534 "".split()]) 0535 _kuit_subs.update([(x, _kuit_subs["_ui"]) for x in 0536 "interface".split()]) 0537 _kuit_ents = { # in addition to default XML entities 0538 } 0539 _kuit_keepws = set(""" 0540 icode bcode 0541 """.split()) 0542 _kuit_ignels = set([ 0543 ]) 0544 0545 def kuit_to_plain (text): 0546 """ 0547 Convert KUIT markup to plain text. 0548 0549 @param text: KUIT text to convert to plain 0550 @type text: string 0551 0552 @returns: plain text version 0553 @rtype: string 0554 """ 0555 0556 return xml_to_plain(text, _kuit_tags, _kuit_subs, _kuit_ents, 0557 _kuit_keepws, _kuit_ignels) 0558 0559 0560 _htkt_tags = set(list(_qtrich_tags) + list(_kuit_tags)) 0561 _htkt_subs = dict(list(_qtrich_subs.items()) + list(_kuit_subs.items())) 0562 _htkt_ents = dict(list(_qtrich_ents.items()) + list(_kuit_ents.items())) 0563 _htkt_keepws = set(list(_qtrich_keepws) + list(_kuit_keepws)) 0564 _htkt_ignels = set(list(_qtrich_ignels) + list(_kuit_ignels)) 0565 0566 def kde4_to_plain (text): 0567 """ 0568 Convert KDE4 GUI markup to plain text. 0569 0570 KDE4 GUI texts may contain both Qt rich-text and KUIT markup, 0571 even mixed in the same text. 0572 Note that the conversion cannot be achieved, in general, by first 0573 converting Qt rich-text, and then KUIT, or vice versa. 0574 For example, if the text has C{<} entity, after first conversion 0575 it will become plain C{<}, and interfere with second conversion. 0576 0577 @param text: KDE4 text to convert to plain 0578 @type text: string 0579 0580 @returns: plain text version 0581 @rtype: string 0582 """ 0583 0584 return xml_to_plain(text, _htkt_tags, _htkt_subs, _htkt_ents, 0585 _htkt_keepws, _htkt_ignels) 0586 0587 0588 # Assembled on first use. 0589 _dbk_tags = None 0590 _dbk_subs = None 0591 _dbk_ents = None 0592 _dbk_keepws = None 0593 _dbk_ignels = None 0594 0595 def _prep_docbook4_to_plain (): 0596 0597 global _dbk_tags, _dbk_subs, _dbk_ents, _dbk_keepws, _dbk_ignels 0598 0599 specpath = os.path.join(datadir(), "spec", "docbook4.l1") 0600 docbook4_l1 = collect_xml_spec_l1(specpath) 0601 _dbk_tags = set(docbook4_l1.keys()) 0602 0603 _dbk_subs = { 0604 "_nows" : ("", "", None), 0605 "_parabr" : ("", WS_NEWLINE*2, None), 0606 "_ws" : (" ", " ", None), 0607 "_ui" : ("[", "]", None), 0608 "_uipath" : ("", "", lambda s: re.sub(r"\]\s*\[", "->", s, re.U)), 0609 } 0610 _dbk_subs.update([(x, _dbk_subs["_nows"]) for x in _dbk_tags]) 0611 _dbk_subs.update([(x, _dbk_subs["_parabr"]) for x in 0612 "para title".split()]) # FIXME: Add more. 0613 _dbk_subs.update([(x, _dbk_subs["_ws"]) for x in 0614 "contrib address firstname placeholder surname " 0615 "primary secondary " 0616 "".split()]) 0617 _dbk_subs.update([(x, _dbk_subs["_ui"]) for x in 0618 "guilabel guibutton guiicon guimenu guisubmenu " 0619 "guimenuitem " 0620 "".split()]) 0621 _dbk_subs.update([(x, _dbk_subs["_uipath"]) for x in 0622 "menuchoice " 0623 "".split()]) 0624 0625 _dbk_ents = { # in addition to default XML entities 0626 } 0627 0628 _dbk_keepws = set(""" 0629 screen programlisting 0630 """.split()) # FIXME: Add more. 0631 0632 _dbk_ignels = set([ 0633 ]) 0634 0635 def docbook4_to_plain (text): 0636 """ 0637 Convert Docbook 4.x markup to plain text. 0638 0639 @param text: Docbook text to convert to plain 0640 @type text: string 0641 0642 @returns: plain text version 0643 @rtype: string 0644 """ 0645 0646 if _dbk_tags is None: 0647 _prep_docbook4_to_plain() 0648 0649 return xml_to_plain(text, _dbk_tags, _dbk_subs, _dbk_ents, 0650 _dbk_keepws, _dbk_ignels) 0651 0652 0653 def collect_xml_spec_l1 (specpath): 0654 """ 0655 Collect lightweight XML format specification, level 1. 0656 0657 Level 1 specification is the dictionary of all known tags, 0658 with allowed attributes and subtags for each. 0659 0660 File of the level 1 specification is in the following format:: 0661 0662 # A comment. 0663 # Tag with unconstrained attributes and subtags: 0664 tagA; 0665 # Tag with constrained attributes and unconstrained subtags: 0666 tagF : attr1 attr2 ...; 0667 # Tag with unconstrained attributes and constrained subtags: 0668 tagF > stag1 stag2 ...; 0669 # Tag with constrained attributes and subtags: 0670 tagF : attr1 attr2 ... > stag1 stag2 ...; 0671 # Tag with no attributes and unconstrained subtags: 0672 tagA :; 0673 # Tag with unconstrained attributes and no subtags: 0674 tagA >; 0675 # Tag with no attributes and no subtags: 0676 tagA :>; 0677 # Attribute value constrained by a regular expression: 0678 .... attr1=/^(val1|val2|val3)$/i ... 0679 # Reserved dummy tag specifying attributes common to all tags: 0680 pe-common-attrib : attrX attrY; 0681 0682 The specification can contain a dummy tag named C{pe-common-attrib}, 0683 stating attributes which are common to all tags, instead of having to 0684 list them with each and every tag. 0685 To make an attribute mandatory, it's name should be prefixed by 0686 exclamation sign (!). 0687 0688 Specification file must be UTF-8 encoded. 0689 0690 @param specpath: path to level 1 specification file 0691 @type specpath: string 0692 0693 @return: level 1 specification 0694 @rtype: dict 0695 """ 0696 0697 ch_comm = "#" 0698 ch_attr = ":" 0699 ch_attre = "=" 0700 ch_mattr = "!" 0701 ch_stag = ">" 0702 ch_end = ";" 0703 0704 dtag_attr = "pe-common-attrib" 0705 0706 valid_tag_rx = re.compile(r"^[\w-]+$") 0707 valid_attr_rx = re.compile(r"^[\w-]+$") 0708 0709 c_tag, c_attr, c_attre, c_stag = list(range(4)) 0710 0711 ifs = codecs.open(specpath, "r", "UTF-8").read() 0712 lenifs = len(ifs) 0713 0714 pos = [0, 1, 1] 0715 0716 def signal (msg, bpos): 0717 0718 emsg = _("@info \"L1-spec\" is shorthand for " 0719 "\"level 1 specification\"", 0720 "[L1-spec] %(file)s:%(line)d:%(col)d: %(msg)s", 0721 file=specpath, line=bpos[0], col=bpos[1], msg=msg) 0722 raise PologyError(emsg) 0723 0724 def advance (stoptest, cmnt=True): 0725 0726 ind = pos[0] 0727 oind = ind 0728 substr = [] 0729 sep = None 0730 while ind < lenifs and sep is None: 0731 if cmnt and ifs[ind] == ch_comm: 0732 ind = ifs.find("\n", ind) 0733 if ind < 0: 0734 break 0735 else: 0736 sep = stoptest(ind) 0737 if sep is None: 0738 substr.append(ifs[ind]) 0739 ind += 1 0740 else: 0741 ind += len(sep) 0742 0743 pos[0] = ind 0744 rawsubstr = ifs[oind:ind] 0745 p = rawsubstr.rfind("\n") 0746 if p >= 0: 0747 pos[1] += rawsubstr.count("\n") 0748 pos[2] = len(rawsubstr) - p 0749 else: 0750 pos[2] += len(rawsubstr) 0751 0752 return "".join(substr), sep 0753 0754 def make_rx_lint (rx_str, rx_flags, wch, lincol): 0755 try: 0756 rx = re.compile(rx_str, rx_flags) 0757 except: 0758 signal(_("@info the regex is already quoted when inserted", 0759 "Cannot compile regular expression %(regex)s.", 0760 regex=(wch + rx_str + wch)), 0761 lincol) 0762 return lambda x: rx.search(x) is not None 0763 0764 spec = {} 0765 ctx = c_tag 0766 entry = None 0767 while pos[0] < lenifs: 0768 if ctx == c_tag: 0769 t = lambda i: ( ifs[i] in (ch_attr, ch_stag, ch_end) 0770 and ifs[i] or None) 0771 tag, sep = advance(t) 0772 tag = tag.strip() 0773 if tag: 0774 if sep is None: 0775 signal(_("@info", 0776 "Entry not terminated after the initial tag."), 0777 lincol) 0778 if not valid_tag_rx.search(tag) and tag != dtag_attr: 0779 signal(_("@info", 0780 "Invalid tag name '%(tag)s'.", tag=tag), 0781 lincol) 0782 entry = _L1Element(tag) 0783 spec[tag] = entry 0784 0785 if sep == ch_attr: 0786 ctx = c_attr 0787 elif sep == ch_stag: 0788 ctx = c_stag 0789 elif sep == ch_end: 0790 ctx = c_tag 0791 else: 0792 break 0793 0794 elif ctx == c_attr: 0795 if entry.attrs is None: 0796 entry.attrs = set() 0797 0798 lincol = tuple(pos[1:]) 0799 t = lambda i: ( ( ifs[i].isspace() 0800 or ifs[i] in (ch_attre, ch_stag, ch_end)) 0801 and ifs[i] or [None])[0] 0802 attr, sep = advance(t) 0803 attr = attr.strip() 0804 if attr: 0805 if attr.startswith(ch_mattr): 0806 attr = attr[len(ch_mattr):] 0807 entry.mattrs.add(attr) 0808 if attr in entry.attrs: 0809 signal(_("@info", 0810 "Duplicate attribute '%(attr)s'.", attr=attr), 0811 lincol) 0812 if not valid_attr_rx.search(attr): 0813 signal(_("@info", 0814 "Invalid attribute name '%(attr)s'.", attr=attr), 0815 lincol) 0816 entry.attrs.add(attr) 0817 lastattr = attr 0818 0819 if sep.isspace(): 0820 ctx = c_attr 0821 elif sep == ch_attre: 0822 ctx = c_attre 0823 elif sep == ch_stag: 0824 ctx = c_stag 0825 elif sep == ch_end: 0826 ctx = c_tag 0827 else: 0828 signal(_("@info", 0829 "Entry not terminated after the attribute list."), 0830 lincol) 0831 0832 elif ctx == c_attre: 0833 lincol = tuple(pos[1:]) 0834 t = lambda i: not ifs[i].isspace() and ifs[i] or None 0835 sub, wch = advance(t) 0836 if wch is None: 0837 signal(_("@info", 0838 "End of input inside the value constraint."), 0839 lincol) 0840 t = lambda i: ifs[i] == wch and ifs[i] or None 0841 rx_str, sep = advance(t, cmnt=False) 0842 if sep is None: 0843 signal(_("@info", 0844 "End of input inside the value constraint."), 0845 lincol) 0846 t = lambda i: (not ifs[i].isalpha() and [""] or [None])[0] 0847 rx_flag_spec, sep = advance(t) 0848 rx_flags = re.U 0849 seen_flags = set() 0850 lincol = tuple(pos[1:]) 0851 for c in rx_flag_spec: 0852 if c in seen_flags: 0853 signal(_("@info", 0854 "Regex flag '%(flag)s' is already issued.", 0855 flag=c), lincol) 0856 if c == "i": 0857 rx_flags |= re.I 0858 else: 0859 signal(_("@info", 0860 "Unknown regex flag '%(flag)s'.", flag=c), 0861 lincol) 0862 seen_flags.add(c) 0863 entry.avlints[lastattr] = make_rx_lint(rx_str, rx_flags, 0864 wch, lincol) 0865 ctx = c_attr 0866 0867 elif ctx == c_stag: 0868 if entry.stags is None: 0869 entry.stags = set() 0870 0871 lincol = tuple(pos[1:]) 0872 t = lambda i: ( (ifs[i].isspace() or ifs[i] == ch_end) 0873 and ifs[i] or [None])[0] 0874 stag, sep = advance(t) 0875 stag = stag.strip() 0876 if stag: 0877 if stag in entry.stags: 0878 signal(_("@info", 0879 "Repeated subtag '%(tag)s'.", tag=stag), 0880 lincol) 0881 entry.stags.add(stag) 0882 0883 if sep == ch_end: 0884 ctx = c_tag 0885 else: 0886 signal(_("@info", 0887 "Entry not terminated after the subtag list."), 0888 lincol) 0889 0890 # Add common attributes to each tag. 0891 dentry_attr = spec.pop(dtag_attr, []) 0892 if dentry_attr: 0893 for attr in dentry_attr.attrs: 0894 attre = dentry_attr.avlints.get(attr) 0895 for entry in list(spec.values()): 0896 if entry.attrs is None: 0897 entry.attrs = set() 0898 if attr not in entry.attrs: 0899 entry.attrs.add(attr) 0900 if attre: 0901 entry.avlints[attr] = attre 0902 0903 return spec 0904 0905 0906 class _L1Element: 0907 0908 def __init__ (self, tag=None, attrs=None, mattrs=None, avlints=None, 0909 stags=None): 0910 0911 # The tag of this element (string). 0912 self.tag = tag 0913 # Possible attributes (set, or None meaning any). 0914 self.attrs = attrs 0915 # Mandatory attributes (set). 0916 self.mattrs = mattrs or set() 0917 # Validator functions for attribute values, per attribute (dict). 0918 # Validator does not have to be defined for each attribute. 0919 self.avlints = avlints or {} 0920 # Possible subelements by tag (set, or None meaning any). 0921 self.stags = stags 0922 0923 0924 # Simplified matching of XML entity name (sans ampersand and semicolon). 0925 _simple_ent_rx = re.compile(r"^([\w.:-]+|#[0-9]+)$", re.U); 0926 0927 # Get line/column segment in error report. 0928 _lin_col_rx = re.compile(r":\s*line\s*\d+,\s*column\s*\d+", re.I) 0929 0930 # Dummy top tag for topless texts. 0931 _dummy_top = "_" 0932 0933 0934 # Global data for XML checking. 0935 class _Global: pass 0936 _g_xml_l1 = _Global() 0937 0938 def validate_xml_l1 (text, spec=None, xmlfmt=None, ents=None, 0939 casesens=True, accelamp=False): 0940 """ 0941 Validate XML markup in text against L{level1<collect_xml_spec_l1>} 0942 specification. 0943 0944 Text is not required to have a top tag; if it does not, a dummy one will 0945 be assigned to assure that the check passes. 0946 0947 If C{spec} is C{None}, text is only checked to be well-formed. 0948 0949 If C{ents} are C{None}, entities in the text are ignored by the check; 0950 otherwise, an entity not belonging to the known set is considered erroneous. 0951 Default XML entities (C{<}, C{>}, C{&}, C{"}, C{'}) 0952 are automatically added to the set of known entities. 0953 0954 Tag and attribute names can be made case-insensitive by setting 0955 C{casesens} to C{False}. 0956 0957 If text is a part of user interface, and the environment may use 0958 the literal ampersand as accelerator marker, it can be allowed to pass 0959 the check by setting C{accelamp} to C{True}. 0960 0961 Text can be one or more entity definitions of the form C{<!ENTITY ...>}, 0962 when special check is applied. 0963 0964 The result of the check is list of erroneous spans in the text, 0965 each given by start and end index (in Python standard semantics), 0966 and the error description, packed in a tuple. 0967 If there are no errors, empty list is returned. 0968 Reported spans need not be formally complete with respect to the error 0969 location, but are heuristically determined to be short and 0970 provide good visual indication of what triggers the error. 0971 0972 @param text: text to check 0973 @type text: string 0974 @param spec: markup definition 0975 @type spec: L{level1<collect_xml_spec_l1>} specification 0976 @param xmlfmt: name of the particular XML format (for error messages) 0977 @type xmlfmt: string 0978 @param ents: set of known entities 0979 @type ents: sequence 0980 @param casesens: whether tag names are case-insensitive 0981 @type casesens: bool 0982 @param accelamp: whether to allow ampersand as accelerator marker 0983 @type accelamp: bool 0984 0985 @returns: erroneous spans in the text 0986 @rtype: list of (int, int, string) tuples 0987 """ 0988 0989 if text.lstrip().startswith("<!ENTITY"): 0990 return _validate_xml_entdef(text, xmlfmt) 0991 0992 # If ampersand accelerator marked allowed, replace one in non-entity 0993 # position with &, to let the parser proceed. 0994 text_orig = text 0995 if accelamp: 0996 text = _escape_amp_accel(text) 0997 0998 # Make sure the text has a top tag. 0999 text = "<%s>%s</%s>" % (_dummy_top, text, _dummy_top) 1000 1001 # Prepare parser. 1002 xenc = "UTF-8" 1003 parser = xml.parsers.expat.ParserCreate(xenc) 1004 parser.UseForeignDTD() # not to barf on non-default XML entities 1005 parser.StartElementHandler = _handler_start_element 1006 parser.DefaultHandler = _handler_default 1007 1008 # Link state for handlers. 1009 g = _g_xml_l1 1010 g.text = text 1011 g.spec = spec 1012 g.xmlfmt = xmlfmt or "XML" 1013 g.ents = ents 1014 g.casesens = casesens 1015 g.xenc = xenc 1016 g.parser = parser 1017 g.errcnt = 0 1018 g.spans = [] 1019 g.tagstack = [] 1020 1021 # Parse and check. 1022 try: 1023 parser.Parse(text.encode(xenc), True) 1024 except xml.parsers.expat.ExpatError as e: 1025 errmsg = _("@info a problem in the given type of markup " 1026 "(e.g. HTML, Docbook)", 1027 "%(mtype)s markup: %(snippet)s.", 1028 mtype=g.xmlfmt, snippet=e.args[0]) 1029 span = _make_span(text, e.lineno, e.offset, errmsg) 1030 g.spans.append(span) 1031 1032 # Adapt spans back to original text. 1033 pure_spans = [x[:2] for x in g.spans] 1034 pure_spans = adapt_spans(text_orig, text, pure_spans, merge=False) 1035 # Remove unhelpful line/column in error messages. 1036 errmsgs = [] 1037 for errmsg, span in zip([x[2] for x in g.spans], pure_spans): 1038 m = _lin_col_rx.search(errmsg) 1039 if m: 1040 errmsg = errmsg[:m.start()] + errmsg[m.end():] 1041 errmsgs.append(errmsg) 1042 # Put spans back together. 1043 g.spans = [x + (y,) for x, y in zip(pure_spans, errmsgs)] 1044 1045 return g.spans 1046 1047 1048 _ts_fence = "|/|" 1049 1050 def _escape_amp_accel (text): 1051 1052 p_ts = text.find(_ts_fence) 1053 in_script = False 1054 1055 p1 = 0 1056 found_accel = False 1057 while True: 1058 1059 # Bracket possible entity reference. 1060 p1 = text.find("&", p1) 1061 if p1 < 0: 1062 break 1063 if not in_script and p_ts >= 0 and p1 > p_ts: 1064 in_script = True 1065 found_accel = False 1066 p2 = text.find(";", p1) 1067 1068 # An accelerator marker if no semicolon in rest of the text 1069 # or the bracketed segment does not look like an entity, 1070 # and it is in front of an alphanumeric or itself. 1071 nc = text[p1 + 1:p1 + 2] 1072 if ( (p2 < 0 or not _simple_ent_rx.match(text[p1 + 1:p2])) 1073 and (nc.isalnum() or nc == "&") 1074 ): 1075 # Check if the next one is an ampersand too, 1076 # i.e. if it's a self-escaped accelerator marker. 1077 namp = 1 1078 if ( text[p1 + 1:p1 + 2] == "&" 1079 and not _simple_ent_rx.match(text[p1 + 2:p2]) 1080 ): 1081 namp += 1 1082 1083 # Escape the marker if first or self-escaped, 1084 # or currently in scripted part (in which there can be 1085 # any number of non-escaped markers). 1086 if not found_accel or namp > 1 or in_script: 1087 escseg = "&" * namp 1088 text = text[:p1] + escseg + text[p1 + namp:] 1089 p1 += len(escseg) 1090 if namp == 1: 1091 found_accel = True 1092 else: 1093 p1 += namp 1094 1095 elif p2 > p1: 1096 p1 = p2 1097 else: 1098 break 1099 1100 return text 1101 1102 1103 def _handler_start_element (tag, attrs): 1104 1105 g = _g_xml_l1 1106 1107 if g.spec is None: 1108 return 1109 1110 # Normalize names to lower case if allowed. 1111 if not g.casesens: 1112 tag = tag.lower() 1113 attrs = dict([(x.lower(), y) for x, y in list(attrs.items())]) 1114 1115 # Check existence of the tag. 1116 if tag not in g.spec and tag != _dummy_top: 1117 errmsg = _("@info", 1118 "%(mtype)s markup: unrecognized tag '%(tag)s'.", 1119 mtype=g.xmlfmt, tag=tag) 1120 span = _make_span(g.text, g.parser.CurrentLineNumber, 1121 g.parser.CurrentColumnNumber + 1, errmsg) 1122 g.spans.append(span) 1123 return 1124 1125 if tag == _dummy_top: 1126 return 1127 1128 elspec = g.spec[tag] 1129 errmsgs = [] 1130 1131 # Check applicability of attributes and validity of their values. 1132 if elspec.attrs is not None: 1133 for attr, aval in list(attrs.items()): 1134 if attr not in elspec.attrs: 1135 errmsgs.append(_("@info", 1136 "%(mtype)s markup: invalid attribute " 1137 "'%(attr)s' to tag '%(tag)s'.", 1138 mtype=g.xmlfmt, attr=attr, tag=tag)) 1139 else: 1140 avlint = elspec.avlints.get(attr) 1141 if avlint and not avlint(aval): 1142 errmsgs.append(_("@info", 1143 "%(mtype)s markup: invalid value " 1144 "'%(val)s' to attribute '%(attr)s'.", 1145 mtype=g.xmlfmt, val=aval, attr=attr)) 1146 1147 # Check presence of mandatory attributes. 1148 if elspec.mattrs is not None: 1149 for attr in elspec.mattrs: 1150 if attr not in attrs: 1151 errmsgs.append(_("@info", 1152 "%(mtype)s markup: missing mandatory attribute " 1153 "'%(attr)s' to tag '%(tag)s'.", 1154 mtype=g.xmlfmt, attr=attr, tag=tag)) 1155 1156 # Check proper parentage. 1157 if g.tagstack: 1158 ptag = g.tagstack[-1] 1159 pelspec = g.spec.get(ptag) 1160 if ( pelspec is not None and pelspec.stags is not None 1161 and tag not in pelspec.stags 1162 ): 1163 errmsgs.append(_("@info", 1164 "%(mtype)s markup: tag '%(tag1)s' cannot be " 1165 "a subtag of '%(tag2)s'.", 1166 mtype=g.xmlfmt, tag1=tag, tag2=ptag)) 1167 1168 # Record element stack. 1169 g.tagstack.append(tag) 1170 1171 for errmsg in errmsgs: 1172 span = _make_span(g.text, g.parser.CurrentLineNumber, 1173 g.parser.CurrentColumnNumber + 1, errmsg) 1174 g.spans.append(span) 1175 1176 1177 def _handler_default (text): 1178 1179 g = _g_xml_l1 1180 1181 if g.ents is not None and text.startswith('&') and text.endswith(';'): 1182 ent = text[1:-1] 1183 errmsg = None 1184 if ent.startswith("#"): 1185 if nument_to_char(ent) is None: 1186 errmsg = _("@info", 1187 "%(mtype)s markup: invalid numeric " 1188 "entity '%(ent)s'.", 1189 mtype=g.xmlfmt, ent=ent) 1190 elif ent not in g.ents and ent not in xml_entities: 1191 nearents = [] #difflib.get_close_matches(ent, g.ents) 1192 if nearents: 1193 if len(nearents) > 5: # do not overwhelm message 1194 fmtents = format_item_list(nearents[:5], incmp=True) 1195 else: 1196 fmtents = format_item_list(nearents) 1197 errmsg = _("@info", 1198 "%(mtype)s markup: unknown entity '%(ent)s' " 1199 "(suggestions: %(entlist)s).", 1200 mtype=g.xmlfmt, ent=ent, entlist=fmtents) 1201 else: 1202 errmsg = _("@info", 1203 "%(mtype)s markup: unknown entity '%(ent)s'.", 1204 mtype=g.xmlfmt, ent=ent) 1205 1206 if errmsg is not None: 1207 span = _make_span(g.text, g.parser.CurrentLineNumber, 1208 g.parser.CurrentColumnNumber + 1, errmsg) 1209 g.spans.append(span) 1210 1211 1212 # Text to fetch from the reported error position in XML stream. 1213 _near_xml_error_rx = re.compile(r"\W*[\w:.-]*[^\w\s>]*(\s*>)?", re.U) 1214 1215 def _make_span (text, lno, col, errmsg): 1216 1217 # Find problematic position. 1218 clno = 1 1219 p = 0 1220 while clno < lno: 1221 p = text.find("\n", p) 1222 if p < 0: 1223 break 1224 p += 1 1225 clno += 1 1226 if p < 0: 1227 return (0, len(text)) 1228 1229 # Scoop some reasonable nearby text. 1230 m = _near_xml_error_rx.match(text, p + col - 1) 1231 if not m: 1232 return (0, len(text), errmsg) 1233 start, end = m.span() 1234 while text[start].isalnum(): 1235 if start == 0: 1236 break 1237 start -= 1 1238 1239 return (start, end, errmsg) 1240 1241 1242 _entname_rx = re.compile(r"^([\w:][\w\d.:-]*)$", re.U) 1243 1244 def _validate_xml_entdef (text, xmlfmt): 1245 1246 state = "void" 1247 pos = 0 1248 tlen = len(text) 1249 errmsg = None 1250 dhead = "!ENTITY" 1251 def next_nws (pos): 1252 while pos < tlen and text[pos].isspace(): 1253 pos += 1 1254 return pos 1255 def next_ws (pos, ows=()): 1256 while pos < tlen and not text[pos].isspace() and text[pos] not in ows: 1257 pos += 1 1258 return pos 1259 errend = lambda: (_("@info", 1260 "%(mtype)s markup: premature end of entity definition.", 1261 mtype=xmlfmt), 1262 tlen) 1263 while True: 1264 if state == "void": 1265 pos = next_nws(pos) 1266 if pos == tlen: 1267 break 1268 elif text[pos] != "<": 1269 errmsg = _("@info", 1270 "%(mtype)s markup: expected opening angle bracket " 1271 "in entity definition.", 1272 mtype=xmlfmt) 1273 pos1 = pos + 1 1274 else: 1275 pos += 1 1276 state = "head" 1277 1278 elif state == "head": 1279 pos = next_nws(pos) 1280 if pos == tlen: 1281 errmsg, pos1 = errend() 1282 else: 1283 pos1 = next_ws(pos) 1284 head = text[pos:pos1] 1285 if head != dhead: 1286 errmsg = _("@info", 1287 "%(mtype)s markup: expected '%(keyword)s' " 1288 "in entity definition.", 1289 mtype=xmlfmt, keyword=dhead) 1290 else: 1291 pos = pos1 1292 state = "name" 1293 1294 elif state == "name": 1295 pos = next_nws(pos) 1296 pos1 = next_ws(pos, ("'", "\"")) 1297 name = text[pos:pos1] 1298 if not _entname_rx.match(name): 1299 errmsg = _("@info", 1300 "%(mtype)s markup: invalid entity name '%(name)s' " 1301 "in entity definition.", 1302 mtype=xmlfmt, name=name) 1303 else: 1304 pos = pos1 1305 state = "value" 1306 1307 elif state == "value": 1308 pos = next_nws(pos) 1309 if pos == tlen: 1310 errmsg, pos1 = errend() 1311 elif text[pos] not in ("'", "\""): 1312 errmsg = _("@info", 1313 "%(mtype)s markup: expected opening quote " 1314 "(ASCII single or double) in entity definition.", 1315 mtype=xmlfmt) 1316 pos1 = pos + 1 1317 else: 1318 quote = text[pos] 1319 pos1 = text.find(quote, pos + 1) 1320 if pos1 < 0: 1321 errmsg = _("@info", 1322 "%(mtype)s markup: unclosed entity value " 1323 "in entity definition.", 1324 mtype=xmlfmt) 1325 pos1 = tlen 1326 else: 1327 value = text[pos + 1:pos1] 1328 # FIXME: Validate value? Does not have to be valid 1329 # on its own, in principle. 1330 pos = pos1 + 1 1331 state = "tail" 1332 1333 elif state == "tail": 1334 pos = next_nws(pos) 1335 if pos == tlen: 1336 errmsg, pos1 = errend() 1337 elif text[pos] != ">": 1338 errmsg = _("@info", 1339 "%(mtype)s markup: expected closing angle bracket " 1340 "in entity definition.", 1341 mtype=xmlfmt) 1342 pos1 = pos + 1 1343 else: 1344 pos += 1 1345 state = "void" 1346 1347 if errmsg: 1348 break 1349 1350 spans = [] 1351 if errmsg: 1352 if pos1 is None: 1353 pos1 = pos 1354 spans = [(pos, pos1, errmsg)] 1355 1356 return spans 1357 1358 1359 def check_xml (strict=False, entities={}, mkeyw=None): 1360 """ 1361 Check general XML markup in translation [hook factory]. 1362 1363 Text is only checked to be well-formed XML, and possibly also whether 1364 encountered entities are defined. Markup errors are reported to stdout. 1365 1366 C{msgstr} can be either checked only if the C{msgid} is valid itself, 1367 or regardless of the validity of the original. This is governed by the 1368 C{strict} parameter. 1369 1370 Entities in addition to XML's default (C{<}, etc.) 1371 may be provided using the C{entities} parameter. 1372 Several types of values with different semantic are possible: 1373 - if C{entities} is C{None}, unknown entities are ignored on checking 1374 - if string, it is understood as a general function evaluation 1375 L{request<getfunc.get_result_ireq>}, 1376 and its result expected to be (name, value) dictionary-like object 1377 - otherwise, C{entities} is considered to be a (name, value) dictionary 1378 1379 If a message has L{sieve flag<pology.sieve.parse_sieve_flags>} 1380 C{no-check-markup}, the check is skipped for that message. 1381 If one or several markup keywords are given as C{mkeyw} parameter, 1382 check is skipped for all messages in a catalog which does not report 1383 one of the given keywords by its L{markup()<catalog.Catalog.markup>} 1384 method. See L{set_markup()<catalog.Catalog.set_markup>} for list of 1385 markup keywords recognized at the moment. 1386 1387 @param strict: whether to require valid C{msgstr} even if C{msgid} is not 1388 @type strict: bool 1389 @param entities: additional entities to consider as known 1390 @type entities: C{None}, dict, or string 1391 @param mkeyw: markup keywords for taking catalogs into account 1392 @type mkeyw: string or list of strings 1393 1394 @return: type S3C hook 1395 @rtype: C{(msgstr, msg, cat) -> numerr} 1396 """ 1397 1398 return _check_xml_w(validate_xml_l1, strict, entities, mkeyw, False) 1399 1400 1401 def check_xml_sp (strict=False, entities={}, mkeyw=None): 1402 """ 1403 Like L{check_xml}, except that erroneous spans are returned 1404 instead of reporting problems to stdout [hook factory]. 1405 1406 @return: type V3C hook 1407 @rtype: C{(msgstr, msg, cat) -> spans} 1408 """ 1409 1410 return _check_xml_w(validate_xml_l1, strict, entities, mkeyw, True) 1411 1412 1413 # Worker for C{check_xml*} hook factories. 1414 def _check_xml_w (check, strict, entities, mkeyw, spanrep, 1415 ignctxt=(), ignid=(), ignctxtsw=(), ignidsw=()): 1416 1417 if mkeyw is not None: 1418 if isinstance(mkeyw, str): 1419 mkeyw = [mkeyw] 1420 mkeyw = set(mkeyw) 1421 1422 # Lazy-evaluated data. 1423 ldata = {} 1424 def eval_ldata (): 1425 ldata["entities"] = _get_entities(entities) 1426 1427 def checkf (msgstr, msg, cat): 1428 1429 if ( mkeyw is not None 1430 and not mkeyw.intersection(cat.markup() or set()) 1431 ): 1432 return [] if spanrep else 0 1433 1434 if ( msg.msgctxt in ignctxt 1435 or msg.msgid in ignid 1436 or (msg.msgctxt is not None and msg.msgctxt.startswith(ignctxtsw)) 1437 or msg.msgid.startswith(ignidsw) 1438 ): 1439 return [] if spanrep else 0 1440 1441 if not ldata: 1442 eval_ldata() 1443 entities = ldata["entities"] 1444 1445 if ( flag_no_check_markup in manc_parse_flag_list(msg, "|") 1446 or ( not strict 1447 and ( check(msg.msgid, ents=entities) 1448 or check(msg.msgid_plural or "", ents=entities))) 1449 ): 1450 return [] if spanrep else 0 1451 spans = check(msgstr, ents=entities) 1452 if spanrep: 1453 return spans 1454 else: 1455 for span in spans: 1456 if span[2:]: 1457 report_on_msg(span[2], msg, cat) 1458 return len(spans) 1459 1460 return checkf 1461 1462 1463 # Cache for loaded entities, by entity specification string, 1464 # to speed up when several markup hooks are using the same setup. 1465 _loaded_entities_cache = {} 1466 1467 def _get_entities (entspec): 1468 1469 if not isinstance(entspec, str): 1470 return entspec 1471 1472 entities = _loaded_entities_cache.get(entspec) 1473 if entities is not None: 1474 return entities 1475 1476 entities = get_result_ireq(entspec) 1477 1478 _loaded_entities_cache[entspec] = entities 1479 return entities 1480 1481 1482 _docbook4_l1 = None 1483 1484 def validate_docbook4_l1 (text, ents=None): 1485 """ 1486 Validate Docbook 4.x markup in text against L{level1<collect_xml_spec_l1>} 1487 specification. 1488 1489 Markup definition is extended to include C{<placeholder-N/>} elements, 1490 which C{xml2po} uses to segment text when extracting markup documents 1491 into PO templates. 1492 1493 See L{validate_xml_l1} for description of the C{ents} parameter 1494 and the return value. 1495 1496 @param text: text to check 1497 @type text: string 1498 @param ents: set of known entities (in addition to default) 1499 @type ents: sequence 1500 1501 @returns: erroneous spans in the text 1502 @rtype: list of (int, int, string) tuples 1503 """ 1504 1505 global _docbook4_l1 1506 if _docbook4_l1 is None: 1507 specpath = os.path.join(datadir(), "spec", "docbook4.l1") 1508 _docbook4_l1 = collect_xml_spec_l1(specpath) 1509 1510 xmlfmt = _("@item markup type", "Docbook4") 1511 return validate_xml_l1(text, spec=_docbook4_l1, xmlfmt=xmlfmt, ents=ents) 1512 1513 1514 _db4_meta_msgctxt = set(( 1515 )) 1516 _db4_meta_msgid = set(( 1517 "translator-credits", 1518 )) 1519 _db4_meta_msgid_sw = ( 1520 "@@image:", 1521 ) 1522 1523 def check_docbook4 (strict=False, entities={}, mkeyw=None): 1524 """ 1525 Check XML markup in translations of Docbook 4.x catalogs [hook factory]. 1526 1527 See L{check_xml} for description of parameters. 1528 1529 @return: type S3C hook 1530 @rtype: C{(msgstr, msg, cat) -> numerr} 1531 """ 1532 1533 return _check_xml_w(validate_docbook4_l1, strict, entities, mkeyw, False, 1534 ignid=_db4_meta_msgid, ignctxt=_db4_meta_msgctxt, 1535 ignidsw=_db4_meta_msgid_sw) 1536 1537 1538 def check_docbook4_sp (strict=False, entities={}, mkeyw=None): 1539 """ 1540 Like L{check_docbook4}, except that erroneous spans are returned 1541 instead of reporting problems to stdout [hook factory]. 1542 1543 @return: type V3C hook 1544 @rtype: C{(msgstr, msg, cat) -> spans} 1545 """ 1546 1547 return _check_xml_w(validate_docbook4_l1, strict, entities, mkeyw, True, 1548 ignid=_db4_meta_msgid, ignctxt=_db4_meta_msgctxt, 1549 ignidsw=_db4_meta_msgid_sw) 1550 1551 1552 def check_docbook4_msg (strict=False, entities={}, mkeyw=None): 1553 """ 1554 Check for any known problem in translation in messages 1555 in Docbook 4.x catalogs [hook factory]. 1556 1557 Currently performed checks: 1558 - Docbook markup 1559 - cross-message insertion placeholders 1560 1561 See L{check_xml} for description of parameters. 1562 1563 @return: type V4A hook 1564 @rtype: C{(msg, cat) -> parts} 1565 """ 1566 1567 check_markup = check_docbook4_sp(strict, entities, mkeyw) 1568 1569 def checkf (msg, cat): 1570 1571 hl = [] 1572 for i in range(len(msg.msgstr)): 1573 spans = [] 1574 spans.extend(check_markup(msg.msgstr[i], msg, cat)) 1575 spans.extend(check_placeholder_els(msg.msgid, msg.msgstr[i])) 1576 if spans: 1577 hl.append(("msgstr", i, spans)) 1578 return hl 1579 1580 return checkf 1581 1582 1583 _entpath_html = os.path.join(datadir(), "spec", "html.entities") 1584 html_entities = read_entities(_entpath_html) 1585 1586 _html_l1 = None 1587 1588 def validate_html_l1 (text, ents=None): 1589 """ 1590 Validate HTML markup in text against L{level1<collect_xml_spec_l1>} 1591 specification. 1592 1593 At the moment, this function can only check HTML markup if well-formed 1594 in the XML sense, although HTML allows omission of some closing tags. 1595 1596 See L{validate_xml_l1} for description of the C{ents} parameter 1597 and the return value. 1598 1599 @param text: text to check 1600 @type text: string 1601 @param ents: set of known entities (in addition to default) 1602 @type ents: sequence 1603 1604 @returns: erroneous spans in the text 1605 @rtype: list of (int, int, string) tuples 1606 """ 1607 1608 global _html_l1 1609 if _html_l1 is None: 1610 specpath = os.path.join(datadir(), "spec", "html.l1") 1611 _html_l1 = collect_xml_spec_l1(specpath) 1612 1613 if ents is not None: 1614 ents = Multidict([ents, html_entities]) 1615 1616 xmlfmt = _("@item markup type", "HTML") 1617 return validate_xml_l1(text, spec=_html_l1, xmlfmt=xmlfmt, ents=ents, 1618 accelamp=True, casesens=False) 1619 1620 1621 def check_html (strict=False, entities={}, mkeyw=None): 1622 """ 1623 Check HTML markup in translations [hook factory]. 1624 1625 See L{check_xml} for description of parameters. 1626 See notes on checking HTML markup to L{validate_html_l1}. 1627 1628 @return: type S3C hook 1629 @rtype: C{(msgstr, msg, cat) -> numerr} 1630 """ 1631 1632 return _check_xml_w(validate_html_l1, strict, entities, mkeyw, False) 1633 1634 1635 def check_html_sp (strict=False, entities={}, mkeyw=None): 1636 """ 1637 Like L{check_html}, except that erroneous spans are returned 1638 instead of reporting problems to stdout [hook factory]. 1639 1640 @return: type V3C hook 1641 @rtype: C{(msgstr, msg, cat) -> spans} 1642 """ 1643 1644 return _check_xml_w(validate_html_l1, strict, entities, mkeyw, True) 1645 1646 1647 _qtrich_l1 = None 1648 1649 def validate_qtrich_l1 (text, ents=None): 1650 """ 1651 Validate Qt rich-text markup in text against L{level1<collect_xml_spec_l1>} 1652 specification. 1653 1654 At the moment, this function can only check Qt rich-text if well-formed 1655 in the XML sense, although Qt rich-text allows HTML-type omission of 1656 closing tags. 1657 1658 See L{validate_xml_l1} for description of the C{ents} parameter 1659 and the return value. 1660 1661 @param text: text to check 1662 @type text: string 1663 @param ents: set of known entities (in addition to default) 1664 @type ents: sequence 1665 1666 @returns: erroneous spans in the text 1667 @rtype: list of (int, int, string) tuples 1668 """ 1669 1670 global _qtrich_l1 1671 if _qtrich_l1 is None: 1672 specpath = os.path.join(datadir(), "spec", "qtrich.l1") 1673 _qtrich_l1 = collect_xml_spec_l1(specpath) 1674 1675 if ents is not None: 1676 ents = Multidict([ents, html_entities]) 1677 1678 xmlfmt = _("@item markup type", "Qt-rich") 1679 return validate_xml_l1(text, spec=_qtrich_l1, xmlfmt=xmlfmt, ents=ents, 1680 accelamp=True, casesens=False) 1681 1682 1683 def check_qtrich (strict=False, entities={}, mkeyw=None): 1684 """ 1685 Check Qt rich-text markup in translations [hook factory]. 1686 1687 See L{check_xml} for description of parameters. 1688 See notes on checking Qt rich-text to L{validate_qtrich_l1}. 1689 1690 @return: type S3C hook 1691 @rtype: C{(msgstr, msg, cat) -> numerr} 1692 """ 1693 1694 return _check_xml_w(validate_qtrich_l1, strict, entities, mkeyw, False) 1695 1696 1697 def check_qtrich_sp (strict=False, entities={}, mkeyw=None): 1698 """ 1699 Like L{check_qtrich}, except that erroneous spans are returned 1700 instead of reporting problems to stdout [hook factory]. 1701 1702 @return: type V3C hook 1703 @rtype: C{(msgstr, msg, cat) -> spans} 1704 """ 1705 1706 return _check_xml_w(validate_qtrich_l1, strict, entities, mkeyw, True) 1707 1708 1709 _entpath_kuit = os.path.join(datadir(), "spec", "kuit.entities") 1710 kuit_entities = read_entities(_entpath_kuit) 1711 1712 _kuit_l1 = None 1713 1714 def validate_kuit_l1 (text, ents=None): 1715 """ 1716 Validate KUIT markup in text against L{level1<collect_xml_spec_l1>} 1717 specification. 1718 1719 KUIT is the semantic markup for user interface in KDE4. 1720 1721 See L{validate_xml_l1} for description of the C{ents} parameter 1722 and the return value. 1723 1724 @param text: text to check 1725 @type text: string 1726 @param ents: set of known entities (in addition to default) 1727 @type ents: sequence 1728 1729 @returns: erroneous spans in the text 1730 @rtype: list of (int, int, string) tuples 1731 """ 1732 1733 global _kuit_l1 1734 if _kuit_l1 is None: 1735 specpath = os.path.join(datadir(), "spec", "kuit.l1") 1736 _kuit_l1 = collect_xml_spec_l1(specpath) 1737 1738 if ents is not None: 1739 ents = Multidict([ents, kuit_entities]) 1740 1741 xmlfmt = _("@item markup type", "KUIT") 1742 return validate_xml_l1(text, spec=_kuit_l1, xmlfmt=xmlfmt, ents=ents, 1743 accelamp=True) 1744 1745 1746 _kde4_l1 = None 1747 _kde4_ents = None 1748 1749 def validate_kde4_l1 (text, ents=None): 1750 """ 1751 Validate markup in texts used in KDE4 GUI. 1752 1753 KDE4 GUI texts may contain both Qt rich-text and KUIT markup, 1754 even mixed in the same text. 1755 1756 See L{validate_xml_l1} for description of the C{ents} parameter 1757 and the return value. 1758 1759 @param text: text to check 1760 @type text: string 1761 @param ents: set of known entities (in addition to default) 1762 @type ents: sequence 1763 1764 @returns: erroneous spans in the text 1765 @rtype: list of (int, int, string) tuples 1766 """ 1767 1768 global _kde4_l1, _kde4_ents 1769 if _kde4_l1 is None: 1770 _kde4_l1 = {} 1771 spath1 = os.path.join(datadir(), "spec", "qtrich.l1") 1772 _kde4_l1.update(collect_xml_spec_l1(spath1)) 1773 spath2 = os.path.join(datadir(), "spec", "kuit.l1") 1774 _kde4_l1.update(collect_xml_spec_l1(spath2)) 1775 _kde4_ents = {} 1776 _kde4_ents.update(html_entities) 1777 _kde4_ents.update(kuit_entities) 1778 1779 if ents is not None: 1780 ents = Multidict([ents, _kde4_ents]) 1781 1782 xmlfmt = _("@item markup type", "KDE4") 1783 return validate_xml_l1(text, spec=_kde4_l1, xmlfmt=xmlfmt, ents=ents, 1784 accelamp=True, casesens=False) 1785 1786 1787 def check_kde4 (strict=False, entities={}, mkeyw=None): 1788 """ 1789 Check XML markup in translations of KDE4 UI catalogs [hook factory]. 1790 1791 See L{check_xml} for description of parameters. 1792 1793 @return: type S3C hook 1794 @rtype: C{(msgstr, msg, cat) -> numerr} 1795 """ 1796 1797 return _check_xml_w(validate_kde4_l1, strict, entities, mkeyw, False) 1798 1799 1800 def check_kde4_sp (strict=False, entities={}, mkeyw=None): 1801 """ 1802 Like L{check_kde4}, except that erroneous spans are returned 1803 instead of reporting problems to stdout [hook factory]. 1804 1805 @return: type V3C hook 1806 @rtype: C{(msgstr, msg, cat) -> spans} 1807 """ 1808 1809 return _check_xml_w(validate_kde4_l1, strict, entities, mkeyw, True) 1810 1811 1812 _pango_l1 = None 1813 1814 def validate_pango_l1 (text, ents=None): 1815 """ 1816 Validate Pango markup in text against L{level1<collect_xml_spec_l1>} 1817 specification. 1818 1819 See L{validate_xml_l1} for description of the C{ents} parameter 1820 and the return value. 1821 1822 @param text: text to check 1823 @type text: string 1824 @param ents: set of known entities (in addition to default) 1825 @type ents: sequence 1826 1827 @returns: erroneous spans in the text 1828 @rtype: list of (int, int, string) tuples 1829 """ 1830 1831 global _pango_l1 1832 if _pango_l1 is None: 1833 specpath = os.path.join(datadir(), "spec", "pango.l1") 1834 _pango_l1 = collect_xml_spec_l1(specpath) 1835 1836 if ents is not None: 1837 ents = Multidict([ents, html_entities]) 1838 1839 xmlfmt = _("@item markup type", "Pango") 1840 return validate_xml_l1(text, spec=_pango_l1, xmlfmt=xmlfmt, ents=ents, 1841 accelamp=True, casesens=False) 1842 1843 1844 def check_pango (strict=False, entities={}, mkeyw=None): 1845 """ 1846 Check XML markup in translations of Pango UI catalogs [hook factory]. 1847 1848 See L{check_xml} for description of parameters. 1849 1850 @return: type S3C hook 1851 @rtype: C{(msgstr, msg, cat) -> numerr} 1852 """ 1853 1854 return _check_xml_w(validate_pango_l1, strict, entities, mkeyw, False) 1855 1856 1857 def check_pango_sp (strict=False, entities={}, mkeyw=None): 1858 """ 1859 Like L{check_pango}, except that erroneous spans are returned 1860 instead of reporting problems to stdout [hook factory]. 1861 1862 @return: type V3C hook 1863 @rtype: C{(msgstr, msg, cat) -> spans} 1864 """ 1865 1866 return _check_xml_w(validate_pango_l1, strict, entities, mkeyw, True) 1867 1868 1869 1870 1871 _digits_dec = set("0123456789") 1872 _digits_hex = set("0123456789abcdefABCDEF") 1873 1874 def nument_to_char (nument): 1875 """ 1876 Convert numeric XML entity to character. 1877 1878 Numeric XML entities can be decimal, C{&#DDDD;}, or hexadecimal, 1879 C{&#xHHHH;}, where C{D} and C{H} stand for number system's digits. 1880 4 digits is the maximum, but there can be less. 1881 1882 If the entity cannot be converted to a character, for whatever reason, 1883 C{None} is reported. 1884 1885 @param nument: numeric entity, with or without C{&} and C{;} 1886 @type nument: string 1887 1888 @return: character represented by the entity 1889 @rtype: string or None 1890 """ 1891 1892 if nument[:1] == "&": 1893 nument = nument[1:-1] 1894 1895 if nument[:1] != "#": 1896 return None 1897 1898 if nument[1:2] == "x": 1899 known_digits = _digits_hex 1900 numstr = nument[2:] 1901 base = 16 1902 else: 1903 known_digits = _digits_dec 1904 numstr = nument[1:] 1905 base = 10 1906 1907 if len(numstr) > 4 or len(numstr) < 1: 1908 return None 1909 1910 unknown_digits = set(numstr).difference(known_digits) 1911 if unknown_digits: 1912 return None 1913 1914 return chr(int(numstr, base)) 1915 1916 1917 def validate_xmlents (text, ents={}, default=False, numeric=False): 1918 """ 1919 Check whether XML-like entities in the text are among known. 1920 1921 The text does not have to be XML markup as such. 1922 No XML parsing is performed, only the raw search for XML-like entities. 1923 1924 @param text: text with entities to check 1925 @type text: string 1926 @param ents: known entities 1927 @type ents: sequence 1928 @param default: whether default XML entities are allowed (C{&}, etc.) 1929 @type default: bool 1930 @param numeric: whether numeric character entities are allowed 1931 @type numeric: bool 1932 1933 @returns: erroneous spans in the text 1934 @rtype: list of (int, int, string) tuples 1935 """ 1936 1937 spans = [] 1938 1939 p = 0 1940 while True: 1941 p = text.find("&", p) 1942 if p < 0: 1943 break 1944 pp = p 1945 m = _entity_rx.match(text, p) 1946 if m: 1947 p = m.end() 1948 ent = m.group(1) 1949 errmsg = None 1950 if numeric and ent.startswith("#"): 1951 if nument_to_char(ent) is None: 1952 errmsg = _("@info", 1953 "Invalid numeric entity '%(ent)s'.", 1954 ent=ent) 1955 elif ent not in ents and (not default or ent not in xml_entities): 1956 nearents = [] #difflib.get_close_matches(ent, ents) 1957 if nearents: 1958 if len(nearents) > 5: # do not overwhelm message 1959 fmtents = format_item_list(nearents[:5], incmp=True) 1960 else: 1961 fmtents = format_item_list(nearents) 1962 errmsg = _("@info", 1963 "Unknown entity '%(ent)s' " 1964 "(suggestions: %(entlist)s).", 1965 ent=ent, entlist=fmtents) 1966 else: 1967 errmsg = _("@info", 1968 "Unknown entity '%(ent)s'.", 1969 ent=ent) 1970 1971 if errmsg is not None: 1972 spans.append((pp, p, errmsg)) 1973 else: 1974 p += 1 1975 1976 return spans 1977 1978 1979 def check_xmlents (strict=False, entities={}, mkeyw=None, 1980 default=False, numeric=False): 1981 """ 1982 Check existence of XML entities in translations [hook factory]. 1983 1984 See L{check_xml} for description of parameters C{strict}, C{entities}, 1985 and C{mkeyw}. See L{validate_xmlents} for parameters C{default} and 1986 C{numeric}, and for general notes on checking entities. 1987 1988 @return: type S3C hook 1989 @rtype: C{(msgstr, msg, cat) -> numerr} 1990 """ 1991 1992 def check (text, ents): 1993 return validate_xmlents(text, ents, default=default, numeric=numeric) 1994 1995 return _check_xml_w(check, strict, entities, mkeyw, False) 1996 1997 1998 def check_xmlents_sp (strict=False, entities={}, mkeyw=None, 1999 default=False, numeric=False): 2000 """ 2001 Like L{check_xmlents}, except that erroneous spans are returned 2002 instead of reporting problems to stdout [hook factory]. 2003 2004 @return: type V3C hook 2005 @rtype: C{(msgstr, msg, cat) -> spans} 2006 """ 2007 2008 def check (text, ents): 2009 return validate_xmlents(text, ents, default=default, numeric=numeric) 2010 2011 return _check_xml_w(check, strict, entities, mkeyw, True) 2012 2013 2014 _placeholder_el_rx = re.compile(r"<\s*placeholder-(\d+)\s*/\s*>") 2015 2016 def check_placeholder_els (orig, trans): 2017 """ 2018 Check if sets of C{<placeholder-N/>} elements are matching between 2019 original and translated text. 2020 2021 C{<placeholder-N/>} elements are added into text by C{xml2po}, 2022 for finer segmentation of markup documents extracted into PO templates. 2023 2024 See L{validate_xml_l1} for description of the return value. 2025 2026 @param orig: original text 2027 @type orig: string 2028 @param trans: translated text 2029 @type trans: string 2030 2031 @returns: erroneous spans in translation 2032 @rtype: list of (int, int, string) tuples 2033 """ 2034 2035 spans = [] 2036 2037 orig_plnums = set() 2038 for m in _placeholder_el_rx.finditer(orig): 2039 orig_plnums.add(m.group(1)) 2040 trans_plnums = set() 2041 for m in _placeholder_el_rx.finditer(trans): 2042 trans_plnums.add(m.group(1)) 2043 2044 missing_plnums = list(orig_plnums.difference(trans_plnums)) 2045 extra_plnums = list(trans_plnums.difference(orig_plnums)) 2046 if missing_plnums: 2047 tags = "".join(["<placeholder-%s/>" % x for x in missing_plnums]) 2048 errmsg = _("@info", 2049 "Missing placeholder tags in translation: %(taglist)s.", 2050 taglist=format_item_list(tags)) 2051 spans.append((0, 0, errmsg)) 2052 elif extra_plnums: # do not report both, single glitch may cause them 2053 tags = "".join(["<placeholder-%s/>" % x for x in extra_plnums]) 2054 errmsg = _("@info", 2055 "Superfluous placeholder tags in translation: %(taglist)s.", 2056 taglist=format_item_list(tags)) 2057 spans.append((0, 0, errmsg)) 2058 2059 return spans 2060