pology/pology/markup.py

0001 # -*- coding: UTF-8 -*-
0002
0003 """
0004 Convert and validate markup in text.
0005
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009
0010 import os
0011 import re
0012 import codecs
0013 import xml.parsers.expat
0014 import difflib
0015
0016 from pology import PologyError, datadir, _, n_
0017 from pology.comments import manc_parse_flag_list
0018 from pology.diff import adapt_spans
0019 from pology.entities import read_entities
0020 from pology.getfunc import get_result_ireq
0021 from pology.msgreport import report_on_msg
0022 from pology.multi import Multidict
0023 from pology.report import format_item_list
0024
0025
# Pipe flag used to manually prevent the markup check
# for a particular message.
flag_no_check_markup = "no-check-markup"


# Groups of two or more newlines, and groups of ASCII whitespace.
_nlgr_rx = re.compile(r"\n{2,}")
_wsgr_rx = re.compile(r"\s+", re.ASCII)

def plain_to_unwrapped (text):
    """
    Unwrap plain text which has been wrapped into lines.

    Runs of two or more newlines are taken as paragraph boundaries
    and kept (as exactly two newlines), while any other newline is removed.
    Every remaining whitespace group is reduced to a single space,
    and whitespace at the edges of the text and of each line is dropped.

    @param text: text to unwrap
    @type text: string

    @returns: unwrapped text
    @rtype: string
    """

    # Stand-in for paragraph breaks, so that they survive
    # the whitespace simplification below.
    para_mark = "\x04\x04"

    # Drop whitespace around the whole text, then around every line.
    lines = [line.strip() for line in text.strip().split("\n")]
    unwrapped = "\n".join(lines)

    # Protect paragraph breaks, collapse whitespace, restore the breaks.
    unwrapped = _nlgr_rx.sub(para_mark, unwrapped)
    unwrapped = _wsgr_rx.sub(" ", unwrapped)
    return unwrapped.replace(para_mark, "\n\n")
0065
0066
# Default XML entities, by name, with the characters they stand for.
xml_entities = dict(
    lt="<",
    gt=">",
    apos="'",
    quot="\"",
    amp="&",
)

# Placeholders for whitespace characters which must survive
# whitespace simplification.
WS_SPACE = "\x04~sp"
WS_TAB = "\x04~tb"
WS_NEWLINE = "\x04~nl"
# Whitespace character by placeholder, and placeholder by whitespace character.
_ws_masks = {
    WS_SPACE: " ",
    WS_TAB: "\t",
    WS_NEWLINE: "\n",
}
_ws_unmasks = {ws: mask for mask, ws in _ws_masks.items()}
0084
def xml_to_plain (text, tags=None, subs=None, ents=None, keepws=None,
                  ignels=None):
    """
    Convert any XML-like markup to plain text.

    By default, all tags in the text are replaced with a single space;
    entities, unless one of the XML default (C{&lt;}, C{&gt;}, C{&amp;},
    C{&quot;}, C{&apos;}), are left untouched;
    all whitespace groups are simplified to single space and leading and
    trailing removed.

    If only a particular subset of tags should be taken into account, it can
    be specified by the C{tags} parameter, as a sequence of tag names
    (the sequence is internally converted to set before processing).

    If a tag should be replaced with a special sequence of characters
    (either opening or closing tag), or the text wrapped by it replaced too,
    this can be specified by the C{subs} parameter. It is a dictionary of
    3-tuples by tag name, which tells what to replace with the opening tag,
    the closing tag, and the wrapped text. For example, to replace
    C{<i>foobar</i>} with C{/foobar/}, the dictionary entry would be
    C{{"i": ("/", "/", None)}} (where final C{None} states not to touch
    the wrapped text); to replace C{<code>...</code>} with C{@@@}
    (i.e. remove code segment completely but leave in a marker that there
    was something), the entry is C{{"code": ("", "", "@@@")}}.
    The replacement for the wrapped text can also be a function,
    taking a string and returning a string.
    Note that whitespace is automatically simplified, so if whitespace
    given by the replacements should be exactly preserved, use C{WS_*}
    string constants in place of corresponding whitespace characters.

    To have some entities other than the XML default replaced with proper
    values, a dictionary of known entities with values may be provided using
    the C{ents} parameter.

    Whitespace can be preserved within some elements, as given by
    their tags in the C{keepws} sequence.

    Some elements may be completely removed, as given by the C{ignels} sequence.
    Each element of the sequence should either be a tag, or a (tag, type) tuple,
    where type is the value of the C{type} argument to element, if any.

    It is assumed that the markup is well-formed, and if it is not
    the result is undefined; but best attempt at conversion is made.

    There are several other functions in this module which deal with well known
    markups, such that it is not necessary to use this function with
    C{tags}, C{subs}, or C{ents} manually specified.

    If you only want to resolve entities from a known set, instead of
    calling this function with empty C{tags} and entities given in C{ents},
    consider using the more powerful L{pology.resolve.resolve_entities}.

    @param text: markup text to convert to plain
    @type text: string
    @param tags: known tags (C{None} means that any tag is known)
    @type tags: sequence of strings
    @param subs: replacement specification (C{None} means no replacements)
    @type subs: dictionary of 3-tuples
    @param ents: known entities and their values (C{None} means no entities)
    @type ents: dictionary
    @param keepws: tags of elements in which to preserve whitespace
    @type keepws: sequence of strings
    @param ignels: tags or tag/types or elements to completely remove
    @type ignels: sequence of strings and (string, string) tuples

    @returns: plain text version
    @rtype: string
    """

    # Omitted optional collections mean "nothing given". Fresh objects
    # are created here, rather than sharing mutable default values
    # between all calls.
    if subs is None:
        subs = {}
    if ents is None:
        ents = {}

    # Convert some sequences to sets, for faster membership checks.
    if tags is not None and not isinstance(tags, set):
        tags = set(tags)
    if keepws is None:
        keepws = set()
    elif not isinstance(keepws, set):
        keepws = set(keepws)
    if ignels is None:
        ignels = set()
    elif not isinstance(ignels, set):
        ignels = set(ignels)

    # Resolve user-supplied entities before tags,
    # as they may contain more markup.
    # (Resolve default entities after tags,
    # because the default entities can introduce invalid markup.)
    text = _resolve_ents(text, ents, xml_entities)

    # Build element tree, trying to work around badly formed XML
    # (but do note when the closing element is missing).
    # Element tree is constructed as list of tuples:
    # (tag, opening_tag_literal, closing_tag_literal, atype, content)
    # where atype is the value of type attribute (if any),
    # and content is a sublist for given element;
    # tag may be #text, when the content is string.
    eltree = []
    curel = eltree # content list of the element currently being filled
    parent = [] # stack of content lists of the enclosing elements
    p = 0
    while True:
        pp = p
        p = text.find("<", p)
        if p < 0:
            break
        curel.append(("#text", None, None, None, text[pp:p]))
        tag_literal, tag, atype, opening, closing, p = _parse_tag(text, p)
        if opening: # opening tag
            curel.append([tag, tag_literal, None, atype, []])
            parent.append(curel)
            curel = curel[-1][-1]
        if closing: # closing tag (can be both opening and closing)
            if parent:
                curel = parent.pop()
                if not opening:
                    # Record closing tag literal if not opening as well.
                    curel[-1][2] = tag_literal
            else: # faulty markup, move top element
                eltree = [[tag, None, tag_literal, None, curel]]
                curel = eltree
    # Whatever follows the last tag (or everything, if there were no tags).
    curel.append(("#text", None, None, None, text[pp:]))

    # Replace tags.
    text = _resolve_tags(eltree, tags, subs, keepws, ignels)

    # Resolve default entities.
    text = _resolve_ents(text, xml_entities)

    return text
0213
0214
0215 def _parse_tag (text, p):
0216     # text[p] must be "<"
0217
0218     tag = ""
0219     atype = None
0220     opening = True
0221     closing = False
0222
0223     tlen = len(text)
0224     pp = p
0225     in_str = False
0226     in_tag = False
0227     in_attr = False
0228     in_lead = True
0229     in_afterslash = False
0230     in_aftereq = False
0231     in_aftertag = False
0232     in_afterattr = False
0233     ntag = ""
0234     nattr = ""
0235     while True:
0236         p += 1
0237         if p >= tlen:
0238             break
0239
0240         if in_lead and not text[p].isspace():
0241             in_lead = False
0242             opening = text[p] != "/"
0243             if opening:
0244                 in_tag = True
0245                 p_tag = p
0246             else:
0247                 in_afterslash = True
0248         elif in_afterslash and not text[p].isspace():
0249             in_afterslash = False
0250             in_tag = True
0251             p_tag = p
0252         elif in_tag and (text[p].isspace() or text[p] in "/>"):
0253             in_tag = False
0254             in_aftertag = True
0255             tag = text[p_tag:p]
0256             ntag = tag.lower()
0257         elif in_aftertag and not (text[p].isspace() or text[p] in "/>"):
0258             in_aftertag = False
0259             in_attr = True
0260             p_attr = p
0261         elif in_attr and (text[p].isspace() or text[p] in "=/>"):
0262             in_attr = False
0263             if text[p] != "=":
0264                 in_afterattr = True
0265             else:
0266                 in_aftereq = True
0267             attr = text[p_attr:p]
0268             nattr = attr.lower()
0269         elif in_aftereq and text[p] in ('"', "'"):
0270             in_aftereq = False
0271             in_str = True
0272             quote_char = text[p]
0273             p_str = p + 1
0274         elif in_str and text[p] == quote_char:
0275             in_str = False
0276             s = text[p_str:p].strip().replace(" ", "")
0277             if nattr == "type":
0278                 atype = s
0279         elif in_afterattr and text[p] == "=":
0280             in_afterattr = False
0281             in_aftereq = True
0282
0283         if not in_str and text[p] == "/":
0284             closing = True
0285         if not in_str and text[p] == ">":
0286             break
0287
0288     p += 1
0289     tag_literal = text[pp:p]
0290
0291     return tag_literal, tag, atype, opening, closing, p
0292
0293
0294 _entity_rx = re.compile(r"&([\w:][\w\d.:-]*);", re.U)
0295
0296 def _resolve_ents (text, ents={}, ignents={}):
0297     """
0298     Resolve XML entities as described in L{xml_to_plain}, ignoring some.
0299     """
0300
0301     # There may be entities within entities, so replace entities in each
0302     # entity value too before substituting in the main text.
0303     ntext = []
0304     p = 0
0305     while True:
0306         pp = p
0307         p = text.find("&", p)
0308         if p < 0:
0309             break
0310         ntext.append(text[pp:p])
0311         m = _entity_rx.match(text, p)
0312         if m:
0313             name = m.group(1)
0314             if name not in ignents:
0315                 value = ents.get(name)
0316                 if value is not None:
0317                     # FIXME: Endless recursion if the entity repeats itself.
0318                     value = _resolve_ents(value, ents, ignents)
0319                     ntext.append(value)
0320                 else:
0321                     # Put entity back as-is.
0322                     ntext.append(m.group(0))
0323             else: # ignored entity, do not touch
0324                 ntext.append(text[p:m.span()[1]])
0325             p = m.span()[1]
0326         else:
0327             ntext.append(text[p]) # the ampersand
0328             p += 1
0329     ntext.append(text[pp:])
0330     text = "".join(ntext)
0331
0332     return text
0333
0334
# Ordinary whitespace before and after masked whitespace.
_wsgr_premask_rx = re.compile(r"\s+(\x04~\w\w)")
_wsgr_postmask_rx = re.compile(r"(\x04~\w\w)\s+")

def _resolve_tags (elseq, tags=None, subs={}, keepws=set(), ignels=set()):
    """
    Replace XML tags as described in L{xml_to_plain}, given the parsed tree.

    This is the top part of the resolution: it collects the text from
    the recursive part and then normalizes the whitespace in it.
    """

    # Text in which significant whitespace is still masked.
    plain = _resolve_tags_r(elseq, tags, subs, keepws, ignels)

    # Collapse ordinary whitespace, and remove it entirely
    # where it touches masked whitespace.
    plain = _wsgr_rx.sub(" ", plain)
    for mask_rx in (_wsgr_premask_rx, _wsgr_postmask_rx):
        plain = mask_rx.sub(r"\1", plain)

    # Bring back the significant whitespace.
    plain = _unmask_ws(plain.strip())

    # Remove excess newlines even if supposedly significant.
    return _nlgr_rx.sub("\n\n", plain.strip("\n"))
0362
0363
0364 def _resolve_tags_r (elseq, tags=None, subs={}, keepws=set(), ignels=set()):
0365
0366     segs = []
0367     for el in elseq:
0368         if el[0] in ignels or (el[0], el[3]) in ignels:
0369             # Complete element is ignored (by tag, or tag/type).
0370             continue
0371
0372         if el[0] == "#text":
0373             segs.append(el[-1])
0374         elif tags is None or el[0] in tags:
0375             repl_pre, repl_post, repl_cont = subs.get(el[0], [" ", " ", None])
0376             if repl_pre is None:
0377                 repl_pre = ""
0378             if repl_post is None:
0379                 repl_post = ""
0380             repl_cont_orig = repl_cont
0381             if not isinstance(repl_cont, str):
0382                 repl_cont = _resolve_tags_r(el[-1], tags, subs, keepws, ignels)
0383                 if el[0] in keepws:
0384                     # Mask whitespace in wrapped text.
0385                     repl_cont = _mask_ws(repl_cont)
0386             if callable(repl_cont_orig):
0387                 repl_cont = repl_cont_orig(repl_cont)
0388             # If space not significant,
0389             # find first non-whitespace characters in wrapped text
0390             # and shift them before surrounding replacements.
0391             if el[0] not in keepws:
0392                 lcont = len(repl_cont)
0393                 p1 = 0
0394                 while p1 < lcont and repl_cont[p1].isspace():
0395                     p1 += 1
0396                 p2 = lcont - 1
0397                 while p2 > 0 and repl_cont[p2].isspace():
0398                     p2 -= 1
0399                 repl_pre = repl_cont[:p1] + repl_pre
0400                 repl_post = repl_post + repl_cont[p2+1:]
0401                 repl_cont = repl_cont[p1:p2+1]
0402             segs.append(repl_pre + repl_cont + repl_post)
0403         else:
0404             # Ignored tag, put back verbatim.
0405             repl_pre = el[1]
0406             if repl_pre is None:
0407                 repl_pre = ""
0408             repl_post = el[2]
0409             if repl_post is None:
0410                 repl_post = ""
0411             repl_cont = _resolve_tags_r(el[-1], tags, subs, keepws, ignels)
0412             segs.append(repl_pre + repl_cont + repl_post)
0413
0414     return "".join(segs)
0415
0416
def _mask_ws (text):
    """
    Replace each space, tab and newline in the text
    with its placeholder from C{_ws_masks}.
    """

    masked = text
    for mask in _ws_masks:
        masked = masked.replace(_ws_masks[mask], mask)
    return masked
0422
0423
def _unmask_ws (text):
    """
    Replace each whitespace placeholder from C{_ws_masks} in the text
    with the whitespace character it stands for.
    """

    unmasked = text
    for mask in _ws_masks:
        unmasked = unmasked.replace(mask, _ws_masks[mask])
    return unmasked
0429
# Tags known to the HTML conversion.
_html_tags = set("""
    a address applet area b base basefont big blockquote body br button
    caption center cite code col colgroup dd del dfn dir div dl dt
    em fieldset font form frame frameset h1 h2 h3 h4 h5 h6 head hr html
    i iframe img input ins isindex kbd label legend li link map menu meta
    noframes noscript ol option p param pre
    s samp script select small span strike strong style sub sup
    table tbody td textarea tfoot th thead title tr tt u ul var xmp
""".split())
# Replacements by tag: tags vanish by default,
# while some are turned into paragraph breaks.
_html_subs = {
    "_nows" : ("", "", None),
    "_parabr": (WS_NEWLINE*2, WS_NEWLINE*2, None),
}
_html_subs.update(dict.fromkeys(_html_tags, _html_subs["_nows"]))
_html_subs.update(dict.fromkeys(
    "br dd dl dt h1 h2 h3 h4 h5 h6 hr li p pre td th tr".split(),
    _html_subs["_parabr"]))
_html_ents = { # in addition to default XML entities
    "nbsp": "\xa0",
}
# Elements in which whitespace is preserved.
_html_keepws = {"code", "pre", "xmp"}
# Elements which are removed completely.
_html_ignels = {
    ("style", "text/css"),
}

def html_to_plain (text):
    """
    Convert HTML markup to plain text.

    The conversion is done by L{xml_to_plain},
    set up with tags, replacements and entities known for HTML.

    @param text: HTML text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    return xml_to_plain(text, tags=_html_tags, subs=_html_subs,
                        ents=_html_ents, keepws=_html_keepws,
                        ignels=_html_ignels)
0470
0471
def html_plain (*args, **kwargs):
    """
    Deprecated name for L{html_to_plain}.

    All arguments are handed over to L{html_to_plain} unchanged,
    and its result is returned.
    """
    plain = html_to_plain(*args, **kwargs)
    return plain
0477
0478
# Tags known to the Qt rich-text conversion.
_qtrich_tags = set("""
    qt html
    a b big blockquote body br center cite code dd dl dt em font
    h1 h2 h3 h4 h5 h6 head hr i img li meta nobr ol p pre
    s span strong style sub sup table td th tr tt u ul var
""".split())
# Replacements by tag: tags vanish by default,
# while some are turned into paragraph breaks.
_qtrich_subs = {
    "_nows" : ("", "", None),
    "_parabr": (WS_NEWLINE*2, WS_NEWLINE*2, None),
}
_qtrich_subs.update(dict.fromkeys(_qtrich_tags, _qtrich_subs["_nows"]))
_qtrich_subs.update(dict.fromkeys(
    "br dd dl dt h1 h2 h3 h4 h5 h6 hr li p pre td th tr".split(),
    _qtrich_subs["_parabr"]))
_qtrich_ents = { # in addition to default XML entities
    "nbsp": "\xa0",
}
# Elements in which whitespace is preserved.
_qtrich_keepws = {"code", "pre"}
# Elements which are removed completely.
_qtrich_ignels = {
    ("style", "text/css"),
}

def qtrich_to_plain (text):
    """
    Convert Qt rich-text markup to plain text.

    The conversion is done by L{xml_to_plain},
    set up with tags, replacements and entities known for Qt rich text.

    @param text: Qt rich text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    return xml_to_plain(text, tags=_qtrich_tags, subs=_qtrich_subs,
                        ents=_qtrich_ents, keepws=_qtrich_keepws,
                        ignels=_qtrich_ignels)
0516
0517
# Tags known to the KUIT conversion.
_kuit_tags = set("""
    kuit kuil title subtitle para list item note warning
    filename link application command resource icode bcode shortcut interface
    emphasis placeholder email envar message numid nl
""".split())
# Replacements by tag: tags vanish by default, while some are replaced
# with spaces, paragraph breaks or brackets.
_kuit_subs = {
    "_nows" : ("", "", None),
    "_parabr" : ("", WS_NEWLINE*2, None),
    "_ws" : (" ", " ", None),
    "_ui" : ("[", "]", None),
}
_kuit_subs.update(dict.fromkeys(_kuit_tags, _kuit_subs["_nows"]))
_kuit_subs.update(dict.fromkeys(
    "placeholder".split(),
    _kuit_subs["_ws"]))
_kuit_subs.update(dict.fromkeys(
    "title subtitle para item nl".split(),
    _kuit_subs["_parabr"]))
_kuit_subs.update(dict.fromkeys(
    "interface".split(),
    _kuit_subs["_ui"]))
_kuit_ents = { # in addition to default XML entities
}
# Elements in which whitespace is preserved.
_kuit_keepws = {"icode", "bcode"}
# Elements which are removed completely (none).
_kuit_ignels = set()

def kuit_to_plain (text):
    """
    Convert KUIT markup to plain text.

    The conversion is done by L{xml_to_plain},
    set up with tags and replacements known for KUIT.

    @param text: KUIT text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    return xml_to_plain(text, tags=_kuit_tags, subs=_kuit_subs,
                        ents=_kuit_ents, keepws=_kuit_keepws,
                        ignels=_kuit_ignels)
0558
0559
# Qt rich-text and KUIT definitions put together;
# where both define the same key, the KUIT one is taken.
_htkt_tags = _qtrich_tags | _kuit_tags
_htkt_subs = {**_qtrich_subs, **_kuit_subs}
_htkt_ents = {**_qtrich_ents, **_kuit_ents}
_htkt_keepws = _qtrich_keepws | _kuit_keepws
_htkt_ignels = _qtrich_ignels | _kuit_ignels

def kde4_to_plain (text):
    """
    Convert KDE4 GUI markup to plain text.

    A KDE4 GUI text may contain Qt rich-text markup, KUIT markup,
    or both of them mixed together.
    In general it is not possible to get the plain text by converting
    first one markup and then the other, in either order.
    For example, if the text has C{&lt;} entity, after first conversion
    it will become plain C{<}, and interfere with second conversion.
    Therefore both markups are converted in a single pass here.

    @param text: KDE4 text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    return xml_to_plain(text, tags=_htkt_tags, subs=_htkt_subs,
                        ents=_htkt_ents, keepws=_htkt_keepws,
                        ignels=_htkt_ignels)
0586
0587
0588 # Assembled on first use.
0589 _dbk_tags = None
0590 _dbk_subs = None
0591 _dbk_ents = None
0592 _dbk_keepws = None
0593 _dbk_ignels = None
0594
0595 def _prep_docbook4_to_plain ():
0596
0597     global _dbk_tags, _dbk_subs, _dbk_ents, _dbk_keepws, _dbk_ignels
0598
0599     specpath = os.path.join(datadir(), "spec", "docbook4.l1")
0600     docbook4_l1 = collect_xml_spec_l1(specpath)
0601     _dbk_tags = set(docbook4_l1.keys())
0602
0603     _dbk_subs = {
0604         "_nows" : ("", "", None),
0605         "_parabr" : ("", WS_NEWLINE*2, None),
0606         "_ws" : (" ", " ", None),
0607         "_ui" : ("[", "]", None),
0608         "_uipath" : ("", "", lambda s: re.sub(r"\]\s*\[", "->", s, re.U)),
0609     }
0610     _dbk_subs.update([(x, _dbk_subs["_nows"]) for x in _dbk_tags])
0611     _dbk_subs.update([(x, _dbk_subs["_parabr"]) for x in
0612                       "para title".split()]) # FIXME: Add more.
0613     _dbk_subs.update([(x, _dbk_subs["_ws"]) for x in
0614                        "contrib address firstname placeholder surname "
0615                        "primary secondary "
0616                        "".split()])
0617     _dbk_subs.update([(x, _dbk_subs["_ui"]) for x in
0618                        "guilabel guibutton guiicon guimenu guisubmenu "
0619                        "guimenuitem "
0620                        "".split()])
0621     _dbk_subs.update([(x, _dbk_subs["_uipath"]) for x in
0622                        "menuchoice "
0623                        "".split()])
0624
0625     _dbk_ents = { # in addition to default XML entities
0626     }
0627
0628     _dbk_keepws = set("""
0629         screen programlisting
0630     """.split()) # FIXME: Add more.
0631
0632     _dbk_ignels = set([
0633     ])
0634
def docbook4_to_plain (text):
    """
    Convert Docbook 4.x markup to plain text.

    The data needed for the conversion is assembled on the first call.

    @param text: Docbook text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    # Known tags are still unset until the first call prepares them.
    if _dbk_tags is None:
        _prep_docbook4_to_plain()

    return xml_to_plain(text, tags=_dbk_tags, subs=_dbk_subs,
                        ents=_dbk_ents, keepws=_dbk_keepws,
                        ignels=_dbk_ignels)
0651
0652
def collect_xml_spec_l1 (specpath):
    """
    Collect lightweight XML format specification, level 1.

    Level 1 specification is the dictionary of all known tags,
    with allowed attributes and subtags for each.

    File of the level 1 specification is in the following format::

        # A comment.
        # Tag with unconstrained attributes and subtags:
        tagA;
        # Tag with constrained attributes and unconstrained subtags:
        tagF : attr1 attr2 ...;
        # Tag with unconstrained attributes and constrained subtags:
        tagF > stag1 stag2 ...;
        # Tag with constrained attributes and subtags:
        tagF : attr1 attr2 ... > stag1 stag2 ...;
        # Tag with no attributes and unconstrained subtags:
        tagA :;
        # Tag with unconstrained attributes and no subtags:
        tagA >;
        # Tag with no attributes and no subtags:
        tagA :>;
        # Attribute value constrained by a regular expression:
        .... attr1=/^(val1|val2|val3)$/i ...
        # Reserved dummy tag specifying attributes common to all tags:
        pe-common-attrib : attrX attrY;

    The specification can contain a dummy tag named C{pe-common-attrib},
    stating attributes which are common to all tags, instead of having to
    list them with each and every tag.
    To make an attribute mandatory, its name should be prefixed by
    exclamation sign (!).

    Specification file must be UTF-8 encoded.

    @param specpath: path to level 1 specification file
    @type specpath: string

    @return: level 1 specification
    @rtype: dict

    @raise PologyError: if the specification is not properly formatted
    """

    ch_comm = "#"
    ch_attr = ":"
    ch_attre = "="
    ch_mattr = "!"
    ch_stag = ">"
    ch_end = ";"

    dtag_attr = "pe-common-attrib"

    valid_tag_rx = re.compile(r"^[\w-]+$")
    valid_attr_rx = re.compile(r"^[\w-]+$")

    # Parsing contexts: tag name, attribute list,
    # attribute value constraint, subtag list.
    c_tag, c_attr, c_attre, c_stag = range(4)

    # Read the whole file, making sure that it gets closed.
    with codecs.open(specpath, "r", "UTF-8") as ifl:
        ifs = ifl.read()
    lenifs = len(ifs)

    # Current position in input: character index, line, column.
    pos = [0, 1, 1]

    def signal (msg, bpos):
        # Report the problem at given (line, column) by raising exception.

        emsg = _("@info \"L1-spec\" is shorthand for "
                 "\"level 1 specification\"",
                 "[L1-spec] %(file)s:%(line)d:%(col)d: %(msg)s",
                 file=specpath, line=bpos[0], col=bpos[1], msg=msg)
        raise PologyError(emsg)

    def advance (stoptest, cmnt=True):
        # Move through the input from the current position until
        # the stop test reports a separator or the input ends.
        # The stop test gets an index into the input, and returns
        # the separator found there, or None if there is none.
        # If cmnt is true, comments are skipped over.
        # Returns the collected text and the separator
        # (None if the end of input was reached).

        ind = pos[0]
        oind = ind
        substr = []
        sep = None
        while ind < lenifs and sep is None:
            if cmnt and ifs[ind] == ch_comm:
                # Skip to the end of comment line.
                ind = ifs.find("\n", ind)
                if ind < 0:
                    # The comment runs up to the end of input.
                    ind = lenifs
            else:
                sep = stoptest(ind)
                if sep is None:
                    substr.append(ifs[ind])
                    ind += 1
                else:
                    ind += len(sep)

        # Update the position, counting lines and columns passed.
        pos[0] = ind
        rawsubstr = ifs[oind:ind]
        p = rawsubstr.rfind("\n")
        if p >= 0:
            pos[1] += rawsubstr.count("\n")
            pos[2] = len(rawsubstr) - p
        else:
            pos[2] += len(rawsubstr)

        return "".join(substr), sep

    def make_rx_lint (rx_str, rx_flags, wch, lincol):
        # Build the attribute value check out of the regular expression.
        try:
            rx = re.compile(rx_str, rx_flags)
        except Exception:
            signal(_("@info the regex is already quoted when inserted",
                     "Cannot compile regular expression %(regex)s.",
                     regex=(wch + rx_str + wch)),
                     lincol)
        return lambda x: rx.search(x) is not None

    spec = {}
    ctx = c_tag
    entry = None
    while pos[0] < lenifs:
        if ctx == c_tag:
            lincol = tuple(pos[1:])
            t = lambda i: (ifs[i] if ifs[i] in (ch_attr, ch_stag, ch_end)
                           else None)
            tag, sep = advance(t)
            tag = tag.strip()
            if tag:
                if sep is None:
                    signal(_("@info",
                             "Entry not terminated after the initial tag."),
                           lincol)
                if not valid_tag_rx.search(tag) and tag != dtag_attr:
                    signal(_("@info",
                             "Invalid tag name '%(tag)s'.", tag=tag),
                             lincol)
                entry = _L1Element(tag)
                spec[tag] = entry

            if sep == ch_attr:
                ctx = c_attr
            elif sep == ch_stag:
                ctx = c_stag
            elif sep == ch_end:
                ctx = c_tag
            else:
                # End of input, with nothing but whitespace and comments
                # after the last entry.
                break

        elif ctx == c_attr:
            if entry.attrs is None:
                entry.attrs = set()

            lincol = tuple(pos[1:])
            t = lambda i: (ifs[i] if (   ifs[i].isspace()
                                      or ifs[i] in (ch_attre, ch_stag, ch_end))
                           else None)
            attr, sep = advance(t)
            attr = attr.strip()
            if attr:
                if attr.startswith(ch_mattr):
                    attr = attr[len(ch_mattr):]
                    entry.mattrs.add(attr)
                if attr in entry.attrs:
                    signal(_("@info",
                             "Duplicate attribute '%(attr)s'.", attr=attr),
                             lincol)
                if not valid_attr_rx.search(attr):
                    signal(_("@info",
                             "Invalid attribute name '%(attr)s'.", attr=attr),
                             lincol)
                entry.attrs.add(attr)
                lastattr = attr

            if sep is None:
                # End of input before the entry was closed.
                signal(_("@info",
                         "Entry not terminated after the attribute list."),
                       lincol)
            elif sep.isspace():
                ctx = c_attr
            elif sep == ch_attre:
                ctx = c_attre
            elif sep == ch_stag:
                ctx = c_stag
            elif sep == ch_end:
                ctx = c_tag

        elif ctx == c_attre:
            lincol = tuple(pos[1:])
            # The first non-whitespace character is the regex delimiter.
            t = lambda i: ifs[i] if not ifs[i].isspace() else None
            wch = advance(t)[1]
            if wch is None:
                signal(_("@info",
                         "End of input inside the value constraint."),
                       lincol)
            # The regex runs up to the next delimiter; a comment character
            # inside of it is part of the regex.
            t = lambda i: ifs[i] if ifs[i] == wch else None
            rx_str, sep = advance(t, cmnt=False)
            if sep is None:
                signal(_("@info",
                         "End of input inside the value constraint."),
                       lincol)
            # Flags are the letters right after the closing delimiter;
            # the empty separator stops at the first other character
            # without consuming it.
            t = lambda i: "" if not ifs[i].isalpha() else None
            rx_flag_spec, sep = advance(t)
            rx_flags = re.U
            seen_flags = set()
            lincol = tuple(pos[1:])
            for c in rx_flag_spec:
                if c in seen_flags:
                    signal(_("@info",
                             "Regex flag '%(flag)s' is already issued.",
                             flag=c), lincol)
                if c == "i":
                    rx_flags |= re.I
                else:
                    signal(_("@info",
                             "Unknown regex flag '%(flag)s'.", flag=c),
                             lincol)
                seen_flags.add(c)
            entry.avlints[lastattr] = make_rx_lint(rx_str, rx_flags,
                                                   wch, lincol)
            ctx = c_attr

        elif ctx == c_stag:
            if entry.stags is None:
                entry.stags = set()

            lincol = tuple(pos[1:])
            t = lambda i: (ifs[i] if (ifs[i].isspace() or ifs[i] == ch_end)
                           else None)
            stag, sep = advance(t)
            stag = stag.strip()
            if stag:
                if stag in entry.stags:
                    signal(_("@info",
                             "Repeated subtag '%(tag)s'.", tag=stag),
                             lincol)
                entry.stags.add(stag)

            if sep is None:
                # End of input before the entry was closed.
                signal(_("@info",
                         "Entry not terminated after the subtag list."),
                       lincol)
            elif sep == ch_end:
                ctx = c_tag
            # Otherwise the separator is whitespace between subtags,
            # so the subtag list goes on.

    # Add common attributes to each tag.
    dentry_attr = spec.pop(dtag_attr, None)
    if dentry_attr:
        # The dummy tag may have been given without an attribute list.
        for attr in dentry_attr.attrs or ():
            attre = dentry_attr.avlints.get(attr)
            for entry in spec.values():
                if entry.attrs is None:
                    entry.attrs = set()
                if attr not in entry.attrs:
                    entry.attrs.add(attr)
                    if attre:
                        entry.avlints[attr] = attre

    return spec
0904
0905
class _L1Element:
    """
    Level-1 specification entry for a single markup element.

    Records the tag, the attributes the element may and must carry,
    optional validators of attribute values, and the allowed subelements.
    """

    def __init__ (self, tag=None, attrs=None, mattrs=None, avlints=None,
                  stags=None):

        # Tag name of the element (string).
        self.tag = tag

        # Attributes the element may have (set); None stands for "any".
        self.attrs = attrs

        # Attributes the element must have (set);
        # a missing or empty value is replaced by a new empty set.
        self.mattrs = mattrs if mattrs else set()

        # Validators of attribute values, keyed by attribute name (dict);
        # an attribute is not required to have a validator.
        # A missing or empty value is replaced by a new empty dictionary.
        self.avlints = avlints if avlints else {}

        # Tags allowed as subelements (set); None stands for "any".
        self.stags = stags
0922
0923
# Simplified pattern for an XML entity name, as found between
# the ampersand and the semicolon (named or decimal numeric).
_simple_ent_rx = re.compile(r"^([\w.:-]+|#[0-9]+)$", re.UNICODE)

# Pattern for the line/column segment within a parser error message.
_lin_col_rx = re.compile(r":\s*line\s*\d+,\s*column\s*\d+", re.IGNORECASE)

# Tag with which texts are wrapped before parsing, to provide a top element.
_dummy_top = "_"


# Plain attribute container; the single instance below carries the state
# shared between validate_xml_l1() and the parser handlers.
class _Global:
    pass

_g_xml_l1 = _Global()
0937
def validate_xml_l1 (text, spec=None, xmlfmt=None, ents=None,
                     casesens=True, accelamp=False):
    """
    Validate XML markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    Text is not required to have a top tag; it is always wrapped into
    a dummy one before parsing, to assure that the check passes.

    If C{spec} is C{None}, text is only checked to be well-formed.

    If C{ents} are C{None}, entities in the text are ignored by the check;
    otherwise, an entity not belonging to the known set is considered erroneous.
    Default XML entities (C{&lt;}, C{&gt;}, C{&amp;}, C{&quot;}, C{&apos;})
    are automatically added to the set of known entities.

    Tag and attribute names can be made case-insensitive by setting
    C{casesens} to C{False}.

    If text is a part of user interface, and the environment may use
    the literal ampersand as accelerator marker, it can be allowed to pass
    the check by setting C{accelamp} to C{True}.

    Text can be one or more entity definitions of the form C{<!ENTITY ...>},
    when special check is applied.

    The result of the check is list of erroneous spans in the text,
    each given by start and end index (in Python standard semantics),
    and the error description, packed in a tuple.
    If there are no errors, empty list is returned.
    Reported spans need not be formally complete with respect to the error
    location, but are heuristically determined to be short and
    provide good visual indication of what triggers the error.

    The parsing state is kept in a module-level object which is
    reinitialized on every call.

    @param text: text to check
    @type text: string
    @param spec: markup definition
    @type spec: L{level1<collect_xml_spec_l1>} specification
    @param xmlfmt: name of the particular XML format (for error messages);
        C{"XML"} is used if not given
    @type xmlfmt: string
    @param ents: set of known entities
    @type ents: sequence
    @param casesens: whether tag and attribute names are case-sensitive
    @type casesens: bool
    @param accelamp: whether to allow ampersand as accelerator marker
    @type accelamp: bool

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    # Resolve the format name up front, so that every code path
    # (including the entity definition check) reports a proper name
    # instead of "None".
    fmtname = xmlfmt or "XML"

    if text.lstrip().startswith("<!ENTITY"):
        return _validate_xml_entdef(text, fmtname)

    # If ampersand accelerator marker is allowed, replace one in non-entity
    # position with &amp;, to let the parser proceed.
    text_orig = text
    if accelamp:
        text = _escape_amp_accel(text)

    # Make sure the text has a top tag.
    text = "<%s>%s</%s>" % (_dummy_top, text, _dummy_top)

    # Prepare parser.
    xenc = "UTF-8"
    parser = xml.parsers.expat.ParserCreate(xenc)
    parser.UseForeignDTD() # not to barf on non-default XML entities
    parser.StartElementHandler = _handler_start_element
    parser.DefaultHandler = _handler_default

    # Link state for handlers.
    g = _g_xml_l1
    g.text = text
    g.spec = spec
    g.xmlfmt = fmtname
    g.ents = ents
    g.casesens = casesens
    g.xenc = xenc
    g.parser = parser
    g.errcnt = 0
    g.spans = []
    g.tagstack = []

    # The start handler pushes checked elements onto the tag stack;
    # they must be taken off when closed, or else the parentage check
    # would compare against the most recently opened element instead of
    # the real parent. The start handler does not push every element
    # (e.g. unrecognized ones), hence pop only on matching top.
    def handler_end_element (tag):
        if not casesens:
            tag = tag.lower()
        if g.tagstack and g.tagstack[-1] == tag:
            g.tagstack.pop()

    parser.EndElementHandler = handler_end_element

    # Parse and check.
    try:
        parser.Parse(text.encode(xenc), True)
    except xml.parsers.expat.ExpatError as e:
        errmsg = _("@info a problem in the given type of markup "
                   "(e.g. HTML, Docbook)",
                   "%(mtype)s markup: %(snippet)s.",
                   mtype=g.xmlfmt, snippet=e.args[0])
        span = _make_span(text, e.lineno, e.offset, errmsg)
        g.spans.append(span)

    # Adapt spans back to original text.
    pure_spans = [x[:2] for x in g.spans]
    pure_spans = adapt_spans(text_orig, text, pure_spans, merge=False)
    # Remove unhelpful line/column in error messages.
    errmsgs = []
    for errmsg, span in zip([x[2] for x in g.spans], pure_spans):
        m = _lin_col_rx.search(errmsg)
        if m:
            errmsg = errmsg[:m.start()] + errmsg[m.end():]
        errmsgs.append(errmsg)
    # Put spans back together.
    g.spans = [x + (y,) for x, y in zip(pure_spans, errmsgs)]

    return g.spans
1046
1047
# Separator after which the text is treated as the scripted part
# (see the in_script handling in _escape_amp_accel).
_ts_fence = "|/|"

def _escape_amp_accel (text):
    """
    Replace ampersands used as accelerator markers with C{&amp;}.

    An ampersand is taken to be an accelerator marker when it does not
    start something looking like an entity reference, and is followed
    by an alphanumeric character or another ampersand.
    A doubled ampersand (self-escaped marker) is escaped as a pair.
    Before the C{_ts_fence} separator only the first single marker is
    escaped and any later single ones are left as they are;
    after the separator every marker is escaped.

    Scanning stops at the first ampersand which is not judged to be
    a marker and has no semicolon anywhere after it.

    @param text: text in which to escape markers
    @type text: string

    @returns: text with accelerator markers escaped
    @rtype: string
    """

    # NOTE(review): the separator position is computed once on the
    # unmodified text and not adjusted as escapes lengthen the text.
    p_ts = text.find(_ts_fence)
    in_script = False

    p1 = 0
    found_accel = False
    while True:

        # Bracket possible entity reference.
        p1 = text.find("&", p1)
        if p1 < 0:
            break
        # On first ampersand past the separator, switch to the scripted
        # part and start counting markers anew.
        if not in_script and p_ts >= 0 and p1 > p_ts:
            in_script = True
            found_accel = False
        p2 = text.find(";", p1)

        # An accelerator marker if no semicolon in rest of the text
        # or the bracketed segment does not look like an entity,
        # and it is in front of an alphanumeric or itself.
        # (Slicing yields empty string if the ampersand is the last character.)
        nc = text[p1 + 1:p1 + 2]
        if (    (p2 < 0 or not _simple_ent_rx.match(text[p1 + 1:p2]))
            and (nc.isalnum() or nc == "&")
        ):
            # Check if the next one is an ampersand too,
            # i.e. if it's a self-escaped accelerator marker.
            namp = 1
            if (    text[p1 + 1:p1 + 2] == "&"
                and not _simple_ent_rx.match(text[p1 + 2:p2])
            ):
                namp += 1

            # Escape the marker if first or self-escaped,
            # or currently in scripted part (in which there can be
            # any number of non-escaped markers).
            if not found_accel or namp > 1 or in_script:
                escseg = "&amp;" * namp
                text = text[:p1] + escseg + text[p1 + namp:]
                # Continue the search after the inserted escape.
                p1 += len(escseg)
                if namp == 1:
                    found_accel = True
            else:
                # Leave the marker as it is and move past it.
                p1 += namp

        elif p2 > p1:
            # Not a marker: skip to the semicolon which was found.
            p1 = p2
        else:
            # Not a marker and no semicolon ahead: give up scanning.
            break

    return text
1101
1102
def _handler_start_element (tag, attrs):
    """
    Parser callback for an opening tag: check the element against
    the specification held in the shared state.

    Nothing is checked if there is no specification. Otherwise the tag
    must be known, its attributes allowed and with valid values,
    mandatory attributes present, and the tag allowed under the element
    currently on top of the tag stack. Each problem found is recorded
    as a span in the shared state.

    @param tag: name of the element
    @type tag: string
    @param attrs: attributes of the element
    @type attrs: dict
    """

    state = _g_xml_l1

    if state.spec is None:
        return

    # Normalize names to lower case if allowed.
    if not state.casesens:
        tag = tag.lower()
        attrs = {name.lower(): value for name, value in attrs.items()}

    # The dummy top tag is never subject to checks.
    if tag == _dummy_top:
        return

    # An unknown tag is reported, and nothing more is checked about it.
    if tag not in state.spec:
        errmsg = _("@info",
                   "%(mtype)s markup: unrecognized tag '%(tag)s'.",
                   mtype=state.xmlfmt, tag=tag)
        state.spans.append(_make_span(state.text,
                                      state.parser.CurrentLineNumber,
                                      state.parser.CurrentColumnNumber + 1,
                                      errmsg))
        return

    elspec = state.spec[tag]
    problems = []

    # Attributes must be among the allowed ones (when restricted),
    # and their values must pass the validator, if there is one.
    allowed = elspec.attrs
    if allowed is not None:
        for name, value in attrs.items():
            if name in allowed:
                lint = elspec.avlints.get(name)
                if lint and not lint(value):
                    problems.append(_("@info",
                                      "%(mtype)s markup: invalid value "
                                      "'%(val)s' to attribute '%(attr)s'.",
                                      mtype=state.xmlfmt, val=value,
                                      attr=name))
            else:
                problems.append(_("@info",
                                  "%(mtype)s markup: invalid attribute "
                                  "'%(attr)s' to tag '%(tag)s'.",
                                  mtype=state.xmlfmt, attr=name, tag=tag))

    # All mandatory attributes must be present.
    if elspec.mattrs is not None:
        for name in elspec.mattrs:
            if name not in attrs:
                problems.append(_("@info",
                                  "%(mtype)s markup: missing mandatory attribute "
                                  "'%(attr)s' to tag '%(tag)s'.",
                                  mtype=state.xmlfmt, attr=name, tag=tag))

    # The element on top of the stack must accept this tag as subtag.
    if state.tagstack:
        parent = state.tagstack[-1]
        pspec = state.spec.get(parent)
        restricted = pspec is not None and pspec.stags is not None
        if restricted and tag not in pspec.stags:
            problems.append(_("@info",
                              "%(mtype)s markup: tag '%(tag1)s' cannot be "
                              "a subtag of '%(tag2)s'.",
                              mtype=state.xmlfmt, tag1=tag, tag2=parent))

    # Record element stack.
    state.tagstack.append(tag)

    # Turn collected messages into spans at the current parser position.
    state.spans.extend(
        _make_span(state.text, state.parser.CurrentLineNumber,
                   state.parser.CurrentColumnNumber + 1, errmsg)
        for errmsg in problems)
1175
1176
def _handler_default (text):
    """
    Parser callback for data without a dedicated handler:
    check entity references, if known entities were provided.

    Only text starting with ampersand and ending with semicolon
    is looked at. A numeric entity must be convertible to a character,
    and a named one must be among the known or default XML entities;
    otherwise a span with the problem is recorded in the shared state.

    @param text: the piece of text passed by the parser
    @type text: string
    """

    state = _g_xml_l1

    if state.ents is None:
        return
    if not (text.startswith('&') and text.endswith(';')):
        return

    ent = text[1:-1]
    errmsg = None
    if ent.startswith("#"):
        if nument_to_char(ent) is None:
            errmsg = _("@info",
                       "%(mtype)s markup: invalid numeric "
                       "entity '%(ent)s'.",
                       mtype=state.xmlfmt, ent=ent)
    elif ent not in state.ents and ent not in xml_entities:
        # Search for similar names is switched off,
        # so the message with suggestions is never produced at present.
        nearents = [] #difflib.get_close_matches(ent, g.ents)
        if not nearents:
            errmsg = _("@info",
                       "%(mtype)s markup: unknown entity '%(ent)s'.",
                       mtype=state.xmlfmt, ent=ent)
        else:
            if len(nearents) > 5: # do not overwhelm message
                fmtents = format_item_list(nearents[:5], incmp=True)
            else:
                fmtents = format_item_list(nearents)
            errmsg = _("@info",
                       "%(mtype)s markup: unknown entity '%(ent)s' "
                       "(suggestions: %(entlist)s).",
                       mtype=state.xmlfmt, ent=ent, entlist=fmtents)

    if errmsg is None:
        return

    state.spans.append(_make_span(state.text,
                                  state.parser.CurrentLineNumber,
                                  state.parser.CurrentColumnNumber + 1,
                                  errmsg))
1210
1211
# Text to fetch from the reported error position in XML stream.
_near_xml_error_rx = re.compile(r"\W*[\w:.-]*[^\w\s>]*(\s*>)?", re.U)

def _make_span (text, lno, col, errmsg):
    """
    Build an error span around the given line/column position in text.

    The span covers some text following the position, as matched by
    C{_near_xml_error_rx}, and is extended backwards over alphanumeric
    characters. If the position cannot be located, the span covers
    the whole text.

    @param text: text in which the problem was found
    @type text: string
    @param lno: line number of the problem (1-based)
    @type lno: int
    @param col: column number of the problem
    @type col: int
    @param errmsg: description of the problem
    @type errmsg: string

    @returns: start index, end index and the description
    @rtype: (int, int, string)
    """

    # Find the index at which the given line starts.
    clno = 1
    p = 0
    while clno < lno:
        p = text.find("\n", p)
        if p < 0:
            # Text has fewer newline-separated lines than reported
            # (e.g. if line breaks are not newlines): span whole text.
            # The message must be included here as well, since callers
            # expect three-element spans.
            return (0, len(text), errmsg)
        p += 1
        clno += 1

    # Scoop some reasonable nearby text.
    m = _near_xml_error_rx.match(text, p + col - 1)
    if not m:
        return (0, len(text), errmsg)
    start, end = m.span()
    # Pull the start back over alphanumerics; the match may begin
    # at the very end of the text, where there is nothing to index.
    while 0 < start < len(text) and text[start].isalnum():
        start -= 1

    return (start, end, errmsg)
1240
1241
_entname_rx = re.compile(r"^([\w:][\w\d.:-]*)$", re.U)

def _validate_xml_entdef (text, xmlfmt):
    """
    Check text composed of entity definitions, C{<!ENTITY name "value">}.

    Definitions are scanned one after another, each in sequence of
    opening bracket, keyword, name, quoted value and closing bracket.
    Scanning stops at the first problem, so at most one span is reported.

    @param text: text to check
    @type text: string
    @param xmlfmt: name of the markup format (for error messages)
    @type xmlfmt: string

    @returns: erroneous spans in the text (empty list if none)
    @rtype: list of (int, int, string) tuples
    """

    tlen = len(text)
    keyword = "!ENTITY"
    quotes = ("'", "\"")

    # Index of the first non-whitespace character at or after i.
    def skip_space (i):
        while i < tlen and text[i].isspace():
            i += 1
        return i

    # Index of the first whitespace or stop character at or after i.
    def skip_token (i, stops=()):
        while i < tlen and not (text[i].isspace() or text[i] in stops):
            i += 1
        return i

    # Message for text which ends in the middle of a definition.
    def premature_end ():
        return _("@info",
                 "%(mtype)s markup: premature end of entity definition.",
                 mtype=xmlfmt)

    state = "void"
    pos = 0
    errmsg = None
    while not errmsg:
        # In every state, whitespace is skipped first.
        pos = skip_space(pos)

        if state == "void":
            # Between definitions; the text may end only here.
            if pos == tlen:
                break
            if text[pos] == "<":
                pos += 1
                state = "head"
            else:
                errmsg = _("@info",
                           "%(mtype)s markup: expected opening angle bracket "
                           "in entity definition.",
                           mtype=xmlfmt)
                pos1 = pos + 1

        elif state == "head":
            if pos == tlen:
                errmsg = premature_end()
                pos1 = tlen
            else:
                pos1 = skip_token(pos)
                if text[pos:pos1] == keyword:
                    pos = pos1
                    state = "name"
                else:
                    errmsg = _("@info",
                               "%(mtype)s markup: expected '%(keyword)s' "
                               "in entity definition.",
                               mtype=xmlfmt, keyword=keyword)

        elif state == "name":
            # The name runs up to whitespace or opening quote of the value.
            pos1 = skip_token(pos, quotes)
            name = text[pos:pos1]
            if _entname_rx.match(name):
                pos = pos1
                state = "value"
            else:
                errmsg = _("@info",
                           "%(mtype)s markup: invalid entity name '%(name)s' "
                           "in entity definition.",
                           mtype=xmlfmt, name=name)

        elif state == "value":
            if pos == tlen:
                errmsg = premature_end()
                pos1 = tlen
            elif text[pos] in quotes:
                # The value ends at the next quote of the same kind.
                pos1 = text.find(text[pos], pos + 1)
                if pos1 >= 0:
                    # FIXME: Validate value? Does not have to be valid
                    # on its own, in principle.
                    pos = pos1 + 1
                    state = "tail"
                else:
                    errmsg = _("@info",
                               "%(mtype)s markup: unclosed entity value "
                               "in entity definition.",
                               mtype=xmlfmt)
                    pos1 = tlen
            else:
                errmsg = _("@info",
                           "%(mtype)s markup: expected opening quote "
                           "(ASCII single or double) in entity definition.",
                           mtype=xmlfmt)
                pos1 = pos + 1

        elif state == "tail":
            if pos == tlen:
                errmsg = premature_end()
                pos1 = tlen
            elif text[pos] == ">":
                pos += 1
                state = "void"
            else:
                errmsg = _("@info",
                           "%(mtype)s markup: expected closing angle bracket "
                           "in entity definition.",
                           mtype=xmlfmt)
                pos1 = pos + 1

    if errmsg:
        return [(pos, pos1, errmsg)]
    return []
1357
1358
def check_xml (strict=False, entities={}, mkeyw=None):
    """
    Check general XML markup in translation [hook factory].

    The translation is checked for being well-formed XML and,
    depending on C{entities}, for use of unknown entities.
    Problems found are reported to stdout.

    By default the C{msgstr} is checked only if the C{msgid} (and the
    C{msgid_plural}, if any) is valid itself; with C{strict} set,
    the C{msgstr} is checked regardless of validity of the original.

    The C{entities} parameter provides entities known in addition to
    XML's default ones (C{&lt;}, etc.). Its meaning depends on the type:
      - C{None}: unknown entities are ignored on checking
      - string: a general function evaluation
        L{request<getfunc.get_result_ireq>}, the result of which is
        expected to be (name, value) dictionary-like object
      - anything else: taken to be a (name, value) dictionary

    The check is skipped for a message having the
    L{sieve flag<pology.sieve.parse_sieve_flags>} C{no-check-markup}.
    If C{mkeyw} gives one or several markup keywords, the check is also
    skipped for every message in a catalog which does not report any of
    these keywords by its L{markup()<catalog.Catalog.markup>} method.
    See L{set_markup()<catalog.Catalog.set_markup>} for list of
    markup keywords recognized at the moment.

    @param strict: whether to require valid C{msgstr} even if C{msgid} is not
    @type strict: bool
    @param entities: additional entities to consider as known
    @type entities: C{None}, dict, or string
    @param mkeyw: markup keywords for taking catalogs into account
    @type mkeyw: string or list of strings

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    # Number of errors is wanted from the hook, not the spans.
    return _check_xml_w(check=validate_xml_l1, strict=strict,
                        entities=entities, mkeyw=mkeyw, spanrep=False)
1399
1400
def check_xml_sp (strict=False, entities={}, mkeyw=None):
    """
    Check general XML markup in translation, returning erroneous spans
    rather than reporting problems to stdout [hook factory].

    Parameters are the same as for L{check_xml}.

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    # Spans are wanted from the hook, not the number of errors.
    return _check_xml_w(check=validate_xml_l1, strict=strict,
                        entities=entities, mkeyw=mkeyw, spanrep=True)
1411
1412
# Worker for C{check_xml*} hook factories.
# Builds the hook which applies the given validation function to msgstr;
# the hook returns spans if spanrep is true, and otherwise reports
# problems and returns their number. Messages can be exempted by exact
# context or msgid (ignctxt, ignid) or by their prefixes (ignctxtsw, ignidsw).
def _check_xml_w (check, strict, entities, mkeyw, spanrep,
                  ignctxt=(), ignid=(), ignctxtsw=(), ignidsw=()):

    # Bring markup keywords to a set, accepting also a single string.
    if mkeyw is not None:
        mkeyw = set([mkeyw] if isinstance(mkeyw, str) else mkeyw)

    # What the hook returns when there is nothing to complain about.
    def no_problems ():
        if spanrep:
            return []
        return 0

    # Entities are resolved on first real use only, and then kept.
    resolved = []
    def known_entities ():
        if not resolved:
            resolved.append(_get_entities(entities))
        return resolved[0]

    def checkf (msgstr, msg, cat):

        # Skip catalogs which report none of the requested markup types.
        if mkeyw is not None:
            if not mkeyw.intersection(cat.markup() or set()):
                return no_problems()

        # Skip messages exempted by context or msgid.
        ctxt = msg.msgctxt
        if (   ctxt in ignctxt
            or msg.msgid in ignid
            or (ctxt is not None and ctxt.startswith(ignctxtsw))
            or msg.msgid.startswith(ignidsw)
        ):
            return no_problems()

        ents = known_entities()

        # Skip messages flagged manually.
        if flag_no_check_markup in manc_parse_flag_list(msg, "|"):
            return no_problems()

        # Unless strict, skip messages in which the original is invalid.
        if not strict:
            if (   check(msg.msgid, ents=ents)
                or check(msg.msgid_plural or "", ents=ents)
            ):
                return no_problems()

        spans = check(msgstr, ents=ents)
        if spanrep:
            return spans

        for span in spans:
            # Report only spans which carry a message.
            if span[2:]:
                report_on_msg(span[2], msg, cat)
        return len(spans)

    return checkf
1461
1462
# Entities already loaded, keyed by the entity specification string;
# saves repeated loading when several markup hooks use the same setup.
_loaded_entities_cache = {}

def _get_entities (entspec):
    """
    Resolve the entity specification into entities.

    A string is evaluated by L{get_result_ireq<getfunc.get_result_ireq>}
    and the result is cached under that string;
    any other value is returned as it is.

    @param entspec: entity specification
    @type entspec: string or any other object

    @returns: entities as resolved
    """

    # Anything but a string is taken to be the entities themselves.
    if not isinstance(entspec, str):
        return entspec

    entities = _loaded_entities_cache.get(entspec)
    if entities is None:
        entities = get_result_ireq(entspec)
        _loaded_entities_cache[entspec] = entities

    return entities
1480
1481
# Docbook 4.x specification, loaded on first use.
_docbook4_l1 = None

def validate_docbook4_l1 (text, ents=None):
    """
    Validate Docbook 4.x markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    The specification is read from the C{docbook4.l1} file when first
    needed. It is meant to cover also C{<placeholder-N/>} elements,
    with which C{xml2po} segments text when extracting markup documents
    into PO templates.

    The C{ents} parameter and the return value are as described
    for L{validate_xml_l1}.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _docbook4_l1

    if _docbook4_l1 is None:
        _docbook4_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "docbook4.l1"))

    return validate_xml_l1(text, spec=_docbook4_l1,
                           xmlfmt=_("@item markup type", "Docbook4"),
                           ents=ents)
1512
1513
# Messages in Docbook 4.x catalogs which are exempt from markup checks:
# by exact context (none at present), by exact msgid, and by msgid prefix.
_db4_meta_msgctxt = set()
_db4_meta_msgid = {"translator-credits"}
_db4_meta_msgid_sw = ("@@image:",)
1522
def check_docbook4 (strict=False, entities={}, mkeyw=None):
    """
    Check XML markup in translations of Docbook 4.x catalogs [hook factory].

    Parameters are the same as for L{check_xml}.
    Messages listed in the C{_db4_meta_*} collections are not checked.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    return _check_xml_w(check=validate_docbook4_l1, strict=strict,
                        entities=entities, mkeyw=mkeyw, spanrep=False,
                        ignctxt=_db4_meta_msgctxt, ignid=_db4_meta_msgid,
                        ignidsw=_db4_meta_msgid_sw)
1536
1537
def check_docbook4_sp (strict=False, entities={}, mkeyw=None):
    """
    Check XML markup in translations of Docbook 4.x catalogs, returning
    erroneous spans rather than reporting problems to stdout [hook factory].

    Parameters and exempted messages are the same as for L{check_docbook4}.

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    return _check_xml_w(check=validate_docbook4_l1, strict=strict,
                        entities=entities, mkeyw=mkeyw, spanrep=True,
                        ignctxt=_db4_meta_msgctxt, ignid=_db4_meta_msgid,
                        ignidsw=_db4_meta_msgid_sw)
1550
1551
def check_docbook4_msg (strict=False, entities={}, mkeyw=None):
    """
    Check for any known problem in translation in messages
    in Docbook 4.x catalogs [hook factory].

    Every C{msgstr} of the message is put through these checks:
      - Docbook markup
      - cross-message insertion placeholders

    Parameters are the same as for L{check_xml}.

    @return: type V4A hook
    @rtype: C{(msg, cat) -> parts}
    """

    markup_spans = check_docbook4_sp(strict, entities, mkeyw)

    def checkf (msg, cat):

        # One part per msgstr in which anything was found.
        highlight = []
        for i, msgstr in enumerate(msg.msgstr):
            spans = list(markup_spans(msgstr, msg, cat))
            spans += check_placeholder_els(msg.msgid, msgstr)
            if spans:
                highlight.append(("msgstr", i, spans))
        return highlight

    return checkf
1581
1582
# HTML entities, read from the html.entities file
# in the spec subdirectory of the data directory.
_entpath_html = os.path.join(datadir(), "spec", "html.entities")
html_entities = read_entities(_entpath_html)

# HTML specification, loaded on first use.
_html_l1 = None

def validate_html_l1 (text, ents=None):
    """
    Validate HTML markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    For now, HTML markup can be checked only when it is well-formed
    in the XML sense, although HTML allows omission of some closing tags.
    Names are matched case-insensitively, and the ampersand is allowed
    as accelerator marker.

    The C{ents} parameter and the return value are as described
    for L{validate_xml_l1}; if C{ents} are given, HTML entities
    are considered known as well.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _html_l1

    if _html_l1 is None:
        _html_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "html.l1"))

    # Leave None as it is, since it means that entities are not checked.
    known = None if ents is None else Multidict([ents, html_entities])

    return validate_xml_l1(text, spec=_html_l1,
                           xmlfmt=_("@item markup type", "HTML"),
                           ents=known, accelamp=True, casesens=False)
1619
1620
def check_html (strict=False, entities={}, mkeyw=None):
    """
    Check HTML markup in translations [hook factory].

    Parameters are the same as for L{check_xml}.
    Limitations of checking HTML markup are noted in L{validate_html_l1}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    return _check_xml_w(check=validate_html_l1, strict=strict,
                        entities=entities, mkeyw=mkeyw, spanrep=False)
1633
1634
def check_html_sp (strict=False, entities={}, mkeyw=None):
    """
    Check HTML markup in translations, returning erroneous spans
    rather than reporting problems to stdout [hook factory].

    Parameters are the same as for L{check_html}.

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    return _check_xml_w(check=validate_html_l1, strict=strict,
                        entities=entities, mkeyw=mkeyw, spanrep=True)
1645
1646
# Qt rich-text specification, loaded on first use.
_qtrich_l1 = None

def validate_qtrich_l1 (text, ents=None):
    """
    Validate Qt rich-text markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    For now, Qt rich-text can be checked only when it is well-formed
    in the XML sense, although Qt rich-text allows HTML-type omission of
    closing tags.
    Names are matched case-insensitively, and the ampersand is allowed
    as accelerator marker.

    The C{ents} parameter and the return value are as described
    for L{validate_xml_l1}; if C{ents} are given, HTML entities
    are considered known as well.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _qtrich_l1

    if _qtrich_l1 is None:
        _qtrich_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "qtrich.l1"))

    # Leave None as it is, since it means that entities are not checked.
    known = None if ents is None else Multidict([ents, html_entities])

    return validate_xml_l1(text, spec=_qtrich_l1,
                           xmlfmt=_("@item markup type", "Qt-rich"),
                           ents=known, accelamp=True, casesens=False)
1681
1682
def check_qtrich (strict=False, entities={}, mkeyw=None):
    """
    Check Qt rich-text markup in translations [hook factory].

    Parameters are the same as for L{check_xml}.
    Limitations of checking Qt rich-text are noted in L{validate_qtrich_l1}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    return _check_xml_w(check=validate_qtrich_l1, strict=strict,
                        entities=entities, mkeyw=mkeyw, spanrep=False)
1695
1696
def check_qtrich_sp (strict=False, entities={}, mkeyw=None):
    """
    Check Qt rich-text markup in translations, returning erroneous spans
    rather than reporting problems to stdout [hook factory].

    Parameters are the same as for L{check_qtrich}.

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    return _check_xml_w(check=validate_qtrich_l1, strict=strict,
                        entities=entities, mkeyw=mkeyw, spanrep=True)
1707
1708
# KUIT entities, read from the kuit.entities file
# in the spec subdirectory of the data directory.
_entpath_kuit = os.path.join(datadir(), "spec", "kuit.entities")
kuit_entities = read_entities(_entpath_kuit)

# KUIT specification, loaded on first use.
_kuit_l1 = None

def validate_kuit_l1 (text, ents=None):
    """
    Validate KUIT markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    KUIT is the semantic markup for user interface in KDE4.
    The ampersand is allowed as accelerator marker.

    The C{ents} parameter and the return value are as described
    for L{validate_xml_l1}; if C{ents} are given, KUIT entities
    are considered known as well.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _kuit_l1

    if _kuit_l1 is None:
        _kuit_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "kuit.l1"))

    # Leave None as it is, since it means that entities are not checked.
    known = None if ents is None else Multidict([ents, kuit_entities])

    return validate_xml_l1(text, spec=_kuit_l1,
                           xmlfmt=_("@item markup type", "KUIT"),
                           ents=known, accelamp=True)
1744
1745
# Combined Qt rich-text and KUIT level-1 specification and entities,
# assembled on first call to validate_kde4_l1.
_kde4_l1 = None
_kde4_ents = None

def validate_kde4_l1 (text, ents=None):
    """
    Validate markup in texts used in KDE4 GUI.

    KDE4 GUI texts may contain both Qt rich-text and KUIT markup,
    even mixed in the same text.

    On the first call, the specification is assembled from
    C{spec/qtrich.l1} and C{spec/kuit.l1} in the data directory
    (the latter taking precedence on same keys), and the entities from
    HTML and KUIT entities (again the latter taking precedence).
    Both are cached only once completely assembled, so that a failure
    while loading does not leave partial data behind for later calls.

    If C{ents} is given, it is combined with the assembled entities before
    validation; if it is C{None}, it is passed on as such.

    See L{validate_xml_l1} for description of the C{ents} parameter
    and the return value.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _kde4_l1, _kde4_ents
    if _kde4_l1 is None:
        # Build into locals first; if anything below raises, the module
        # level caches stay None and the next call retries from scratch.
        spec = {}
        for specname in ("qtrich.l1", "kuit.l1"):
            specpath = os.path.join(datadir(), "spec", specname)
            spec.update(collect_xml_spec_l1(specpath))
        known = {}
        known.update(html_entities)
        known.update(kuit_entities)
        # Set the guard variable last.
        _kde4_ents = known
        _kde4_l1 = spec

    if ents is not None:
        ents = Multidict([ents, _kde4_ents])

    xmlfmt = _("@item markup type", "KDE4")
    return validate_xml_l1(text, spec=_kde4_l1, xmlfmt=xmlfmt, ents=ents,
                           accelamp=True, casesens=False)
1785
1786
def check_kde4 (strict=False, entities={}, mkeyw=None):
    """
    Build a hook which checks XML markup in translations
    of KDE4 UI catalogs [hook factory].

    Parameters are as described in L{check_xml}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    # The last argument picks the hook flavor: False here for the
    # reporting hook, whereas check_kde4_sp passes True to get spans.
    spanrep = False
    return _check_xml_w(validate_kde4_l1, strict, entities, mkeyw, spanrep)
1798
1799
def check_kde4_sp (strict=False, entities={}, mkeyw=None):
    """
    Span-returning counterpart of L{check_kde4}: erroneous spans
    are handed back to the caller instead of problems being reported
    to stdout [hook factory].

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    # The last argument picks the hook flavor: True here for the
    # span-returning hook, whereas check_kde4 passes False.
    spanrep = True
    return _check_xml_w(validate_kde4_l1, strict, entities, mkeyw, spanrep)
1810
1811
# Pango level-1 specification, collected on first call to validate_pango_l1.
_pango_l1 = None

def validate_pango_l1 (text, ents=None):
    """
    Validate Pango markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    The specification is collected from C{spec/pango.l1} in the data
    directory on the first call, and reused afterwards.
    If C{ents} is given, it is combined with HTML entities before
    validation; if it is C{None}, it is passed on as such.

    See L{validate_xml_l1} for description of the C{ents} parameter
    and the return value.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _pango_l1
    if _pango_l1 is None:
        _pango_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "pango.l1"))

    # Extend caller's entities with those of HTML, but leave None as is.
    known_ents = None if ents is None else Multidict([ents, html_entities])

    return validate_xml_l1(text, spec=_pango_l1,
                           xmlfmt=_("@item markup type", "Pango"),
                           ents=known_ents, accelamp=True, casesens=False)
1842
1843
def check_pango (strict=False, entities={}, mkeyw=None):
    """
    Build a hook which checks XML markup in translations
    of Pango UI catalogs [hook factory].

    Parameters are as described in L{check_xml}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    # The last argument picks the hook flavor: False here for the
    # reporting hook, whereas check_pango_sp passes True to get spans.
    spanrep = False
    return _check_xml_w(validate_pango_l1, strict, entities, mkeyw, spanrep)
1855
1856
def check_pango_sp (strict=False, entities={}, mkeyw=None):
    """
    Span-returning counterpart of L{check_pango}: erroneous spans
    are handed back to the caller instead of problems being reported
    to stdout [hook factory].

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    # The last argument picks the hook flavor: True here for the
    # span-returning hook, whereas check_pango passes False.
    spanrep = True
    return _check_xml_w(validate_pango_l1, strict, entities, mkeyw, spanrep)
1867
1868
1869
1870
# Digits allowed in decimal and hexadecimal numeric entities.
_digits_dec = set("0123456789")
_digits_hex = set("0123456789abcdefABCDEF")

def nument_to_char (nument):
    """
    Convert numeric XML entity to character.

    Numeric XML entities can be decimal, C{&#DDDD;}, or hexadecimal,
    C{&#xHHHH;}, where C{D} and C{H} stand for number system's digits.
    There can be any number of digits, so long as the number they
    represent is a code point which can be converted to a character
    (e.g. C{&#10003;} and C{&#x1F600;} are both accepted).

    The leading C{&} and the trailing C{;} are each optional,
    and stripped independently of each other.

    If the entity cannot be converted to a character, for whatever reason,
    C{None} is reported.

    @param nument: numeric entity, with or without C{&} and C{;}
    @type nument: string

    @return: character represented by the entity
    @rtype: string or None
    """

    # Strip delimiters one by one, so that a missing semicolon
    # does not cause the last digit to be eaten instead.
    if nument.startswith("&"):
        nument = nument[1:]
    if nument.endswith(";"):
        nument = nument[:-1]

    if not nument.startswith("#"):
        return None

    if nument[1:2] == "x":
        known_digits, numstr, base = _digits_hex, nument[2:], 16
    else:
        known_digits, numstr, base = _digits_dec, nument[1:], 10

    # Explicit digit check, since int() would also accept signs,
    # whitespace, underscores and non-ASCII digits.
    if not numstr or not known_digits.issuperset(numstr):
        return None

    try:
        return chr(int(numstr, base))
    except (ValueError, OverflowError):
        # Number outside of the range that chr() accepts
        # (or too long for int() to convert).
        return None
1915
1916
def validate_xmlents (text, ents={}, default=False, numeric=False):
    """
    Check whether XML-like entities in the text are among known.

    The text does not have to be XML markup as such.
    No XML parsing is performed, only the raw search for XML-like entities.

    The text is scanned for ampersands; where an ampersand starts
    an entity reference, the entity is checked, and otherwise
    the ampersand is skipped as ordinary text.
    If C{numeric} is set, entities starting with C{#} are checked for
    being convertible to a character (see L{nument_to_char}).
    All other entities must be found in C{ents},
    or among default XML entities if C{default} is set.

    Each reported span covers one erroneous entity reference in the text,
    from the ampersand to the end of the reference.

    @param text: text with entities to check
    @type text: string
    @param ents: known entities
    @type ents: sequence
    @param default: whether default XML entities are allowed (C{&amp;}, etc.)
    @type default: bool
    @param numeric: whether numeric character entities are allowed
    @type numeric: bool

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    spans = []

    pos = 0
    while True:
        start = text.find("&", pos)
        if start < 0:
            break

        m = _entity_rx.match(text, start)
        if m is None:
            # Ampersand which does not start an entity, step over it.
            pos = start + 1
            continue
        pos = m.end()

        ent = m.group(1)
        errmsg = None
        if numeric and ent.startswith("#"):
            if nument_to_char(ent) is None:
                errmsg = _("@info",
                           "Invalid numeric entity '%(ent)s'.",
                           ent=ent)
        elif ent not in ents and not (default and ent in xml_entities):
            # NOTE: No similarly named known entities are suggested here.
            errmsg = _("@info",
                       "Unknown entity '%(ent)s'.",
                       ent=ent)

        if errmsg is not None:
            spans.append((start, pos, errmsg))

    return spans
1977
1978
def check_xmlents (strict=False, entities={}, mkeyw=None,
                   default=False, numeric=False):
    """
    Build a hook which checks existence of XML entities
    in translations [hook factory].

    Parameters C{strict}, C{entities}, and C{mkeyw} are as described
    in L{check_xml}. Parameters C{default} and C{numeric} are
    as described in L{validate_xmlents}, where general notes
    on checking entities can be found as well.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    # Validator of (text, ents) signature, with entity-checking
    # options of this factory call bound into it.
    def validate (text, ents):
        return validate_xmlents(text, ents, default, numeric)

    # The last argument picks the hook flavor: False here for the
    # reporting hook, whereas check_xmlents_sp passes True to get spans.
    spanrep = False
    return _check_xml_w(validate, strict, entities, mkeyw, spanrep)
1996
1997
def check_xmlents_sp (strict=False, entities={}, mkeyw=None,
                      default=False, numeric=False):
    """
    Span-returning counterpart of L{check_xmlents}: erroneous spans
    are handed back to the caller instead of problems being reported
    to stdout [hook factory].

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    # Validator of (text, ents) signature, with entity-checking
    # options of this factory call bound into it.
    def validate (text, ents):
        return validate_xmlents(text, ents, default, numeric)

    # The last argument picks the hook flavor: True here for the
    # span-returning hook, whereas check_xmlents passes False.
    spanrep = True
    return _check_xml_w(validate, strict, entities, mkeyw, spanrep)
2012
2013
# Matches a <placeholder-N/> element (whitespace tolerated inside
# the tag), capturing the number N.
_placeholder_el_rx = re.compile(r"<\s*placeholder-(\d+)\s*/\s*>")

def check_placeholder_els (orig, trans):
    """
    Check if sets of C{<placeholder-N/>} elements are matching between
    original and translated text.

    C{<placeholder-N/>} elements are added into text by C{xml2po},
    for finer segmentation of markup documents extracted into PO templates.

    Only the presence of each placeholder number is compared,
    not the number of times it appears nor its position.
    At most one problem is reported: placeholders missing in translation,
    or else placeholders superfluous in translation.
    The reported span is always C{(0, 0)}, since the problem cannot be
    attributed to a particular segment of the translation.

    See L{validate_xml_l1} for description of the return value.

    @param orig: original text
    @type orig: string
    @param trans: translated text
    @type trans: string

    @returns: erroneous spans in translation
    @rtype: list of (int, int, string) tuples
    """

    spans = []

    orig_plnums = set(_placeholder_el_rx.findall(orig))
    trans_plnums = set(_placeholder_el_rx.findall(trans))

    # Sort by number, so that the message is same from run to run.
    missing_plnums = sorted(orig_plnums - trans_plnums, key=int)
    extra_plnums = sorted(trans_plnums - orig_plnums, key=int)

    # Tags are handed to format_item_list as a list of items;
    # a single joined string would get listed character by character.
    if missing_plnums:
        tags = ["<placeholder-%s/>" % x for x in missing_plnums]
        errmsg = _("@info",
                   "Missing placeholder tags in translation: %(taglist)s.",
                   taglist=format_item_list(tags))
        spans.append((0, 0, errmsg))
    elif extra_plnums: # do not report both, single glitch may cause them
        tags = ["<placeholder-%s/>" % x for x in extra_plnums]
        errmsg = _("@info",
                   "Superfluous placeholder tags in translation: %(taglist)s.",
                   taglist=format_item_list(tags))
        spans.append((0, 0, errmsg))

    return spans
2060