File indexing completed on 2024-04-14 05:37:51

0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Convert and validate markup in text.
0005 
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009 
0010 import os
0011 import re
0012 import codecs
0013 import xml.parsers.expat
0014 import difflib
0015 
0016 from pology import PologyError, datadir, _, n_
0017 from pology.comments import manc_parse_flag_list
0018 from pology.diff import adapt_spans
0019 from pology.entities import read_entities
0020 from pology.getfunc import get_result_ireq
0021 from pology.msgreport import report_on_msg
0022 from pology.multi import Multidict
0023 from pology.report import format_item_list
0024 
0025 
# Pipe flag which, when manually added to a particular message,
# prevents the markup check from being applied to that message.
flag_no_check_markup = "no-check-markup"
0028 
0029 
# Two or more consecutive newlines (a paragraph break).
_nlgr_rx = re.compile(r"\n{2,}")
# Any group of ASCII whitespace.
_wsgr_rx = re.compile(r"\s+", re.ASCII)

def plain_to_unwrapped (text):
    """
    Unwrap hard-wrapped plain text.

    Runs of two or more newlines are taken to be paragraph breaks and
    come out as exactly two newlines; any other whitespace group
    (single newlines included) collapses into one space.
    Leading and trailing whitespace is dropped, both from the text
    as a whole and from each of its lines.

    @param text: text to unwrap
    @type text: string

    @returns: unwrapped text
    @rtype: string
    """

    # Placeholder for paragraph breaks; it has no whitespace in it,
    # so it survives the whitespace simplification below.
    para_mask = "\x04\x04"

    # Trim the text as a whole, then every line on its own.
    lines = text.strip().split("\n")
    trimmed = "\n".join(line.strip() for line in lines)

    # Hide paragraph breaks, squeeze what whitespace remains,
    # and finally bring the paragraph breaks back.
    masked = _nlgr_rx.sub(para_mask, trimmed)
    squeezed = _wsgr_rx.sub(" ", masked)
    return squeezed.replace(para_mask, "\n\n")
0065 
0066 
# Values of the entities predefined by XML, by entity name.
xml_entities = dict(
    lt="<",
    gt=">",
    apos="'",
    quot="\"",
    amp="&",
)

# Masks standing in for whitespace characters which must be preserved
# through whitespace simplification (no mask contains whitespace itself).
WS_SPACE = "\x04~sp"
WS_TAB = "\x04~tb"
WS_NEWLINE = "\x04~nl"
# Mask -> whitespace character, and the reverse mapping.
_ws_masks = {WS_SPACE: " ", WS_TAB: "\t", WS_NEWLINE: "\n"}
_ws_unmasks = {ws: mask for mask, ws in _ws_masks.items()}
0084 
def xml_to_plain (text, tags=None, subs={}, ents={}, keepws=set(),
                  ignels=set()):
    """
    Convert text with XML-like markup into plain text.

    Unless told otherwise, every tag is replaced with a single space;
    only the default XML entities (C{&lt;}, C{&gt;}, C{&amp;}, C{&quot;},
    C{&apos;}) are resolved, while all other entities stay as they are;
    each whitespace group is simplified to one space, and leading and
    trailing whitespace is removed.

    The C{tags} parameter restricts the conversion to the listed tag names
    (any sequence; it is turned into a set internally). Tags not listed
    are put back into the text verbatim.

    The C{subs} parameter states custom replacements. It is a dictionary
    keyed by tag name, with 3-tuples as values: the replacement for
    the opening tag, for the closing tag, and for the wrapped text.
    C{None} as the third element leaves the wrapped text alone.
    For example, C{{"i": ("/", "/", None)}} turns C{<i>foobar</i>} into
    C{/foobar/}, and C{{"code": ("", "", "@@@")}} turns any
    C{<code>...</code>} into the marker C{@@@}.
    The third element may also be a function from string to string,
    which is then applied to the converted wrapped text.
    Since whitespace gets simplified, any whitespace in replacements
    that must come out exactly as given should be written with
    the C{WS_*} string constants instead of plain whitespace characters.

    The C{ents} parameter is a dictionary of values for entities
    other than the default XML ones, which are then resolved as well.

    Elements whose tags are listed in C{keepws} have whitespace in their
    content preserved.

    Elements listed in C{ignels} are removed entirely. An item of that
    sequence is either a tag name, or a (tag, type) tuple where type is
    the value of the C{type} attribute of the element.

    The markup is expected to be well-formed. If it is not, the result
    is undefined, though a best attempt at conversion is made.

    Other functions in this module cover several well known markups,
    so that C{tags}, C{subs} and C{ents} need not be assembled manually.
    To only resolve entities from a known set, consider the more
    powerful L{pology.resolve.resolve_entities} rather than this
    function with empty C{tags}.

    @param text: markup text to convert to plain
    @type text: string
    @param tags: known tags
    @type tags: sequence of strings
    @param subs: replacement specification
    @type subs: dictionary of 3-tuples
    @param ents: known entities and their values
    @type ents: dictionary
    @param keepws: tags of elements in which to preserve whitespace
    @type keepws: sequence of strings
    @param ignels: tags or tag/types of elements to completely remove
    @type ignels: sequence of strings and (string, string) tuples

    @returns: plain text version
    @rtype: string
    """

    # Sets give faster membership checks than arbitrary sequences.
    if not (tags is None or isinstance(tags, set)):
        tags = set(tags)
    if not isinstance(keepws, set):
        keepws = set(keepws)
    if not isinstance(ignels, set):
        ignels = set(ignels)

    # User-supplied entities are resolved first, since their values
    # may bring in more markup. The default entities are skipped here
    # and resolved only after the tags are gone, because their values
    # could otherwise produce invalid markup.
    text = _resolve_ents(text, ents, xml_entities)

    # Build the element tree, tolerating badly formed markup.
    # Each node is a 5-sequence:
    #   (tag, opening_tag_literal, closing_tag_literal, atype, content)
    # where atype is the value of the type attribute (if any)
    # and content is the list of child nodes.
    # A text node has "#text" for tag and a string for content.
    root = []
    level = root      # child list currently being filled
    ancestors = []    # child lists of the enclosing elements
    pos = 0
    while True:
        seg_start = pos
        pos = text.find("<", pos)
        if pos < 0:
            break
        level.append(("#text", None, None, None, text[seg_start:pos]))
        literal, tag, atype, is_open, is_close, pos = _parse_tag(text, pos)
        if pos < 0:
            break
        if is_open:
            # Descend into the newly opened element.
            node = [tag, literal, None, atype, []]
            level.append(node)
            ancestors.append(level)
            level = node[-1]
        if is_close:
            # A tag may be opening and closing at once (self-closing).
            if ancestors:
                level = ancestors.pop()
                if not is_open:
                    # Proper closing tag: record its literal
                    # on the element which it closes.
                    level[-1][2] = literal
            else:
                # Faulty markup, closing tag without an opening one:
                # wrap everything collected so far into a new top element.
                root = [[tag, None, literal, None, level]]
                level = root
    # Text following the last tag (or all of the text if no tags).
    level.append(("#text", None, None, None, text[seg_start:]))

    # Replace tags, then resolve the default entities.
    detagged = _resolve_tags(root, tags, subs, keepws, ignels)
    return _resolve_ents(detagged, xml_entities)
0213 
0214 
def _parse_tag (text, p):
    """
    Parse a single tag which starts at the given position in the text.

    The parsing is lenient: it never fails, and returns whatever
    could be made of the text from the C{<} character on.
    Only the value of the C{type} attribute is collected
    (stripped, with spaces removed); other attributes are skipped over.
    A slash anywhere in the tag outside of a quoted attribute value
    marks the tag as closing.

    @param text: the text containing the tag
    @type text: string
    @param p: position of the C{<} character starting the tag
    @type p: int

    @returns: tag literal (from C{<} through C{>}), tag name,
        value of the type attribute (or C{None}),
        whether the tag is opening, whether the tag is closing,
        and the position just after the tag
        (C{len(text) + 1} if the text ended before C{>} was seen)
    @rtype: (string, string, string or None, bool, bool, int)
    """
    # text[p] must be "<"

    tag = ""
    atype = None
    opening = True
    closing = False

    tlen = len(text)
    pp = p
    # Parser states; at most one of them is active at a time.
    in_str = False
    in_tag = False
    in_attr = False
    in_lead = True
    in_afterslash = False
    in_aftereq = False
    in_aftertag = False
    in_afterattr = False
    nattr = ""
    while True:
        p += 1
        if p >= tlen:
            break
        c = text[p]

        if in_lead and not c.isspace():
            # First significant character tells opening from closing tag.
            in_lead = False
            opening = c != "/"
            if opening:
                in_tag = True
                p_tag = p
            else:
                in_afterslash = True
        elif in_afterslash and not c.isspace():
            in_afterslash = False
            in_tag = True
            p_tag = p
        elif in_tag and (c.isspace() or c in "/>"):
            in_tag = False
            in_aftertag = True
            tag = text[p_tag:p]
        elif in_aftertag and not (c.isspace() or c in "/>"):
            in_aftertag = False
            in_attr = True
            p_attr = p
        elif in_attr and (c.isspace() or c in "=/>"):
            in_attr = False
            if c != "=":
                in_afterattr = True
            else:
                in_aftereq = True
            nattr = text[p_attr:p].lower()
        elif in_aftereq and c in ('"', "'"):
            in_aftereq = False
            in_str = True
            quote_char = c
            p_str = p + 1
        elif in_str and c == quote_char:
            in_str = False
            # Be ready for the next attribute. Without this no state
            # would be active any more, so that further attributes
            # would not be parsed, and slashes and greater-than signs
            # within their values would be taken as tag syntax.
            in_aftertag = True
            s = text[p_str:p].strip().replace(" ", "")
            if nattr == "type":
                atype = s
        elif in_afterattr and c == "=":
            in_afterattr = False
            in_aftereq = True

        # Slash and tag end count only outside of quoted values.
        if not in_str and c == "/":
            closing = True
        if not in_str and c == ">":
            break

    p += 1
    tag_literal = text[pp:p]

    return tag_literal, tag, atype, opening, closing, p
0292 
0293 
_entity_rx = re.compile(r"&([\w:][\w\d.:-]*);", re.U)

def _resolve_ents (text, ents={}, ignents={}, _expanding=frozenset()):
    """
    Resolve XML entities as described in L{xml_to_plain}, ignoring some.

    Entity values may contain entities themselves, so each value is
    resolved recursively before it is put into the text.
    An entity which refers back to itself, directly or through other
    entities, is not expanded again inside its own value; such reference
    is left in the text verbatim instead of recursing endlessly.

    @param text: text in which to resolve entities
    @type text: string
    @param ents: entity values by entity name
    @type ents: dictionary
    @param ignents: entities to leave untouched (only names are used)
    @type ignents: dictionary or set
    @param _expanding: names of entities currently being expanded
        (internal, used to detect reference cycles)
    @type _expanding: frozenset of strings

    @returns: text with known entities replaced by their values
    @rtype: string
    """

    def expand (match):
        name = match.group(1)
        if name in ignents or name in _expanding:
            # Ignored entity, or a cyclic reference: do not touch.
            return match.group(0)
        value = ents.get(name)
        if value is None:
            # Unknown entity, put it back as-is.
            return match.group(0)
        return _resolve_ents(value, ents, ignents, _expanding | {name})

    # An ampersand which does not start a well-formed entity reference
    # is not matched, and therefore stays in the text as it is.
    return _entity_rx.sub(expand, text)
0333 
0334 
# Ordinary whitespace just before and just after masked whitespace.
_wsgr_premask_rx = re.compile(r"\s+(\x04~\w\w)")
_wsgr_postmask_rx = re.compile(r"(\x04~\w\w)\s+")

def _resolve_tags (elseq, tags=None, subs={}, keepws=set(), ignels=set()):
    """
    Replace XML tags as described in L{xml_to_plain}, given the parsed tree.

    This is the top part of the replacement: the recursive part
    produces the text with significant whitespace masked,
    and here the whitespace is simplified and the masks removed.
    """

    # Flat text, with whitespace masked where significant.
    flat = _resolve_tags_r(elseq, tags, subs, keepws, ignels)

    # Collapse ordinary whitespace, and drop it completely
    # where it touches masked whitespace or the text boundaries.
    flat = _wsgr_rx.sub(" ", flat)
    for mask_rx in (_wsgr_premask_rx, _wsgr_postmask_rx):
        flat = mask_rx.sub(r"\1", flat)
    flat = flat.strip()

    # Bring back the significant whitespace.
    flat = _unmask_ws(flat)

    # Excess newlines are removed even if they were marked significant.
    return _nlgr_rx.sub("\n\n", flat.strip("\n"))
0362 
0363 
def _resolve_tags_r (elseq, tags=None, subs={}, keepws=set(), ignels=set()):
    """
    Recursive part of tag replacement, see L{_resolve_tags}.

    Converts the sequence of parsed elements into flat text:
    elements listed in C{ignels} (by tag or by (tag, type)) are dropped,
    text nodes are taken as they are, known tags are replaced
    according to C{subs} (by a space on each side if not specified),
    and tags not found in C{tags} are put back verbatim.
    Whitespace within elements listed in C{keepws} is masked.

    @returns: flat text for the given sequence of elements
    @rtype: string
    """

    segs = []
    for el in elseq:
        tag, open_literal, close_literal, atype, content = el

        if tag in ignels or (tag, atype) in ignels:
            # Complete element is ignored (by tag, or tag/type).
            continue

        if tag == "#text":
            segs.append(content)
            continue

        if tags is not None and tag not in tags:
            # Unknown tag, put back verbatim around converted content.
            inner = _resolve_tags_r(content, tags, subs, keepws, ignels)
            segs.append((open_literal or "") + inner + (close_literal or ""))
            continue

        repl_pre, repl_post, repl_cont = subs.get(tag, (" ", " ", None))
        if repl_pre is None:
            repl_pre = ""
        if repl_post is None:
            repl_post = ""
        repl_cont_spec = repl_cont
        if not isinstance(repl_cont, str):
            # Wrapped text is not replaced by a fixed string.
            repl_cont = _resolve_tags_r(content, tags, subs, keepws, ignels)
            if tag in keepws:
                # Mask whitespace in wrapped text.
                repl_cont = _mask_ws(repl_cont)
        if callable(repl_cont_spec):
            repl_cont = repl_cont_spec(repl_cont)

        if tag not in keepws:
            # Whitespace is not significant: move the leading whitespace
            # of the wrapped text before the opening replacement, and
            # the trailing whitespace after the closing replacement.
            # Whitespace-only text counts as leading whitespace as a whole,
            # so that none of it gets emitted twice.
            unled = repl_cont.lstrip()
            core = unled.rstrip()
            lead = repl_cont[:len(repl_cont) - len(unled)]
            trail = unled[len(core):]
            repl_pre = lead + repl_pre
            repl_post = repl_post + trail
            repl_cont = core

        segs.append(repl_pre + repl_cont + repl_post)

    return "".join(segs)
0415 
0416 
def _mask_ws (text):
    """
    Replace each whitespace character known to C{_ws_masks}
    with its mask string.
    """

    for mask in _ws_masks:
        text = text.replace(_ws_masks[mask], mask)
    return text
0422 
0423 
def _unmask_ws (text):
    """
    Replace each mask string known to C{_ws_masks}
    with the whitespace character it stands for.
    """

    for mask in _ws_masks:
        text = text.replace(mask, _ws_masks[mask])
    return text
0429 
# Tags recognized as HTML markup.
_html_tags = set("""
    a address applet area b base basefont big blockquote body br button
    caption center cite code col colgroup dd del dfn dir div dl dt
    em fieldset font form frame frameset h1 h2 h3 h4 h5 h6 head hr html
    i iframe img input ins isindex kbd label legend li link map menu meta
    noframes noscript ol option p param pre
    s samp script select small span strike strong style sub sup
    table tbody td textarea tfoot th thead title tr tt u ul var xmp
""".split())
# Replacements: by default a tag is removed without adding whitespace,
# while block-like tags produce paragraph breaks on both sides.
_html_subs = {
    "_nows" : ("", "", None),
    "_parabr": (WS_NEWLINE*2, WS_NEWLINE*2, None),
}
_html_subs.update(dict.fromkeys(_html_tags, _html_subs["_nows"]))
_html_subs.update(dict.fromkeys(
    "br dd dl dt h1 h2 h3 h4 h5 h6 hr li p pre td th tr".split(),
    _html_subs["_parabr"]))
# Entities resolved in addition to the default XML entities.
_html_ents = {
    "nbsp": "\xa0",
}
# Elements in which whitespace is preserved.
_html_keepws = {"code", "pre", "xmp"}
# Elements which are removed completely, as (tag, type).
_html_ignels = {
    ("style", "text/css"),
}
0456 
def html_to_plain (text):
    """
    Convert text with HTML markup into plain text.

    This is L{xml_to_plain} applied with the tags, replacements,
    entities, whitespace-preserving and ignored elements
    defined in this module for HTML.

    @param text: HTML text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    return xml_to_plain(text, tags=_html_tags, subs=_html_subs,
                        ents=_html_ents, keepws=_html_keepws,
                        ignels=_html_ignels)
0470 
0471 
def html_plain (*args, **kwargs):
    """
    Deprecated alias of L{html_to_plain}, which is called
    with all the arguments passed through unchanged.
    """
    plain = html_to_plain(*args, **kwargs)
    return plain
0477 
0478 
# Tags recognized as Qt rich-text markup.
_qtrich_tags = set("""
    qt html
    a b big blockquote body br center cite code dd dl dt em font
    h1 h2 h3 h4 h5 h6 head hr i img li meta nobr ol p pre
    s span strong style sub sup table td th tr tt u ul var
""".split())
# Replacements: by default a tag is removed without adding whitespace,
# while block-like tags produce paragraph breaks on both sides.
_qtrich_subs = {
    "_nows" : ("", "", None),
    "_parabr": (WS_NEWLINE*2, WS_NEWLINE*2, None),
}
_qtrich_subs.update(dict.fromkeys(_qtrich_tags, _qtrich_subs["_nows"]))
_qtrich_subs.update(dict.fromkeys(
    "br dd dl dt h1 h2 h3 h4 h5 h6 hr li p pre td th tr".split(),
    _qtrich_subs["_parabr"]))
# Entities resolved in addition to the default XML entities.
_qtrich_ents = {
    "nbsp": "\xa0",
}
# Elements in which whitespace is preserved.
_qtrich_keepws = {"code", "pre"}
# Elements which are removed completely, as (tag, type).
_qtrich_ignels = {
    ("style", "text/css"),
}
0502 
def qtrich_to_plain (text):
    """
    Convert text with Qt rich-text markup into plain text.

    This is L{xml_to_plain} applied with the tags, replacements,
    entities, whitespace-preserving and ignored elements
    defined in this module for Qt rich text.

    @param text: Qt rich text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    return xml_to_plain(text, tags=_qtrich_tags, subs=_qtrich_subs,
                        ents=_qtrich_ents, keepws=_qtrich_keepws,
                        ignels=_qtrich_ignels)
0516 
0517 
# Tags recognized as KUIT markup.
_kuit_tags = set("""
    kuit kuil title subtitle para list item note warning
    filename link application command resource icode bcode shortcut interface
    emphasis placeholder email envar message numid nl
""".split())
# Replacements: by default a tag is removed without adding whitespace;
# some tags get spaces, paragraph break after, or brackets around instead.
_kuit_subs = {
    "_nows" : ("", "", None),
    "_parabr" : ("", WS_NEWLINE*2, None),
    "_ws" : (" ", " ", None),
    "_ui" : ("[", "]", None),
}
_kuit_subs.update(dict.fromkeys(_kuit_tags, _kuit_subs["_nows"]))
_kuit_subs.update(dict.fromkeys(
    "placeholder".split(), _kuit_subs["_ws"]))
_kuit_subs.update(dict.fromkeys(
    "title subtitle para item nl".split(), _kuit_subs["_parabr"]))
_kuit_subs.update(dict.fromkeys(
    "interface".split(), _kuit_subs["_ui"]))
# Entities resolved in addition to the default XML entities (none).
_kuit_ents = {}
# Elements in which whitespace is preserved.
_kuit_keepws = {"icode", "bcode"}
# Elements which are removed completely (none).
_kuit_ignels = set()
0544 
def kuit_to_plain (text):
    """
    Convert text with KUIT markup into plain text.

    This is L{xml_to_plain} applied with the tags, replacements,
    entities, whitespace-preserving and ignored elements
    defined in this module for KUIT.

    @param text: KUIT text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    return xml_to_plain(text, tags=_kuit_tags, subs=_kuit_subs,
                        ents=_kuit_ents, keepws=_kuit_keepws,
                        ignels=_kuit_ignels)
0558 
0559 
# Qt rich-text and KUIT definitions combined; where both define
# a replacement or an entity for the same name, the KUIT one is used.
_htkt_tags = _qtrich_tags | _kuit_tags
_htkt_subs = dict(_qtrich_subs)
_htkt_subs.update(_kuit_subs)
_htkt_ents = dict(_qtrich_ents)
_htkt_ents.update(_kuit_ents)
_htkt_keepws = _qtrich_keepws | _kuit_keepws
_htkt_ignels = _qtrich_ignels | _kuit_ignels
0565 
def kde4_to_plain (text):
    """
    Convert text with KDE4 GUI markup into plain text.

    A KDE4 GUI text can contain Qt rich-text markup, KUIT markup,
    or a mixture of the two. Both are therefore converted in a single
    pass, because one conversion after the other does not work in general:
    for example, the first conversion would turn a C{&lt;} entity
    into plain C{<}, which would then confuse the second conversion.

    @param text: KDE4 text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    return xml_to_plain(text, tags=_htkt_tags, subs=_htkt_subs,
                        ents=_htkt_ents, keepws=_htkt_keepws,
                        ignels=_htkt_ignels)
0586 
0587 
# Assembled on first use.
_dbk_tags = None
_dbk_subs = None
_dbk_ents = None
_dbk_keepws = None
_dbk_ignels = None

def _prep_docbook4_to_plain ():
    """
    Assemble the module-level tables used by L{docbook4_to_plain}.

    Known tags are read from the level 1 specification file
    C{spec/docbook4.l1} under the data directory; replacements,
    entities, whitespace-preserving and ignored elements are set here.
    """

    global _dbk_tags, _dbk_subs, _dbk_ents, _dbk_keepws, _dbk_ignels

    specpath = os.path.join(datadir(), "spec", "docbook4.l1")
    docbook4_l1 = collect_xml_spec_l1(specpath)
    _dbk_tags = set(docbook4_l1.keys())

    # Boundary between two bracketed UI items, like "[File] [Open]".
    # Compiled here so that the flag is really given as a flag:
    # the fourth positional argument of re.sub() is the replacement
    # count, so passing re.U there would only limit the number
    # of replacements.
    uipath_rx = re.compile(r"\]\s*\[", re.U)

    _dbk_subs = {
        "_nows" : ("", "", None),
        "_parabr" : ("", WS_NEWLINE*2, None),
        "_ws" : (" ", " ", None),
        "_ui" : ("[", "]", None),
        "_uipath" : ("", "", lambda s: uipath_rx.sub("->", s)),
    }
    _dbk_subs.update([(x, _dbk_subs["_nows"]) for x in _dbk_tags])
    _dbk_subs.update([(x, _dbk_subs["_parabr"]) for x in
                      "para title".split()]) # FIXME: Add more.
    _dbk_subs.update([(x, _dbk_subs["_ws"]) for x in
                       "contrib address firstname placeholder surname "
                       "primary secondary "
                       "".split()])
    _dbk_subs.update([(x, _dbk_subs["_ui"]) for x in
                       "guilabel guibutton guiicon guimenu guisubmenu "
                       "guimenuitem "
                       "".split()])
    _dbk_subs.update([(x, _dbk_subs["_uipath"]) for x in
                       "menuchoice "
                       "".split()])

    _dbk_ents = { # in addition to default XML entities
    }

    _dbk_keepws = set("""
        screen programlisting
    """.split()) # FIXME: Add more.

    _dbk_ignels = set([
    ])
0634 
def docbook4_to_plain (text):
    """
    Convert text with Docbook 4.x markup into plain text.

    The Docbook conversion tables are assembled on the first call,
    and reused on all subsequent calls.

    @param text: Docbook text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    tables_missing = _dbk_tags is None
    if tables_missing:
        _prep_docbook4_to_plain()

    return xml_to_plain(text, tags=_dbk_tags, subs=_dbk_subs,
                        ents=_dbk_ents, keepws=_dbk_keepws,
                        ignels=_dbk_ignels)
0651 
0652 
def collect_xml_spec_l1 (specpath):
    """
    Collect lightweight XML format specification, level 1.

    Level 1 specification is the dictionary of all known tags,
    with allowed attributes and subtags for each.

    File of the level 1 specification is in the following format::

        # A comment.
        # Tag with unconstrained attributes and subtags:
        tagA;
        # Tag with constrained attributes and unconstrained subtags:
        tagF : attr1 attr2 ...;
        # Tag with unconstrained attributes and constrained subtags:
        tagF > stag1 stag2 ...;
        # Tag with constrained attributes and subtags:
        tagF : attr1 attr2 ... > stag1 stag2 ...;
        # Tag with no attributes and unconstrained subtags:
        tagA :;
        # Tag with unconstrained attributes and no subtags:
        tagA >;
        # Tag with no attributes and no subtags:
        tagA :>;
        # Attribute value constrained by a regular expression:
        .... attr1=/^(val1|val2|val3)$/i ...
        # Reserved dummy tag specifying attributes common to all tags:
        pe-common-attrib : attrX attrY;

    The specification can contain a dummy tag named C{pe-common-attrib},
    stating attributes which are common to all tags, instead of having to
    list them with each and every tag.
    To make an attribute mandatory, its name should be prefixed by
    exclamation sign (!).

    Specification file must be UTF-8 encoded.

    @param specpath: path to level 1 specification file
    @type specpath: string

    @return: level 1 specification
    @rtype: dict

    @raise PologyError: if the specification is malformed
    """

    ch_comm = "#"
    ch_attr = ":"
    ch_attre = "="
    ch_mattr = "!"
    ch_stag = ">"
    ch_end = ";"

    dtag_attr = "pe-common-attrib"

    valid_tag_rx = re.compile(r"^[\w-]+$")
    valid_attr_rx = re.compile(r"^[\w-]+$")

    # Parsing contexts: tag name, attribute list,
    # attribute value constraint, subtag list.
    c_tag, c_attr, c_attre, c_stag = list(range(4))

    # Read the complete specification, and close the file right away.
    with codecs.open(specpath, "r", "UTF-8") as ifl:
        ifs = ifl.read()
    lenifs = len(ifs)

    # Current position: [offset into text, line, column].
    pos = [0, 1, 1]

    def signal (msg, bpos):
        # Raise the error with given message at (line, column) position.

        emsg = _("@info \"L1-spec\" is shorthand for "
                 "\"level 1 specification\"",
                 "[L1-spec] %(file)s:%(line)d:%(col)d: %(msg)s",
                 file=specpath, line=bpos[0], col=bpos[1], msg=msg)
        raise PologyError(emsg)

    def advance (stoptest, cmnt=True):
        # Move from the current position up to the first separator,
        # as detected by stoptest (which takes the offset into text and
        # returns the separator string at that offset, or None).
        # Comments are skipped, unless cmnt is false.
        # Returns the text moved over (without comments) and the separator,
        # which is None if the end of input was reached instead.

        ind = pos[0]
        oind = ind
        substr = []
        sep = None
        while ind < lenifs and sep is None:
            if cmnt and ifs[ind] == ch_comm:
                ind = ifs.find("\n", ind)
                if ind < 0:
                    # Comment runs up to the end of input. The offset
                    # must be put at the end, as leaving it negative
                    # would restart parsing from the end of the text.
                    ind = lenifs
                    break
            else:
                sep = stoptest(ind)
                if sep is None:
                    substr.append(ifs[ind])
                    ind += 1
                else:
                    ind += len(sep)

        pos[0] = ind
        rawsubstr = ifs[oind:ind]
        p = rawsubstr.rfind("\n")
        if p >= 0:
            pos[1] += rawsubstr.count("\n")
            pos[2] = len(rawsubstr) - p
        else:
            pos[2] += len(rawsubstr)

        return "".join(substr), sep

    def make_rx_lint (rx_str, rx_flags, wch, lincol):
        # Make the value check function out of the regular expression.
        try:
            rx = re.compile(rx_str, rx_flags)
        except Exception:
            # Not a bare except, to let interrupts and exits through.
            signal(_("@info the regex is already quoted when inserted",
                     "Cannot compile regular expression %(regex)s.",
                     regex=(wch + rx_str + wch)),
                     lincol)
        return lambda x: rx.search(x) is not None

    spec = {}
    ctx = c_tag
    entry = None
    while pos[0] < lenifs:
        if ctx == c_tag:
            # Position for error messages must be set in this context
            # too, or else reporting an error would fail by itself.
            lincol = tuple(pos[1:])
            t = lambda i: (ifs[i] if ifs[i] in (ch_attr, ch_stag, ch_end)
                           else None)
            tag, sep = advance(t)
            tag = tag.strip()
            if tag:
                if sep is None:
                    signal(_("@info",
                             "Entry not terminated after the initial tag."),
                           lincol)
                if not valid_tag_rx.search(tag) and tag != dtag_attr:
                    signal(_("@info",
                             "Invalid tag name '%(tag)s'.", tag=tag),
                             lincol)
                entry = _L1Element(tag)
                spec[tag] = entry

            if sep == ch_attr:
                ctx = c_attr
            elif sep == ch_stag:
                ctx = c_stag
            elif sep == ch_end:
                ctx = c_tag
            else:
                # End of input, with nothing but whitespace
                # and comments after the last entry.
                break

        elif ctx == c_attr:
            if entry.attrs is None:
                entry.attrs = set()

            lincol = tuple(pos[1:])
            t = lambda i: (ifs[i]
                           if (   ifs[i].isspace()
                               or ifs[i] in (ch_attre, ch_stag, ch_end))
                           else None)
            attr, sep = advance(t)
            attr = attr.strip()
            if attr:
                if attr.startswith(ch_mattr):
                    attr = attr[len(ch_mattr):]
                    entry.mattrs.add(attr)
                if attr in entry.attrs:
                    signal(_("@info",
                             "Duplicate attribute '%(attr)s'.", attr=attr),
                             lincol)
                if not valid_attr_rx.search(attr):
                    signal(_("@info",
                             "Invalid attribute name '%(attr)s'.", attr=attr),
                             lincol)
                entry.attrs.add(attr)
                lastattr = attr

            # The separator is None at the end of input,
            # which must be reported rather than tested for whitespace.
            if sep is not None and sep.isspace():
                ctx = c_attr
            elif sep == ch_attre:
                ctx = c_attre
            elif sep == ch_stag:
                ctx = c_stag
            elif sep == ch_end:
                ctx = c_tag
            else:
                signal(_("@info",
                         "Entry not terminated after the attribute list."),
                       lincol)

        elif ctx == c_attre:
            lincol = tuple(pos[1:])
            # First non-whitespace character is the regex delimiter.
            t = lambda i: ifs[i] if not ifs[i].isspace() else None
            sub, wch = advance(t)
            if wch is None:
                signal(_("@info",
                         "End of input inside the value constraint."),
                       lincol)
            # Regex runs up to the next delimiter; comment character
            # has no special meaning within it.
            t = lambda i: ifs[i] if ifs[i] == wch else None
            rx_str, sep = advance(t, cmnt=False)
            if sep is None:
                signal(_("@info",
                         "End of input inside the value constraint."),
                       lincol)
            # Flags are the letters directly after the closing delimiter;
            # empty separator stops before the first non-letter,
            # without consuming it.
            t = lambda i: "" if not ifs[i].isalpha() else None
            rx_flag_spec, sep = advance(t)
            rx_flags = re.U
            seen_flags = set()
            lincol = tuple(pos[1:])
            for c in rx_flag_spec:
                if c in seen_flags:
                    signal(_("@info",
                             "Regex flag '%(flag)s' is already issued.",
                             flag=c), lincol)
                if c == "i":
                    rx_flags |= re.I
                else:
                    signal(_("@info",
                             "Unknown regex flag '%(flag)s'.", flag=c),
                             lincol)
                seen_flags.add(c)
            entry.avlints[lastattr] = make_rx_lint(rx_str, rx_flags,
                                                   wch, lincol)
            ctx = c_attr

        elif ctx == c_stag:
            if entry.stags is None:
                entry.stags = set()

            lincol = tuple(pos[1:])
            t = lambda i: (ifs[i]
                           if (ifs[i].isspace() or ifs[i] == ch_end)
                           else None)
            stag, sep = advance(t)
            stag = stag.strip()
            if stag:
                if stag in entry.stags:
                    signal(_("@info",
                             "Repeated subtag '%(tag)s'.", tag=stag),
                             lincol)
                entry.stags.add(stag)

            if sep is not None and sep.isspace():
                # Whitespace only separates subtags, the list goes on.
                ctx = c_stag
            elif sep == ch_end:
                ctx = c_tag
            else:
                signal(_("@info",
                         "Entry not terminated after the subtag list."),
                       lincol)

    # Add common attributes to each tag.
    dentry_attr = spec.pop(dtag_attr, [])
    if dentry_attr:
        # Attributes are None if the dummy tag was given without
        # an attribute list, in which case there is nothing to add.
        for attr in dentry_attr.attrs or ():
            attre = dentry_attr.avlints.get(attr)
            for entry in list(spec.values()):
                if entry.attrs is None:
                    entry.attrs = set()
                if attr not in entry.attrs:
                    entry.attrs.add(attr)
                    if attre:
                        entry.avlints[attr] = attre

    return spec
0904 
0905 
class _L1Element:
    """
    Level1 specification entry for a single markup element.

    Holds the tag of the element, the attributes it may or must carry,
    optional validators of attribute values, and the tags of elements
    that may appear inside it.
    """

    def __init__ (self, tag=None, attrs=None, mattrs=None, avlints=None,
                  stags=None):

        # Tag name (string).
        self.tag = tag
        # Allowed attributes (set); None stands for "any attribute".
        self.attrs = attrs
        # Allowed subelement tags (set); None stands for "any subelement".
        self.stags = stags
        # Attributes which must be present (set); a fresh empty set
        # is created when nothing (or an empty value) is given.
        self.mattrs = mattrs if mattrs else set()
        # Validators of attribute values, keyed by attribute name (dict);
        # an attribute need not have a validator.
        self.avlints = avlints if avlints else {}
0922 
0923 
# Simplified matching of XML entity name (sans ampersand and semicolon):
# either a run of name characters, or a decimal numeric reference.
_simple_ent_rx = re.compile(r"^([\w.:-]+|#[0-9]+)$", re.U)

# Line/column segment in an error report (": line N, column M"),
# used to cut that segment out of the message.
_lin_col_rx = re.compile(r":\s*line\s*\d+,\s*column\s*\d+", re.I)

# Dummy top tag for topless texts.
_dummy_top = "_"
0932 
0933 
# Global data for XML checking.
class _Global:
    """Empty attribute container; fields are assigned by its users."""

# State shared between validate_xml_l1() and the parser handlers.
_g_xml_l1 = _Global()
0937 
def validate_xml_l1 (text, spec=None, xmlfmt=None, ents=None,
                     casesens=True, accelamp=False):
    """
    Validate XML markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    Text is not required to have a top tag; it is always wrapped into
    a dummy top element before parsing, so that topless text passes.

    If C{spec} is C{None}, text is only checked to be well-formed.

    If C{ents} are C{None}, entities in the text are ignored by the check;
    otherwise, an entity not belonging to the known set is considered erroneous.
    Default XML entities (C{&lt;}, C{&gt;}, C{&amp;}, C{&quot;}, C{&apos;})
    are automatically added to the set of known entities.

    Tag and attribute names are matched case-sensitively by default;
    they can be made case-insensitive by setting C{casesens} to C{False}.

    If text is a part of user interface, and the environment may use
    the literal ampersand as accelerator marker, it can be allowed to pass
    the check by setting C{accelamp} to C{True}.

    Text can be one or more entity definitions of the form C{<!ENTITY ...>},
    when special check is applied.

    The result of the check is list of erroneous spans in the text,
    each given by start and end index (in Python standard semantics),
    and the error description, packed in a tuple.
    If there are no errors, empty list is returned.
    Reported spans need not be formally complete with respect to the error
    location, but are heuristically determined to be short and
    provide good visual indication of what triggers the error.

    State is passed to the parser handlers through the module-level
    C{_g_xml_l1} object, which is reinitialized on every call.

    @param text: text to check
    @type text: string
    @param spec: markup definition
    @type spec: L{level1<collect_xml_spec_l1>} specification
    @param xmlfmt: name of the particular XML format (for error messages);
        C{"XML"} is used if not given
    @type xmlfmt: string
    @param ents: set of known entities
    @type ents: sequence
    @param casesens: whether tag and attribute names are case-sensitive
    @type casesens: bool
    @param accelamp: whether to allow ampersand as accelerator marker
    @type accelamp: bool

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    # Settle the format name first, so that the default is also used
    # in messages produced by the entity definition check.
    xmlfmt = xmlfmt or "XML"

    if text.lstrip().startswith("<!ENTITY"):
        return _validate_xml_entdef(text, xmlfmt)

    # If ampersand accelerator marker is allowed, replace one in non-entity
    # position with &amp;, to let the parser proceed.
    text_orig = text
    if accelamp:
        text = _escape_amp_accel(text)

    # Make sure the text has a top tag.
    text = "<%s>%s</%s>" % (_dummy_top, text, _dummy_top)

    # Link state for handlers.
    xenc = "UTF-8"
    g = _g_xml_l1
    g.text = text
    g.spec = spec
    g.xmlfmt = xmlfmt
    g.ents = ents
    g.casesens = casesens
    g.xenc = xenc
    g.errcnt = 0
    g.spans = []
    g.tagstack = []

    # Pop closed elements off the element stack, so that the parentage
    # check in the start-element handler sees the real parent instead of
    # the most recently opened element. Only elements which the start
    # handler pushed (those known to the specification) are on the stack,
    # hence the comparison with the stack top before popping.
    def handler_end_element (tag):
        if not g.casesens:
            tag = tag.lower()
        if g.tagstack and g.tagstack[-1] == tag:
            g.tagstack.pop()

    # Prepare parser.
    parser = xml.parsers.expat.ParserCreate(xenc)
    parser.UseForeignDTD() # not to barf on non-default XML entities
    parser.StartElementHandler = _handler_start_element
    parser.EndElementHandler = handler_end_element
    parser.DefaultHandler = _handler_default
    g.parser = parser

    # Parse and check.
    try:
        parser.Parse(text.encode(xenc), True)
    except xml.parsers.expat.ExpatError as e:
        errmsg = _("@info a problem in the given type of markup "
                   "(e.g. HTML, Docbook)",
                   "%(mtype)s markup: %(snippet)s.",
                   mtype=g.xmlfmt, snippet=e.args[0])
        span = _make_span(text, e.lineno, e.offset, errmsg)
        g.spans.append(span)

    # Adapt spans back to original text.
    pure_spans = [x[:2] for x in g.spans]
    pure_spans = adapt_spans(text_orig, text, pure_spans, merge=False)
    # Remove unhelpful line/column in error messages.
    errmsgs = []
    for errmsg in [x[2] for x in g.spans]:
        m = _lin_col_rx.search(errmsg)
        if m:
            errmsg = errmsg[:m.start()] + errmsg[m.end():]
        errmsgs.append(errmsg)
    # Put spans back together.
    g.spans = [x + (y,) for x, y in zip(pure_spans, errmsgs)]

    return g.spans
1046 
1047 
# Fence string; ampersands found after its first occurrence
# are treated as being in the scripted part of the text.
_ts_fence = "|/|"

def _escape_amp_accel (text):
    """
    Escape accelerator-marker ampersands as C{&amp;}.

    An ampersand is taken as accelerator marker if what follows it up to
    the next semicolon does not look like an entity name (or there is
    no semicolon at all), and the next character is alphanumeric or
    another ampersand. Two ampersands in a row, the second of which does
    not start an entity, are a self-escaped marker and are both escaped.

    Outside of the scripted part only the first single marker is escaped;
    any further single marker is left as is (and so remains visible
    to the parser). In the scripted part every marker is escaped.

    @param text: text to escape
    @type text: string

    @returns: text with accelerator markers escaped
    @rtype: string
    """

    p_ts = text.find(_ts_fence)
    in_script = False

    p1 = 0
    found_accel = False
    while True:

        # Bracket possible entity reference.
        p1 = text.find("&", p1)
        if p1 < 0:
            break
        if not in_script and p_ts >= 0 and p1 > p_ts:
            # Entering scripted part, start counting markers anew.
            in_script = True
            found_accel = False
        p2 = text.find(";", p1)

        # An accelerator marker if no semicolon in rest of the text
        # or the bracketed segment does not look like an entity,
        # and it is in front of an alphanumeric or itself.
        nc = text[p1 + 1:p1 + 2]
        if (    (p2 < 0 or not _simple_ent_rx.match(text[p1 + 1:p2]))
            and (nc.isalnum() or nc == "&")
        ):
            # Check if the next one is an ampersand too,
            # i.e. if it's a self-escaped accelerator marker.
            # Without a semicolon ahead the second ampersand cannot start
            # an entity; slicing up to p2 == -1 must be avoided, as it
            # would silently drop the last character of the text.
            namp = 1
            if (    nc == "&"
                and (   p2 < 0
                     or not _simple_ent_rx.match(text[p1 + 2:p2]))
            ):
                namp += 1

            # Escape the marker if first or self-escaped,
            # or currently in scripted part (in which there can be
            # any number of non-escaped markers).
            if not found_accel or namp > 1 or in_script:
                escseg = "&amp;" * namp
                text = text[:p1] + escseg + text[p1 + namp:]
                p1 += len(escseg)
                if namp == 1:
                    found_accel = True
            else:
                p1 += namp

        elif p2 > p1:
            # Looks like an entity (or not a marker), continue behind it.
            p1 = p2
        else:
            break

    return text
1101 
1102 
def _handler_start_element (tag, attrs):
    """
    Parser start-element handler: check the element against
    the specification stored in the shared state.

    Nothing is checked if there is no specification. Otherwise problems
    with the tag itself, its attributes and their values, missing
    mandatory attributes, and placement under the element on top of
    the element stack are added as spans to the shared state.

    @param tag: tag of the started element
    @type tag: string
    @param attrs: attributes of the element (name, value)
    @type attrs: dict
    """

    g = _g_xml_l1
    spec = g.spec

    if spec is None:
        return

    # Normalize names to lower case if allowed.
    if not g.casesens:
        tag = tag.lower()
        attrs = {name.lower(): value for name, value in attrs.items()}

    # The dummy top element is never checked.
    if tag == _dummy_top:
        return

    # All spans from this element point to the current parser position.
    def span_for (errmsg):
        return _make_span(g.text, g.parser.CurrentLineNumber,
                          g.parser.CurrentColumnNumber + 1, errmsg)

    # Check existence of the tag.
    if tag not in spec:
        g.spans.append(span_for(_("@info",
                                  "%(mtype)s markup: unrecognized tag '%(tag)s'.",
                                  mtype=g.xmlfmt, tag=tag)))
        return

    elspec = spec[tag]
    problems = []

    # Check applicability of attributes and validity of their values.
    allowed = elspec.attrs
    if allowed is not None:
        for attr, aval in attrs.items():
            if attr in allowed:
                avlint = elspec.avlints.get(attr)
                if avlint and not avlint(aval):
                    problems.append(_("@info",
                                      "%(mtype)s markup: invalid value "
                                      "'%(val)s' to attribute '%(attr)s'.",
                                      mtype=g.xmlfmt, val=aval, attr=attr))
            else:
                problems.append(_("@info",
                                  "%(mtype)s markup: invalid attribute "
                                  "'%(attr)s' to tag '%(tag)s'.",
                                  mtype=g.xmlfmt, attr=attr, tag=tag))

    # Check presence of mandatory attributes.
    if elspec.mattrs is not None:
        for attr in elspec.mattrs:
            if attr in attrs:
                continue
            problems.append(_("@info",
                              "%(mtype)s markup: missing mandatory attribute "
                              "'%(attr)s' to tag '%(tag)s'.",
                              mtype=g.xmlfmt, attr=attr, tag=tag))

    # Check proper parentage, against the element on top of the stack.
    if g.tagstack:
        ptag = g.tagstack[-1]
        pelspec = spec.get(ptag)
        restricted = pelspec is not None and pelspec.stags is not None
        if restricted and tag not in pelspec.stags:
            problems.append(_("@info",
                              "%(mtype)s markup: tag '%(tag1)s' cannot be "
                              "a subtag of '%(tag2)s'.",
                              mtype=g.xmlfmt, tag1=tag, tag2=ptag))

    # Record element stack.
    # NOTE(review): this handler only pushes onto the stack; verify that
    # closed elements get removed from it where the parser is set up.
    g.tagstack.append(tag)

    g.spans.extend([span_for(errmsg) for errmsg in problems])
1175 
1176 
def _handler_default (text):
    """
    Parser default handler: check entity references in the text.

    Only text chunks of the form C{&...;} are looked at, and only if
    the set of known entities in the shared state is not C{None}.
    Invalid numeric entities and unknown named entities are added
    as spans to the shared state.

    @param text: chunk of text as passed by the parser
    @type text: string
    """

    g = _g_xml_l1

    if g.ents is None:
        return
    if not (text.startswith('&') and text.endswith(';')):
        return

    ent = text[1:-1]
    errmsg = None

    if ent.startswith("#"):
        # Numeric entity, must resolve to a character.
        if nument_to_char(ent) is None:
            errmsg = _("@info",
                       "%(mtype)s markup: invalid numeric "
                       "entity '%(ent)s'.",
                       mtype=g.xmlfmt, ent=ent)

    elif ent not in g.ents and ent not in xml_entities:
        # Collecting near matches is switched off, so the list of
        # suggestions is always empty at the moment.
        nearents = [] #difflib.get_close_matches(ent, g.ents)
        if not nearents:
            errmsg = _("@info",
                       "%(mtype)s markup: unknown entity '%(ent)s'.",
                       mtype=g.xmlfmt, ent=ent)
        else:
            if len(nearents) > 5: # do not overwhelm message
                fmtents = format_item_list(nearents[:5], incmp=True)
            else:
                fmtents = format_item_list(nearents)
            errmsg = _("@info",
                       "%(mtype)s markup: unknown entity '%(ent)s' "
                       "(suggestions: %(entlist)s).",
                       mtype=g.xmlfmt, ent=ent, entlist=fmtents)

    if errmsg is None:
        return

    g.spans.append(_make_span(g.text, g.parser.CurrentLineNumber,
                              g.parser.CurrentColumnNumber + 1, errmsg))
1210 
1211 
# Text to fetch from the reported error position in XML stream.
_near_xml_error_rx = re.compile(r"\W*[\w:.-]*[^\w\s>]*(\s*>)?", re.U)

def _make_span (text, lno, col, errmsg):
    """
    Construct an error span around the reported position in the text.

    The span starts at the given line and column and extends over
    a short stretch of nearby text (as matched by the module-level
    pattern); if it starts inside a word, the start is moved back
    towards the beginning of that word. If the line cannot be found
    in the text, the span covers the whole text.

    @param text: text in which the problem was found
    @type text: string
    @param lno: line of the problem (1-based)
    @type lno: int
    @param col: column of the problem (1-based)
    @type col: int
    @param errmsg: description of the problem
    @type errmsg: string

    @returns: start index, end index, and the description
    @rtype: (int, int, string)
    """

    # Find problematic position.
    clno = 1
    p = 0
    while clno < lno:
        p = text.find("\n", p)
        if p < 0:
            break
        p += 1
        clno += 1
    if p < 0:
        # Always include the message, callers expect three-element spans.
        return (0, len(text), errmsg)

    # Scoop some reasonable nearby text.
    m = _near_xml_error_rx.match(text, p + col - 1)
    if not m:
        return (0, len(text), errmsg)
    start, end = m.span()
    # Back up over the alphanumerics in front; the upper bound guards
    # against a match starting at the very end of the text.
    while 0 < start < len(text) and text[start].isalnum():
        start -= 1

    return (start, end, errmsg)
1240 
1241 
_entname_rx = re.compile(r"^([\w:][\w\d.:-]*)$", re.U)

def _validate_xml_entdef (text, xmlfmt):
    """
    Check text made of one or more C{<!ENTITY name "value">} definitions.

    Definitions are scanned in order, each through the sequence of
    opening bracket, C{!ENTITY} keyword, name, quoted value, and
    closing bracket. Scanning stops at the first problem found.
    The entity value itself is not validated.

    @param text: text to check
    @type text: string
    @param xmlfmt: name of the XML format (for error messages)
    @type xmlfmt: string

    @returns: empty list if no problem was found, otherwise
        single-element list with the span of the first problem
    @rtype: list of (int, int, string) tuples
    """

    state = "void"
    pos = 0
    pos1 = None
    tlen = len(text)
    errmsg = None
    dhead = "!ENTITY"

    # Position of the next non-whitespace character at or after pos.
    def next_nws (pos):
        while pos < tlen and text[pos].isspace():
            pos += 1
        return pos

    # Position of the next whitespace character at or after pos,
    # or of a character from ows if that comes first.
    def next_ws (pos, ows=()):
        while pos < tlen and not text[pos].isspace() and text[pos] not in ows:
            pos += 1
        return pos

    # Message and span end for text ending in the middle of a definition.
    def errend ():
        return (_("@info",
                  "%(mtype)s markup: premature end of entity definition.",
                  mtype=xmlfmt),
                tlen)

    while True:
        if state == "void":
            # Between definitions; end of text is fine here.
            pos = next_nws(pos)
            if pos == tlen:
                break
            elif text[pos] != "<":
                errmsg = _("@info",
                           "%(mtype)s markup: expected opening angle bracket "
                           "in entity definition.",
                           mtype=xmlfmt)
                pos1 = pos + 1
            else:
                pos += 1
                state = "head"

        elif state == "head":
            pos = next_nws(pos)
            if pos == tlen:
                errmsg, pos1 = errend()
            else:
                pos1 = next_ws(pos)
                head = text[pos:pos1]
                if head != dhead:
                    errmsg = _("@info",
                               "%(mtype)s markup: expected '%(keyword)s' "
                               "in entity definition.",
                               mtype=xmlfmt, keyword=dhead)
                else:
                    pos = pos1
                    state = "name"

        elif state == "name":
            pos = next_nws(pos)
            if pos == tlen:
                # Report as premature end, like in the other states,
                # rather than as an invalid empty name.
                errmsg, pos1 = errend()
            else:
                pos1 = next_ws(pos, ("'", "\""))
                name = text[pos:pos1]
                if not _entname_rx.match(name):
                    errmsg = _("@info",
                               "%(mtype)s markup: invalid entity name '%(name)s' "
                               "in entity definition.",
                               mtype=xmlfmt, name=name)
                else:
                    pos = pos1
                    state = "value"

        elif state == "value":
            pos = next_nws(pos)
            if pos == tlen:
                errmsg, pos1 = errend()
            elif text[pos] not in ("'", "\""):
                errmsg = _("@info",
                           "%(mtype)s markup: expected opening quote "
                           "(ASCII single or double) in entity definition.",
                           mtype=xmlfmt)
                pos1 = pos + 1
            else:
                quote = text[pos]
                pos1 = text.find(quote, pos + 1)
                if pos1 < 0:
                    errmsg = _("@info",
                               "%(mtype)s markup: unclosed entity value "
                               "in entity definition.",
                               mtype=xmlfmt)
                    pos1 = tlen
                else:
                    # FIXME: Validate value? Does not have to be valid
                    # on its own, in principle.
                    pos = pos1 + 1
                    state = "tail"

        elif state == "tail":
            pos = next_nws(pos)
            if pos == tlen:
                errmsg, pos1 = errend()
            elif text[pos] != ">":
                errmsg = _("@info",
                           "%(mtype)s markup: expected closing angle bracket "
                           "in entity definition.",
                           mtype=xmlfmt)
                pos1 = pos + 1
            else:
                pos += 1
                state = "void"

        if errmsg:
            break

    if not errmsg:
        return []

    if pos1 is None:
        pos1 = pos
    return [(pos, pos1, errmsg)]
1357 
1358 
def check_xml (strict=False, entities={}, mkeyw=None):
    """
    Check general XML markup in translation [hook factory].

    The translation is checked to be well-formed XML and, unless
    C{entities} is C{None}, to use only known entities.
    Each problem found is reported on the message, and the hook
    returns the number of problems.

    By default, C{msgstr} is checked only if the C{msgid}
    (and C{msgid_plural}) pass the same check themselves;
    with C{strict} set to C{True} it is checked regardless
    of the validity of the original.

    Entities in addition to XML's default (C{&lt;}, etc.)
    may be provided using the C{entities} parameter.
    Several types of values with different semantic are possible:
      - if C{entities} is C{None}, unknown entities are ignored on checking
      - if string, it is understood as a general function evaluation
        L{request<getfunc.get_result_ireq>},
        and its result expected to be (name, value) dictionary-like object
      - otherwise, C{entities} is considered to be a (name, value) dictionary

    If a message has L{sieve flag<pology.sieve.parse_sieve_flags>}
    C{no-check-markup}, the check is skipped for that message.
    If one or several markup keywords are given as C{mkeyw} parameter,
    check is skipped for all messages in a catalog which does not report
    one of the given keywords by its L{markup()<catalog.Catalog.markup>}
    method. See L{set_markup()<catalog.Catalog.set_markup>} for list of
    markup keywords recognized at the moment.

    @param strict: whether to require valid C{msgstr} even if C{msgid} is not
    @type strict: bool
    @param entities: additional entities to consider as known
    @type entities: C{None}, dict, or string
    @param mkeyw: markup keywords for taking catalogs into account
    @type mkeyw: string or list of strings

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    hook = _check_xml_w(validate_xml_l1, strict, entities, mkeyw,
                        spanrep=False)
    return hook
1399 
1400 
def check_xml_sp (strict=False, entities={}, mkeyw=None):
    """
    Span-returning variant of L{check_xml} [hook factory].

    Takes the same parameters, but the hook returns the erroneous spans
    rather than reporting problems and returning their number.

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    hook = _check_xml_w(validate_xml_l1, strict, entities, mkeyw,
                        spanrep=True)
    return hook
1411 
1412 
# Worker for C{check_xml*} hook factories.
def _check_xml_w (check, strict, entities, mkeyw, spanrep,
                  ignctxt=(), ignid=(), ignctxtsw=(), ignidsw=()):
    """
    Build a markup checking hook around the given validator.

    @param check: validator, called as C{check(text, ents=entities)}
        and returning list of erroneous spans
    @param strict: whether to check C{msgstr} even if the original
        does not pass the check
    @param entities: entity specification, resolved on first real check
    @param mkeyw: markup keyword or keywords which the catalog must
        report for the check to be performed, or C{None} for any catalog
    @param spanrep: whether the hook returns spans (otherwise it reports
        problems on the message and returns their number)
    @param ignctxt: contexts of messages to skip
    @param ignid: msgids of messages to skip
    @param ignctxtsw: context prefixes of messages to skip
    @param ignidsw: msgid prefixes of messages to skip

    @return: hook C{(msgstr, msg, cat) -> spans or numerr}
    """

    if mkeyw is not None:
        mkeyw = set([mkeyw] if isinstance(mkeyw, str) else mkeyw)

    # Entities are resolved lazily, when first needed by a check.
    resolved = {}

    def checkf (msgstr, msg, cat):

        # What to return when there is nothing to report.
        nothing = [] if spanrep else 0

        # Skip catalogs which do not report any of the requested markups.
        if mkeyw is not None:
            if not mkeyw.intersection(cat.markup() or set()):
                return nothing

        # Skip messages selected to be ignored.
        ctxt = msg.msgctxt
        if (   ctxt in ignctxt
            or msg.msgid in ignid
            or (ctxt is not None and ctxt.startswith(ignctxtsw))
            or msg.msgid.startswith(ignidsw)
        ):
            return nothing

        if not resolved:
            resolved["entities"] = _get_entities(entities)
        known = resolved["entities"]

        # Skip messages manually flagged not to be checked.
        if flag_no_check_markup in manc_parse_flag_list(msg, "|"):
            return nothing

        # Unless strict, skip messages with invalid original.
        if not strict:
            if (   check(msg.msgid, ents=known)
                or check(msg.msgid_plural or "", ents=known)
            ):
                return nothing

        spans = check(msgstr, ents=known)
        if spanrep:
            return spans

        for span in spans:
            if span[2:]:
                report_on_msg(span[2], msg, cat)
        return len(spans)

    return checkf
1461 
1462 
# Cache for loaded entities, by entity specification string,
# to speed up when several markup hooks are using the same setup.
_loaded_entities_cache = {}

def _get_entities (entspec):
    """
    Resolve entity specification into the entities themselves.

    A string is evaluated as function request through C{get_result_ireq}
    and the result is cached under that string; anything else
    is returned as it is.

    @param entspec: entity specification
    @type entspec: C{None}, dict, or string

    @returns: entities, or C{entspec} itself if it is not a string
    """

    if isinstance(entspec, str):
        loaded = _loaded_entities_cache.get(entspec)
        if loaded is None:
            loaded = get_result_ireq(entspec)
            _loaded_entities_cache[entspec] = loaded
        return loaded

    return entspec
1480 
1481 
# Level1 specification of Docbook 4.x, loaded on first use.
_docbook4_l1 = None

def validate_docbook4_l1 (text, ents=None):
    """
    Validate Docbook 4.x markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    The specification is read from C{spec/docbook4.l1} in the data
    directory on the first call and reused afterwards.
    Markup definition is extended to include C{<placeholder-N/>} elements,
    which C{xml2po} uses to segment text when extracting markup documents
    into PO templates.

    See L{validate_xml_l1} for description of the C{ents} parameter
    and the return value.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _docbook4_l1
    if _docbook4_l1 is None:
        _docbook4_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "docbook4.l1"))

    return validate_xml_l1(text, spec=_docbook4_l1,
                           xmlfmt=_("@item markup type", "Docbook4"),
                           ents=ents)
1512 
1513 
# Messages in Docbook 4.x catalogs which are skipped by markup checks,
# selected by exact context, exact msgid, and msgid prefix.
_db4_meta_msgctxt = set()
_db4_meta_msgid = {"translator-credits"}
_db4_meta_msgid_sw = ("@@image:",)

def check_docbook4 (strict=False, entities={}, mkeyw=None):
    """
    Check XML markup in translations of Docbook 4.x catalogs [hook factory].

    Messages listed in the module-level C{_db4_meta_*} collections
    are not checked.

    See L{check_xml} for description of parameters.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    hook = _check_xml_w(validate_docbook4_l1, strict, entities, mkeyw,
                        spanrep=False,
                        ignctxt=_db4_meta_msgctxt,
                        ignid=_db4_meta_msgid,
                        ignidsw=_db4_meta_msgid_sw)
    return hook
1536 
1537 
def check_docbook4_sp (strict=False, entities={}, mkeyw=None):
    """
    Span-returning variant of L{check_docbook4} [hook factory].

    Takes the same parameters, but the hook returns the erroneous spans
    rather than reporting problems and returning their number.

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    hook = _check_xml_w(validate_docbook4_l1, strict, entities, mkeyw,
                        spanrep=True,
                        ignctxt=_db4_meta_msgctxt,
                        ignid=_db4_meta_msgid,
                        ignidsw=_db4_meta_msgid_sw)
    return hook
1550 
1551 
def check_docbook4_msg (strict=False, entities={}, mkeyw=None):
    """
    Check for any known problem in translation in messages
    in Docbook 4.x catalogs [hook factory].

    Currently performed checks:
      - Docbook markup
      - cross-message insertion placeholders

    Each C{msgstr} of the message is checked on its own; those with
    problems are listed in the result as C{("msgstr", index, spans)}.

    See L{check_xml} for description of parameters.

    @return: type V4A hook
    @rtype: C{(msg, cat) -> parts}
    """

    markup_spans = check_docbook4_sp(strict, entities, mkeyw)

    def checkf (msg, cat):

        parts = []
        for i in range(len(msg.msgstr)):
            msgstr = msg.msgstr[i]
            spans = list(markup_spans(msgstr, msg, cat))
            spans += check_placeholder_els(msg.msgid, msgstr)
            if not spans:
                continue
            parts.append(("msgstr", i, spans))
        return parts

    return checkf
1581 
1582 
# HTML entities, read from the data directory when the module is loaded.
_entpath_html = os.path.join(datadir(), "spec", "html.entities")
html_entities = read_entities(_entpath_html)

# Level1 specification of HTML, loaded on first use.
_html_l1 = None

def validate_html_l1 (text, ents=None):
    """
    Validate HTML markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    At the moment, this function can only check HTML markup if well-formed
    in the XML sense, although HTML allows omission of some closing tags.
    Names are matched case-insensitively, and ampersand is allowed
    as accelerator marker.

    If C{ents} is not C{None}, it is combined with L{html_entities}
    to form the set of known entities.
    See L{validate_xml_l1} for description of the C{ents} parameter
    and the return value.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _html_l1
    if _html_l1 is None:
        _html_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "html.l1"))

    known = ents
    if known is not None:
        known = Multidict([known, html_entities])

    return validate_xml_l1(text, spec=_html_l1,
                           xmlfmt=_("@item markup type", "HTML"),
                           ents=known, accelamp=True, casesens=False)
1619 
1620 
def check_html (strict=False, entities={}, mkeyw=None):
    """
    Check HTML markup in translations [hook factory].

    Parameters are as described for L{check_xml}.
    Limitations of the HTML check are noted in L{validate_html_l1}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    hook = _check_xml_w(validate_html_l1, strict, entities, mkeyw,
                        spanrep=False)
    return hook
1633 
1634 
def check_html_sp (strict=False, entities={}, mkeyw=None):
    """
    Span-returning variant of L{check_html} [hook factory].

    Takes the same parameters, but the hook returns the erroneous spans
    rather than reporting problems and returning their number.

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    hook = _check_xml_w(validate_html_l1, strict, entities, mkeyw,
                        spanrep=True)
    return hook
1645 
1646 
# Level1 specification of Qt rich-text, loaded on first use.
_qtrich_l1 = None

def validate_qtrich_l1 (text, ents=None):
    """
    Validate Qt rich-text markup in text against L{level1<collect_xml_spec_l1>}
    specification.

    At the moment, this function can only check Qt rich-text if well-formed
    in the XML sense, although Qt rich-text allows HTML-type omission of
    closing tags.
    Names are matched case-insensitively, and ampersand is allowed
    as accelerator marker.

    If C{ents} is not C{None}, it is combined with L{html_entities}
    to form the set of known entities.
    See L{validate_xml_l1} for description of the C{ents} parameter
    and the return value.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _qtrich_l1
    if _qtrich_l1 is None:
        _qtrich_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "qtrich.l1"))

    known = ents
    if known is not None:
        known = Multidict([known, html_entities])

    return validate_xml_l1(text, spec=_qtrich_l1,
                           xmlfmt=_("@item markup type", "Qt-rich"),
                           ents=known, accelamp=True, casesens=False)
1681 
1682 
def check_qtrich (strict=False, entities={}, mkeyw=None):
    """
    Check Qt rich-text markup in translations [hook factory].

    Parameters are as described for L{check_xml}.
    Limitations of the Qt rich-text check are noted
    in L{validate_qtrich_l1}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    hook = _check_xml_w(validate_qtrich_l1, strict, entities, mkeyw,
                        spanrep=False)
    return hook
1695 
1696 
def check_qtrich_sp (strict=False, entities={}, mkeyw=None):
    """
    Span-returning variant of L{check_qtrich} [hook factory].

    Takes the same parameters, but the hook returns the erroneous spans
    rather than reporting problems and returning their number.

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    hook = _check_xml_w(validate_qtrich_l1, strict, entities, mkeyw,
                        spanrep=True)
    return hook
1707 
1708 
# KUIT entity definitions, read at import time from the "spec"
# subdirectory of the data directory.
_entpath_kuit = os.path.join(datadir(), "spec", "kuit.entities")
kuit_entities = read_entities(_entpath_kuit)

# Cache for the KUIT level1 specification, filled on first validation.
_kuit_l1 = None

def validate_kuit_l1 (text, ents=None):
    """
    Check KUIT markup in text against the L{level1<collect_xml_spec_l1>}
    specification.

    KUIT is the semantic markup for user interface in KDE4.

    The C{ents} parameter and the return value are as described
    in L{validate_xml_l1}.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _kuit_l1

    # The specification is read from disk on first use only.
    if _kuit_l1 is None:
        _kuit_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "kuit.l1"))

    # Entities given by the caller are extended with KUIT entities;
    # C{None} is passed on to the validator unchanged.
    allents = None if ents is None else Multidict([ents, kuit_entities])

    return validate_xml_l1(text, spec=_kuit_l1,
                           xmlfmt=_("@item markup type", "KUIT"),
                           ents=allents, accelamp=True)
1744 
1745 
# Caches for the combined Qt rich-text/KUIT specification and entities,
# both filled on first validation.
_kde4_l1 = None
_kde4_ents = None

def validate_kde4_l1 (text, ents=None):
    """
    Check markup in texts used in KDE4 GUI.

    Such texts may contain Qt rich-text and KUIT markup, possibly mixed
    within a single text, so the union of the two specifications
    and of their entities is used for validation.

    The C{ents} parameter and the return value are as described
    in L{validate_xml_l1}.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _kde4_l1, _kde4_ents

    if _kde4_l1 is None:
        # Merge specifications; KUIT is added after Qt rich-text,
        # so it takes precedence on any common keys.
        _kde4_l1 = {}
        for specname in ("qtrich.l1", "kuit.l1"):
            specpath = os.path.join(datadir(), "spec", specname)
            _kde4_l1.update(collect_xml_spec_l1(specpath))
        # Merge entities in the same order.
        _kde4_ents = {}
        for known in (html_entities, kuit_entities):
            _kde4_ents.update(known)

    # Entities given by the caller are extended with the merged set;
    # C{None} is passed on to the validator unchanged.
    allents = None if ents is None else Multidict([ents, _kde4_ents])

    return validate_xml_l1(text, spec=_kde4_l1,
                           xmlfmt=_("@item markup type", "KDE4"),
                           ents=allents, accelamp=True, casesens=False)
1785 
1786 
def check_kde4 (strict=False, entities={}, mkeyw=None):
    """
    Create a hook which checks XML markup in translations
    of KDE4 UI catalogs [hook factory].

    Parameters are as described in L{check_xml}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    # All arguments are forwarded as received; this variant of the hook
    # reports problems rather than returning spans.
    return_spans = False
    return _check_xml_w(validate_kde4_l1, strict, entities, mkeyw,
                        return_spans)
1798 
1799 
def check_kde4_sp (strict=False, entities={}, mkeyw=None):
    """
    Same as L{check_kde4}, but the resulting hook returns erroneous
    spans rather than reporting problems to stdout [hook factory].

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    # All arguments are forwarded as received; this variant of the hook
    # returns the spans.
    return_spans = True
    return _check_xml_w(validate_kde4_l1, strict, entities, mkeyw,
                        return_spans)
1810 
1811 
# Cache for the Pango level1 specification, filled on first validation.
_pango_l1 = None

def validate_pango_l1 (text, ents=None):
    """
    Check Pango markup in text against the L{level1<collect_xml_spec_l1>}
    specification.

    The C{ents} parameter and the return value are as described
    in L{validate_xml_l1}.

    @param text: text to check
    @type text: string
    @param ents: set of known entities (in addition to default)
    @type ents: sequence

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    global _pango_l1

    # The specification is read from disk on first use only.
    if _pango_l1 is None:
        _pango_l1 = collect_xml_spec_l1(
            os.path.join(datadir(), "spec", "pango.l1"))

    # Entities given by the caller are extended with HTML entities;
    # C{None} is passed on to the validator unchanged.
    allents = None if ents is None else Multidict([ents, html_entities])

    return validate_xml_l1(text, spec=_pango_l1,
                           xmlfmt=_("@item markup type", "Pango"),
                           ents=allents, accelamp=True, casesens=False)
1842 
1843 
def check_pango (strict=False, entities={}, mkeyw=None):
    """
    Create a hook which checks XML markup in translations
    of Pango UI catalogs [hook factory].

    Parameters are as described in L{check_xml}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    # All arguments are forwarded as received; this variant of the hook
    # reports problems rather than returning spans.
    return_spans = False
    return _check_xml_w(validate_pango_l1, strict, entities, mkeyw,
                        return_spans)
1855 
1856 
def check_pango_sp (strict=False, entities={}, mkeyw=None):
    """
    Same as L{check_pango}, but the resulting hook returns erroneous
    spans rather than reporting problems to stdout [hook factory].

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    # All arguments are forwarded as received; this variant of the hook
    # returns the spans.
    return_spans = True
    return _check_xml_w(validate_pango_l1, strict, entities, mkeyw,
                        return_spans)
1867 
1868 
1869 
1870 
# Digits accepted in decimal and hexadecimal numeric entities.
# Explicit ASCII sets are used so that other Unicode digits are rejected.
_digits_dec = set("0123456789")
_digits_hex = set("0123456789abcdefABCDEF")

def nument_to_char (nument):
    """
    Convert numeric XML entity to character.

    Numeric XML entities can be decimal, C{&#DDDD;}, or hexadecimal,
    C{&#xHHHH;}, where C{D} and C{H} stand for number system's digits.
    This function accepts at most 4 digits, but there can be less.

    The leading C{&} and the trailing C{;} are each optional,
    and are stripped independently of each other.

    If the entity cannot be converted to a character, for whatever reason,
    C{None} is reported.

    @param nument: numeric entity, with or without C{&} and C{;}
    @type nument: string

    @return: character represented by the entity
    @rtype: string or None
    """

    # Strip each delimiter only if it is really there. Blindly cutting
    # the last character whenever the text starts with "&" would eat
    # a digit when ";" is missing, and would leave ";" in place when
    # "&" is missing.
    if nument.startswith("&"):
        nument = nument[1:]
    if nument.endswith(";"):
        nument = nument[:-1]

    if not nument.startswith("#"):
        return None

    # Only lowercase "x" introduces hexadecimal form; anything else
    # is taken as decimal (and will fail the digit check if it is not).
    if nument[1:2] == "x":
        known_digits = _digits_hex
        numstr = nument[2:]
        base = 16
    else:
        known_digits = _digits_dec
        numstr = nument[1:]
        base = 10

    if not 1 <= len(numstr) <= 4:
        return None

    # Checking against explicit digit sets also rejects signs,
    # whitespace and underscores, which int() alone would accept.
    if not set(numstr) <= known_digits:
        return None

    return chr(int(numstr, base))
1915 
1916 
def validate_xmlents (text, ents={}, default=False, numeric=False):
    """
    Find XML-like entities in the text which are not among known ones.

    The text is only searched for raw entity references; it is not parsed
    as XML, and hence does not have to be XML markup at all.

    @param text: text with entities to check
    @type text: string
    @param ents: known entities
    @type ents: sequence
    @param default: whether default XML entities are allowed (C{&amp;}, etc.)
    @type default: bool
    @param numeric: whether numeric character entities are allowed
    @type numeric: bool

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    problems = []

    start = text.find("&")
    while start >= 0:
        match = _entity_rx.match(text, start)
        if not match:
            # No entity reference at this ampersand; look further on.
            start = text.find("&", start + 1)
            continue

        end = match.end()
        name = match.group(1)
        errmsg = None
        if numeric and name.startswith("#"):
            # Numeric entities are valid if convertible to a character.
            if nument_to_char(name) is None:
                errmsg = _("@info",
                           "Invalid numeric entity '%(ent)s'.",
                           ent=name)
        elif not (name in ents or (default and name in xml_entities)):
            # Suggestions for close matches are currently disabled.
            nearents = [] #difflib.get_close_matches(ent, ents)
            if not nearents:
                errmsg = _("@info",
                           "Unknown entity '%(ent)s'.",
                           ent=name)
            else:
                if len(nearents) > 5: # do not overwhelm message
                    fmtents = format_item_list(nearents[:5], incmp=True)
                else:
                    fmtents = format_item_list(nearents)
                errmsg = _("@info",
                           "Unknown entity '%(ent)s' "
                           "(suggestions: %(entlist)s).",
                           ent=name, entlist=fmtents)

        if errmsg is not None:
            problems.append((start, end, errmsg))

        # Resume the search right after the examined entity.
        start = text.find("&", end)

    return problems
1977 
1978 
def check_xmlents (strict=False, entities={}, mkeyw=None,
                   default=False, numeric=False):
    """
    Create a hook which checks existence of XML entities
    in translations [hook factory].

    Parameters C{strict}, C{entities}, and C{mkeyw} are as described
    in L{check_xml}. Parameters C{default} and C{numeric}, as well as
    general notes on checking entities, are described
    in L{validate_xmlents}.

    @return: type S3C hook
    @rtype: C{(msgstr, msg, cat) -> numerr}
    """

    # Bind the factory's default/numeric settings into a two-argument
    # validator, which is what is handed over to the hook builder.
    def validate (text, ents):
        return validate_xmlents(text, ents, default=default, numeric=numeric)

    return_spans = False
    return _check_xml_w(validate, strict, entities, mkeyw, return_spans)
1996 
1997 
def check_xmlents_sp (strict=False, entities={}, mkeyw=None,
                      default=False, numeric=False):
    """
    Same as L{check_xmlents}, but the resulting hook returns erroneous
    spans rather than reporting problems to stdout [hook factory].

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    # Bind the factory's default/numeric settings into a two-argument
    # validator, which is what is handed over to the hook builder.
    def validate (text, ents):
        return validate_xmlents(text, ents, default=default, numeric=numeric)

    return_spans = True
    return _check_xml_w(validate, strict, entities, mkeyw, return_spans)
2012 
2013 
# Matches <placeholder-N/> elements, tolerating whitespace around
# the element name and the closing slash; group 1 is the number N.
_placeholder_el_rx = re.compile(r"<\s*placeholder-(\d+)\s*/\s*>")

def check_placeholder_els (orig, trans):
    """
    Check if sets of C{<placeholder-N/>} elements are matching between
    original and translated text.

    C{<placeholder-N/>} elements are added into text by C{xml2po},
    for finer segmentation of markup documents extracted into PO templates.

    Only the sets of placeholder numbers are compared; order and
    repetition of elements are not checked. At most one problem is
    reported, with missing elements taking precedence over superfluous
    ones. The reported span is always C{(0, 0)}, as the problem is not
    tied to a particular position in the translation.

    See L{validate_xml_l1} for description of the return value.

    @param orig: original text
    @type orig: string
    @param trans: translated text
    @type trans: string

    @returns: erroneous spans in translation
    @rtype: list of (int, int, string) tuples
    """

    orig_plnums = set(_placeholder_el_rx.findall(orig))
    trans_plnums = set(_placeholder_el_rx.findall(trans))

    # Sort numerically (with the raw string as tie-breaker) so that
    # the message does not depend on set iteration order.
    def numkey (plnum):
        return (int(plnum), plnum)

    missing_plnums = sorted(orig_plnums - trans_plnums, key=numkey)
    extra_plnums = sorted(trans_plnums - orig_plnums, key=numkey)

    spans = []

    # The tags are handed to format_item_list() as a list of items;
    # a single joined string would be treated as a sequence
    # of individual characters.
    if missing_plnums:
        tags = ["<placeholder-%s/>" % x for x in missing_plnums]
        errmsg = _("@info",
                   "Missing placeholder tags in translation: %(taglist)s.",
                   taglist=format_item_list(tags))
        spans.append((0, 0, errmsg))
    elif extra_plnums: # do not report both, single glitch may cause them
        tags = ["<placeholder-%s/>" % x for x in extra_plnums]
        errmsg = _("@info",
                   "Superfluous placeholder tags in translation: %(taglist)s.",
                   taglist=format_item_list(tags))
        spans.append((0, 0, errmsg))

    return spans
2060