File indexing completed on 2024-11-03 11:24:06

0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Check validity of translation in catalogs of The Battle for Wesnoth.
0005 
0006 Documented in C{doc/user/sieving.docbook}.
0007 
0008 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0009 @license: GPLv3
0010 """
0011 
0012 import os
0013 import re
0014 
0015 from pology import _, n_
0016 from pology.report import report, format_item_list
0017 from pology.msgreport import report_on_msg_hl, report_msg_content
0018 from pology.msgreport import report_msg_to_lokalize
0019 from pology.sieve import add_param_poeditors
0020 from pology.sieve import SieveError
0021 from pology.message import MessageUnsafe
0022 from functools import reduce
0023 
0024 
0025 _ctxtsep = "^"
0026 
0027 
def setup_sieve (p):
    """
    Declare the description and the parameters of this sieve.

    Parameters declared here are C{check} (comma-separated list of
    check keywords to run) and C{showmsg} (whether to show the whole
    problematic message); further parameters are added by
    C{add_param_poeditors}.

    @param p: object collecting the sieve description and parameters,
        used through its C{set_desc} and C{add_param} methods
    """

    p.set_desc(_("@info sieve discription",
    "Check validity of messages in catalogs of The Battle for Wesnoth."
    ))

    # Offer the available check keywords in sorted order.
    check_names = sorted(_known_checks)
    p.add_param("check", str, seplist=True,
                metavar=_("@info sieve parameter value placeholder",
                          "KEYWORD,..."),
                desc=_("@info sieve parameter discription",
    "Run only this check instead of all (currently available: %(chklist)s). "
    "Several checks can be specified as a comma-separated list.",
    chklist=format_item_list(check_names)
    ))

    p.add_param("showmsg", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Also show the full message that had some problems."
    ))

    add_param_poeditors(p)
0048 
0049 
class Sieve (object):
    """
    Sieve which runs Battle for Wesnoth translation checks on messages.

    Checks applicable to a catalog are chosen when its header is processed,
    problems are reported message by message, and the total number
    of problems is reported on finalization.
    """

    def __init__ (self, params):
        """
        @param params: sieve parameters; attributes C{check}, C{showmsg}
            and C{lokalize} are read

        @raise SieveError: if a check keyword in C{params.check}
            is not among known checks
        """

        # None means that no explicit selection of checks was made.
        self.selected_checks = None
        requested = params.check
        if requested is not None:
            unknown_checks = [x for x in requested if x not in _known_checks]
            if unknown_checks:
                fmtchecks = format_item_list(unknown_checks)
                raise SieveError(
                    _("@info",
                      "Unknown checks selected: %(chklist)s.",
                      chklist=fmtchecks))
            self.selected_checks = set(requested)

        self.showmsg = params.showmsg
        self.lokalize = params.lokalize

        # Indicators to the caller:
        self.caller_sync = False # no need to sync catalogs to the caller
        self.caller_monitored = False # no need for monitored messages

        # Running total of problems found, over all catalogs.
        self.nproblems = 0


    def process_header (self, hdr, cat):
        """
        Select checks to run on messages of the given catalog.

        @param hdr: catalog header (not used)
        @param cat: catalog, queried for characteristic messages
        """

        # Determine applicable checks by characteristic message.
        # Ugly, but no catalog name and nothing in header.
        if cat.select_by_key(None, "en"):
            names = ["docbook"]
        elif cat.select_by_key(None, "wesnothd"):
            names = ["man"]
        else:
            names = ["ctxtsep", "interp", "wml", "pango", "space"]

        # Of the applicable checks, keep only those explicitly selected.
        if self.selected_checks is not None:
            names = set(names).intersection(self.selected_checks)

        self.current_checks = [_known_checks[name] for name in names]


    def process (self, msg, cat):
        """
        Run current checks on a translated message and report problems.

        Untranslated messages are skipped.

        @param msg: message to check
        @param cat: catalog to which the message belongs
        """

        if not msg.translated:
            return

        # Convert embedded to proper context.
        sep_pos = msg.msgid.find(_ctxtsep)
        if sep_pos >= 0:
            msg = MessageUnsafe(msg) # should not modify original message
            msg.msgctxt = msg.msgid[:sep_pos]
            msg.msgid = msg.msgid[sep_pos + len(_ctxtsep):]

        # Each check adds to the highlight and returns its problem count.
        highlight = []
        for check in self.current_checks:
            self.nproblems += check(msg, cat, False, highlight)

        if not highlight:
            return

        if self.showmsg:
            report_msg_content(msg, cat, highlight=highlight,
                               delim=("-" * 20))
        else:
            report_on_msg_hl(highlight, msg, cat)
        if self.lokalize:
            report_msg_to_lokalize(msg, cat, highlight)


    def finalize (self):
        """
        Report the total number of problems found, if there were any.
        """

        if not self.nproblems > 0:
            return

        msg = n_("@info:progress BfW stands for \"Battle for Wesnoth\"",
                 "Found %(num)d problem in BfW translations.",
                 "Found %(num)d problems in BfW translations.",
                 num=self.nproblems)
        report("===== " + msg)
0132 
0133 
0134 # --------------------------------------
0135 # Check for mistranslated contexts.
0136 
def _check_ctxtsep (msg, cat, strict, hl):
    """
    Check that no translation contains the context separator.

    Only the first occurrence of the separator in each translation
    text is reported.

    @param msg: message to check
    @param cat: catalog of the message (not used)
    @param strict: not used
    @param hl: list to which highlight entries are appended

    @return: number of problems found
    """

    nproblems = 0
    for i, text in enumerate(msg.msgstr):
        pos = text.find(_ctxtsep)
        if pos < 0:
            continue
        note = _("@info", "Stray context separator.")
        hl.append(("msgstr", i, [(pos, pos + len(_ctxtsep), note)]))
        nproblems += 1

    return nproblems
0149 
0150 
0151 # --------------------------------------
0152 # Check for congruence of interpolations.
0153 
def _check_interp (msg, cat, strict, hl):
    """
    Check that translations use the same interpolations as the original.

    For a plural message the interpolations of the plural original are
    taken as reference, and translation forms whose index is reported by
    C{cat.plural_indices_single()} are allowed to miss one interpolation.
    Interpolations listed in a manual comment starting with
    C{ignore-interpolations:} are not reported as missing.

    @param msg: message to check
    @param cat: catalog of the message
    @param strict: not used
    @param hl: list to which highlight entries are appended

    @return: number of problems found
    """

    def explicitly_ignored ():
        # Collect interpolations named in ignore-interpolations: comments,
        # adding the leading dollar sign where it was omitted.
        ignored = set()
        for cmnt in msg.manual_comment:
            cmnt = cmnt.strip()
            if not cmnt.startswith("ignore-interpolations:"):
                continue
            for interp in cmnt[cmnt.find(":") + 1:].split():
                interp = interp.strip()
                if not interp.startswith("$"):
                    interp = "$%s" % interp
                ignored.add(interp)
        return ignored

    def match_for_index (index, interps_orig, n_can_miss=0):
        interps_trans = _collect_interps(msg.msgstr[index])
        if interps_orig == interps_trans:
            return 0

        # Eliminate from check interpolations explicitly ignored.
        interps_missing = (interps_orig - interps_trans) - explicitly_ignored()
        interps_unknown = interps_trans - interps_orig

        # At most one problem is reported per translation text,
        # with missing interpolations taking precedence over unknown.
        if interps_missing and len(interps_missing) > n_can_miss:
            vfmt = format_item_list(interps_missing)
            note = _("@info",
                     "Missing interpolations: %(interplist)s.",
                     interplist=vfmt)
        elif interps_unknown:
            vfmt = format_item_list(interps_unknown)
            note = _("@info",
                     "Unknown interpolations: %(interplist)s.",
                     interplist=vfmt)
        else:
            return 0

        hl.append(("msgstr", index, [(None, None, note)]))
        return 1

    if msg.msgid_plural is None:
        return match_for_index(0, _collect_interps(msg.msgid))

    interps_orig = _collect_interps(msg.msgid_plural)
    indices_single = cat.plural_indices_single()
    nproblems = 0
    for i in range(len(msg.msgstr)):
        n_can_miss = 1 if i in indices_single else 0
        nproblems += match_for_index(i, interps_orig, n_can_miss)

    return nproblems
0202 
0203 
# Interpolation names are matched as ASCII-only on purpose, so that
# non-ASCII letters written right after an interpolation in translation
# are not taken as part of its name. For str patterns Unicode matching
# is the default, hence the explicit re.ASCII flag.
_interp_rx = re.compile(r"\$\w+(?:\.\w+)*", re.ASCII)

def _collect_interps (text):
    """
    Collect interpolations found in the text.

    An interpolation is a dollar sign followed by one or more
    dot-separated names made of ASCII letters, digits and underscores.

    @param text: text to search for interpolations

    @return: set of interpolation strings, including the dollar sign
    """

    return set(_interp_rx.findall(text))
0209 
0210 
0211 # --------------------------------------
0212 # Check for WML validity.
0213 
def _check_wml (msg, cat, strict, hl):
    """
    Check validity of WML markup in translations.

    The check is performed only if WML is detected as the markup of
    the message and the original text itself is valid. For each translation
    text, either markup problems are reported, or (if the markup is valid)
    a difference in links compared to the original.

    @param msg: message to check
    @param cat: catalog of the message
    @param strict: not used
    @param hl: list to which highlight entries are appended

    @return: number of problems found
    """

    if _detect_markup(msg, cat) != "wml":
        return 0

    # Validate WML in original and collect links.
    # If the original is not valid, do not check translation.
    spans_orig, links_orig = _check_wml_text(msg.msgid)
    if spans_orig:
        return 0

    nproblems = 0
    for i, text in enumerate(msg.msgstr):
        spans, links = _check_wml_text(text)
        if spans:
            hl.append(("msgstr", i, spans))
            nproblems += len(spans)
            continue
        if links == links_orig:
            continue

        # Missing links take precedence over unknown links.
        links_missing = links_orig - links
        links_unknown = links - links_orig
        if links_missing:
            vfmt = format_item_list(links_missing)
            note = _("@info",
                     "Missing links: %(linklist)s.",
                     linklist=vfmt)
        elif links_unknown:
            vfmt = format_item_list(links_unknown)
            note = _("@info",
                     "Unknown links: %(linklist)s.",
                     linklist=vfmt)
        else:
            continue
        hl.append(("msgstr", i, [(None, None, note)]))
        nproblems += 1

    return nproblems
0253 
0254 
_any_ws = re.compile(r"\s")

def _is_tag (tag):
    """
    Check whether the text between angle brackets can be a tag name.

    @param tag: text found between C{<} and C{>}

    @return: C{True} if the text contains no whitespace
    """

    return _any_ws.search(tag) is None
0260 
0261 
# Known WML tags, each with its attributes;
# the value by attribute tells whether the attribute is mandatory.
_known_tags = {
    "bold": {"text": True},
    "format": {
        "bold": False,
        "color": False,
        "font_size": False,
        "italic": False,
        "text": True,
    },
    "header": {"text": True},
    "img": {"align": False, "float": False, "src": True},
    "italic": {"text": True},
    "jump": {"amount": False, "to": False},
    "ref": {"dst": True, "force": False, "text": True},
}

# Values accepted by boolean attributes.
_bool_vals = {"no", "yes"}

# Validators of attribute values, by attribute name;
# each takes the value string and returns whether it is acceptable.
_att_val_check = {
    "align": lambda x: x in ("here", "left", "middle", "right"),
    "amount": lambda x: x.isdigit(),
    "bold": lambda x: x in _bool_vals,
    "color": lambda x: x in ("black", "green", "red", "white", "yellow"),
    "dst": lambda x: len(x) > 0,
    "float": lambda x: x in _bool_vals,
    "font_size": lambda x: x.isdigit(),
    "force": lambda x: x in _bool_vals,
    "italic": lambda x: x in _bool_vals,
    "src": lambda x: len(x) > 0,
    "text": lambda x: True,
    "to": lambda x: re.match(r"^[+-]\d+$", x) is not None,
}

# Attributes whose values are collected as links.
_link_atts = {"dst", "src"}
0288 
0289 
def _check_wml_text (text):
    """
    Check validity of WML markup in the text and collect links from it.

    The text is scanned for C{<tag>content</tag>} constructs.
    A problem with the opening tag or a missing closing tag stops the scan;
    mismatched closing tag and problems within the content do not.

    @param text: text to check

    @return: list of problem spans as C{(start, end, note)} tuples,
        and set of links found in attributes
    """

    spans = []
    links = set()
    pos = 0
    while True:
        # Locate and validate the opening tag.
        open_start = text.find("<", pos)
        if open_start < 0:
            break
        open_end = text.find(">", open_start)
        if open_end < 0:
            spans.append((open_start, len(text),
                          _("@info", "End of string within tag.")))
            break
        tag = text[open_start + 1:open_end]
        if not _is_tag(tag):
            spans.append((open_start, open_end,
                          _("@info",  "Invalid tag syntax.")))
            break
        if tag not in _known_tags:
            spans.append((open_start, open_end, _("@info", "Unknown tag.")))
            break

        # Locate the closing tag.
        close_start = text.find("</", open_end + 1)
        if close_start < 0:
            # NOTE(review): this span reaches one character before the tag
            # (start is -1 for a tag at offset 0) and ten characters past
            # its end, unlike all other spans here -- confirm it is intended.
            spans.append((open_start - 1, open_end + 10,
                          _("@info", "Unclosed tag.")))
            break
        close_end = text.find(">", close_start)
        if close_end < 0:
            spans.append((close_start, len(text),
                          _("@info", "Unterminated closing tag.")))
            break
        closing_tag = text[close_start + 2:close_end]

        # Any further errors do not terminate checking.
        pos = close_end + 1 # start position for next loop
        if closing_tag != tag:
            spans.append((close_start, close_end,
                          _("@info", "Mismatched opening and closing tags.")))
            continue

        # Check attributes in the content; spans reported for the content
        # are shifted to positions within the whole text.
        content_start = open_end + 1
        spans_att, links_att = _check_wml_att(tag,
                                              text[content_start:close_start])
        for att_start, att_end, note in spans_att:
            spans.append((content_start + att_start, content_start + att_end,
                          note))
        links.update(links_att)

    return spans, links
0333 
0334 
def _check_wml_att (tag, content):
    """
    Check attributes given in the content of a WML tag.

    The content is parsed as a sequence of C{name=value} pairs
    separated by whitespace. The name consists of letters and underscores
    (e.g. C{font_size}). The value is either enclosed in single quotes,
    or extends up to the next space; a backslash escapes
    the character following it.

    A problem with an attribute name (unknown to the tag, repeated,
    not followed by equal sign) stops the parsing, whereas an invalid value
    does not. Only if no other problems were found,
    missing mandatory attributes are reported, at span C{(0, 0)}.

    @param tag: name of the tag, which must be a key in C{_known_tags}
    @param content: text between the opening and the closing tag

    @return: list of problem spans as C{(start, end, note)} tuples
        with positions relative to the content, and set of values of
        link attributes
    """

    spans = []
    links = set()
    have_atts = set()
    lenc = len(content)
    p = 0
    while True:
        # Skip whitespace before the attribute.
        while p < lenc and content[p].isspace():
            p += 1
        if p >= lenc:
            break
        # Parse attribute.
        # Underscore must be accepted in the name,
        # or else known attributes like font_size could never be matched.
        p2 = p
        while p2 < lenc and (content[p2].isalpha() or content[p2] == "_"):
            p2 += 1
        if p2 >= lenc:
            spans.append((p, lenc,
                          _("@info", "End of tag content within attribute.")))
            break
        att = content[p:p2]
        if att not in _known_tags[tag]:
            spans.append((p, p2 + 1,
                          _("@info",
                            "'%(attr)s' is not an attribute of "
                            "tag '%(tag)s'.", attr=att, tag=tag)))
            break
        if content[p2] != "=":
            spans.append((p, p2 + 1,
                         _("@info", "No equal sign after attribute.")))
            break
        if att in have_atts:
            spans.append((p, p2 + 1,
                          _("@info",
                            "Attribute '%(attr)s' repeated.", attr=att)))
            break
        have_atts.add(att)
        # Parse value.
        p3 = p2 + 1
        if content[p3:p3 + 1] == "'":
            # Quoted value, ends at the next unescaped quote.
            terminator = "'"
            p3 += 1
        else:
            # Unquoted value, ends at the next unescaped space.
            terminator = " "
        p4 = p3
        while p4 < lenc and content[p4] != terminator:
            if content[p4] == "\\": # an escape
                p4 += 1
            p4 += 1
        val = content[p3:p4]
        if not _att_val_check[att](val):
            spans.append((p3, p4,
                          _("@info",
                            "Invalid value to attribute '%(attr)s'.",
                            attr=att)))
        if att in _link_atts:
            links.add(val)
        # Prepare next loop.
        p = p4 + 1

    # Report missing mandatory attributes only when nothing else is wrong,
    # as otherwise the set of seen attributes may be incomplete.
    if not spans:
        for att, mandatory in _known_tags[tag].items():
            if mandatory and att not in have_atts:
                spans.append((0, 0,
                              _("@info",
                                "Missing mandatory attribute '%(attr)s'.",
                                attr=att)))

    return spans, links
0404 
0405 
0406 # --------------------------------------
0407 # Check for Pango markup.
0408 
0409 from pology.markup import validate_pango_l1
0410 
def _check_pango (msg, cat, strict, hl):
    """
    Check validity of Pango markup in translations.

    The check is performed only if Pango is detected as the markup of
    the message and C{validate_pango_l1} reports no problems
    in the original text.

    @param msg: message to check
    @param cat: catalog of the message
    @param strict: not used
    @param hl: list to which highlight entries are appended

    @return: number of problems found
    """

    if _detect_markup(msg, cat) != "pango":
        return 0

    # If the original is not valid, do not check translation.
    if validate_pango_l1(msg.msgid):
        return 0

    nproblems = 0
    for i, text in enumerate(msg.msgstr):
        spans = validate_pango_l1(text)
        if not spans:
            continue
        hl.append(("msgstr", i, spans))
        nproblems += len(spans)

    return nproblems
0429 
0430 
0431 # --------------------------------------
0432 # Check for congruence of spaces.
0433 
# Languages for which outer spaces are checked.
_langs_w_outspc = (
    "sr", "sr@latin", "de", "lt", "fr", "ru", "sk", "is",
)

def _check_space (msg, cat, strict, hl):
    """
    Check congruence of leading and trailing spaces between
    the original and translations.

    The check is performed only if the catalog language
    (or the catalog name, if the language is not known)
    is listed in C{_langs_w_outspc}.

    Leading space is always considered significant.
    Trailing space in the original is considered significant only if
    an extracted comment contains C{outer-space-significant},
    if the original also has a leading space, or if the last non-whitespace
    character in the original is a colon.
    Extra trailing space in translation is deliberately not reported.

    @param msg: message to check
    @param cat: catalog of the message
    @param strict: not used
    @param hl: list to which highlight entries are appended

    @return: number of problems found
    """

    # Check only for explicitly listed languages.
    if (cat.language() or cat.name) not in _langs_w_outspc:
        return 0

    # Check if explicitly stated in extracted comment
    # that outer space in original is significant.
    kw_outspcsig = "outer-space-significant"
    outspcsig = any(kw_outspcsig in x.lower() for x in msg.auto_comment)

    haslead_o = msg.msgid.startswith(" ")
    hastail_o = msg.msgid.endswith(" ")
    # Last non-whitespace character in the original;
    # empty string if there are no such characters at all.
    tailnspc_o = msg.msgid.strip()[-1:]

    # Consider trailing space in original significant
    # if explicitly stated so, if it is preceded by colon,
    # or there was a leading space.
    # Colon is tested by equality: a containment test would also hold
    # for the empty string, i.e. when there is nothing before the space.
    tailsig_o = hastail_o and (outspcsig or haslead_o or tailnspc_o == ":")

    nproblems = 0
    for i, text in enumerate(msg.msgstr):
        haslead_t = text.startswith(" ")
        hastail_t = text.endswith(" ")

        if tailsig_o and not hastail_t:
            hl.append(("msgstr", i, [(-1, -1,
                                      _("@info", "Missing trailing space."))]))
            nproblems += 1

        # Consider leading space always significant.
        if haslead_o and not haslead_t:
            hl.append(("msgstr", i, [(0, 0,
                                      _("@info", "Missing leading space."))]))
            nproblems += 1

        # Extra trailing space in translation (when the original has none)
        # is not reported: usually invisible and yet frequent.

        # If original has no leading space,
        # translation should also have none.
        if not haslead_o and haslead_t:
            hl.append(("msgstr", i, [(0, 0,
                                      _("@info", "Extra leading space."))]))
            nproblems += 1

    return nproblems
0491 
0492 
0493 # --------------------------------------
0494 # Check for Docbook markup.
0495 
0496 from pology.markup import check_docbook4_msg
0497 
# One-element cache for the Docbook checker, created on first use.
_check_dbmarkup_pt = [None]

def _check_dbmarkup (msg, cat, strict, hl):
    """
    Check validity of Docbook markup in the message.

    The checker is created by C{check_docbook4_msg} on the first call
    and reused afterwards, so the C{strict} value of later calls
    has no effect.

    @param msg: message to check
    @param cat: catalog of the message
    @param strict: passed to C{check_docbook4_msg} when creating the checker
    @param hl: list which is extended by highlight entries from the checker

    @return: number of problems found, as the total number of spans
        in highlight entries returned by the checker
    """

    checker = _check_dbmarkup_pt[0]
    if not checker:
        checker = check_docbook4_msg(strict=strict, entities=None)
        _check_dbmarkup_pt[0] = checker

    found = checker(msg, cat)
    hl.extend(found)

    nproblems = 0
    for part in found:
        nproblems += len(part[2])
    return nproblems
0510 
0511 
0512 # --------------------------------------
0513 # Check for man markup.
0514 
def _check_man (msg, cat, strict, hl):
    """
    Check validity of man markup in the message.

    Not implemented yet: no problems are ever reported.

    @return: number of problems found (always zero)
    """

    # TODO.
    nproblems = 0

    return nproblems
0520 
0521 
0522 # --------------------------------------
0523 # Map of all existing checks.
0524 
# Check functions by keyword.
# Keywords are what the "check" sieve parameter accepts and what
# the sieve uses to choose checks applicable to a catalog.
# Every check function is called as check(msg, cat, strict, hl):
# it appends highlight entries for problems found to the hl list,
# and returns the number of problems.
_known_checks = {
    "ctxtsep": _check_ctxtsep,
    "interp": _check_interp,
    "wml": _check_wml,
    "pango": _check_pango,
    "space": _check_space,
    "docbook": _check_dbmarkup,
    "man": _check_man,
}
0534 
0535 # --------------------------------------
0536 # Utilities.
0537 
0538 # Try to heuristically detect which type of markup is used in the message.
0539 # Detection is conservative: better report no markup, than wrong markup.
0540 
0541 from pology.markup import collect_xml_spec_l1
0542 from pology import datadir
0543 
# Tags taken to indicate WML markup: same as those known to the WML check.
_tags_wml = _known_tags
# Tags taken to indicate Pango markup: keys of what collect_xml_spec_l1
# returns for the "spec/pango.l1" file under the data directory.
# The file is read when this module is loaded.
_specpath = os.path.join(datadir(), "spec", "pango.l1")
_tags_pango = list(collect_xml_spec_l1(_specpath).keys())

# Matches the first "<name ...>" construct in the text, capturing the name.
_first_tag_rx = re.compile(r"<\s*(\w+)[^>]*>", re.U)
0549 
0550 
0551 # Return keyword of markup detected in the text.
def _detect_markup_in_text (text):
    """
    Detect the type of markup by the first tag found in the text.

    @param text: text to examine

    @return: C{"wml"} or C{"pango"} if the first tag is known as such
        (WML is tried first), C{"unknown"} if it is neither,
        and C{None} if there are no tags in the text
    """

    m = _first_tag_rx.search(text)
    if m is None:
        return None

    tag = m.group(1)
    if tag in _tags_wml:
        return "wml"
    if tag in _tags_pango:
        return "pango"
    return "unknown"
0565 
0566 
0567 # Return keyword of markup detected in the message.
def _detect_markup (msg, cat):
    """
    Detect the type of markup used in the message.

    The original text is examined first; only if no tags are found
    in it, the first translation text is examined instead.

    @param msg: message to examine
    @param cat: catalog of the message (not used)

    @return: markup keyword as returned by C{_detect_markup_in_text}
    """

    # First look into original text.
    markup_type = _detect_markup_in_text(msg.msgid)
    if markup_type is not None:
        return markup_type

    # If no markup determined from there, look into translation.
    return _detect_markup_in_text(msg.msgstr[0])
0577