File indexing completed on 2024-11-03 11:24:06
# -*- coding: UTF-8 -*-

"""
Check validity of translation in catalogs of The Battle for Wesnoth.

Documented in C{doc/user/sieving.docbook}.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import os
import re
from functools import reduce

from pology import _, n_
from pology import datadir
from pology.report import report, format_item_list
from pology.msgreport import report_on_msg_hl, report_msg_content
from pology.msgreport import report_msg_to_lokalize
from pology.sieve import add_param_poeditors
from pology.sieve import SieveError
from pology.message import MessageUnsafe
from pology.markup import validate_pango_l1
from pology.markup import check_docbook4_msg
from pology.markup import collect_xml_spec_l1


# Separator between context and text when the context is embedded
# into the original text (and must not show up in the translation).
_ctxtsep = "^"


def setup_sieve(p):
    """
    Declare the description and the parameters of this sieve.

    @param p: object on which the sieve description and parameters
        are registered (through C{set_desc} and C{add_param})
    """

    # NOTE(review): the first argument of each _() call is presumably
    # the translation context and hence part of the lookup key;
    # all strings (including the "discription" spelling) are kept verbatim.
    p.set_desc(_("@info sieve discription",
    "Check validity of messages in catalogs of The Battle for Wesnoth."
    ))
    chnames = sorted(_known_checks)
    p.add_param("check", str, seplist=True,
                metavar=_("@info sieve parameter value placeholder",
                          "KEYWORD,..."),
                desc=_("@info sieve parameter discription",
    "Run only this check instead of all (currently available: %(chklist)s). "
    "Several checks can be specified as a comma-separated list.",
    chklist=format_item_list(chnames)
                ))
    p.add_param("showmsg", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Also show the full message that had some problems."
                ))
    add_param_poeditors(p)


class Sieve(object):
    """
    Sieve which runs the checks from C{_known_checks} on translated
    messages, reports the problems found and counts them.
    """

    def __init__(self, params):
        """
        Validate and store sieve parameters.

        @param params: parameter values; C{check}, C{showmsg} and
            C{lokalize} attributes are read
        @raise SieveError: if any name in C{params.check} is not
            a key of C{_known_checks}
        """

        # Set of check names requested by the user,
        # or None if no restriction was requested.
        self.selected_checks = None
        if params.check is not None:
            unknown_checks = [chname for chname in params.check
                              if chname not in _known_checks]
            if unknown_checks:
                fmtchecks = format_item_list(unknown_checks)
                raise SieveError(
                    _("@info",
                      "Unknown checks selected: %(chklist)s.",
                      chklist=fmtchecks))
            self.selected_checks = set(params.check)

        self.showmsg = params.showmsg
        self.lokalize = params.lokalize

        # Indicators to the caller:
        self.caller_sync = False # no need to sync catalogs to the caller
        self.caller_monitored = False # no need for monitored messages

        self.nproblems = 0

        # Check functions applicable to the current catalog;
        # filled in by process_header(). Starting out empty means that
        # process() called before process_header() runs no checks,
        # instead of failing on a missing attribute.
        self.current_checks = []


    def process_header(self, hdr, cat):
        """
        Select the checks to run on messages of the given catalog.

        @param hdr: catalog header (not used)
        @param cat: catalog about to be processed
        """

        # Determine applicable checks by characteristic message.
        # Ugly, but no catalog name and nothing in header.
        if cat.select_by_key(None, "en"):
            names = ["docbook"]
        elif cat.select_by_key(None, "wesnothd"):
            names = ["man"]
        else:
            names = ["ctxtsep", "interp", "wml", "pango", "space"]

        # Narrow down to the checks requested by the user, keeping
        # the order above so that reports come out in a stable order.
        if self.selected_checks is not None:
            names = [x for x in names if x in self.selected_checks]

        self.current_checks = [_known_checks[x] for x in names]


    def process(self, msg, cat):
        """
        Run the current checks on one message and report any problems.

        Messages which are not translated are skipped.

        @param msg: message to check
        @param cat: catalog to which the message belongs
        """

        if not msg.translated:
            return

        highlight = []

        # Convert embedded to proper context.
        if _ctxtsep in msg.msgid:
            ctxt, sep, text = msg.msgid.partition(_ctxtsep)
            msg = MessageUnsafe(msg) # should not modify original message
            msg.msgctxt = ctxt
            msg.msgid = text

        for check in self.current_checks:
            self.nproblems += check(msg, cat, False, highlight)

        if highlight:
            if self.showmsg:
                report_msg_content(msg, cat, highlight=highlight,
                                   delim=("-" * 20))
            else:
                report_on_msg_hl(highlight, msg, cat)
            if self.lokalize:
                report_msg_to_lokalize(msg, cat, highlight)


    def finalize(self):
        """
        Report the total number of problems, if there were any.
        """

        if self.nproblems > 0:
            msg = n_("@info:progress BfW stands for \"Battle for Wesnoth\"",
                     "Found %(num)d problem in BfW translations.",
                     "Found %(num)d problems in BfW translations.",
                     num=self.nproblems)
            report("===== " + msg)


# All check functions below share the signature (msg, cat, strict, hl):
# they append highlight entries of the form
# ("msgstr", index, [(start, end, note), ...]) to the hl list,
# and return the number of problems found.

# --------------------------------------
# Check for mistranslated contexts.

def _check_ctxtsep(msg, cat, strict, hl):
    """
    Report the context separator appearing in translation.

    Only the first occurrence in each C{msgstr} is reported.

    @return: number of problems found
    """

    nproblems = 0
    for i, text in enumerate(msg.msgstr):
        p = text.find(_ctxtsep)
        if p >= 0:
            hl.append(("msgstr", i,
                       [(p, p + len(_ctxtsep),
                         _("@info", "Stray context separator."))]))
            nproblems += 1

    return nproblems


# --------------------------------------
# Check for congruence of interpolations.

def _check_interp(msg, cat, strict, hl):
    """
    Report interpolations (C{$name}) which differ between
    the original and the translation.

    Interpolations listed in a manual comment starting with
    C{ignore-interpolations:} are not reported as missing.
    For plural messages, the plural original is the reference,
    and forms with indices in C{cat.plural_indices_single()}
    may miss one interpolation.

    @return: number of problems found
    """

    def match_for_index(index, interps_orig, n_can_miss=0):
        # Compare interpolations of msgstr[index] to those of the original;
        # up to n_can_miss missing interpolations are tolerated.
        nproblems = 0
        interps_trans = _collect_interps(msg.msgstr[index])
        if interps_orig != interps_trans:
            interps_missing = interps_orig.difference(interps_trans)
            # Eliminate from check interpolations explicitly ignored.
            for cmnt in [x.strip() for x in msg.manual_comment]:
                if cmnt.startswith("ignore-interpolations:"):
                    interps = cmnt[cmnt.find(":") + 1:].split()
                    for interp in interps:
                        interp = interp.strip()
                        # Leading dollar sign is optional in the comment.
                        if not interp.startswith("$"):
                            interp = "$%s" % interp
                        interps_missing.discard(interp)
            interps_unknown = interps_trans.difference(interps_orig)
            # Sets are sorted before formatting to get stable reports.
            if interps_missing and len(interps_missing) > n_can_miss:
                vfmt = format_item_list(sorted(interps_missing))
                hl.append(("msgstr", index,
                           [(None, None,
                             _("@info",
                               "Missing interpolations: %(interplist)s.",
                               interplist=vfmt))]))
                nproblems += 1
            elif interps_unknown:
                vfmt = format_item_list(sorted(interps_unknown))
                hl.append(("msgstr", index,
                           [(None, None,
                             _("@info",
                               "Unknown interpolations: %(interplist)s.",
                               interplist=vfmt))]))
                nproblems += 1
        return nproblems

    nproblems = 0
    if msg.msgid_plural is None:
        interps_orig = _collect_interps(msg.msgid)
        nproblems += match_for_index(0, interps_orig)
    else:
        interps_orig = _collect_interps(msg.msgid_plural)
        indices_single = cat.plural_indices_single()
        for i in range(len(msg.msgstr)):
            n_can_miss = 1 if i in indices_single else 0
            nproblems += match_for_index(i, interps_orig, n_can_miss)

    return nproblems


_interp_rx = re.compile(r"\$\w+(?:\.\w+)*") # intentionally no re.U flag

def _collect_interps(text):
    """
    Return the set of all interpolations found in the text.
    """

    return set(_interp_rx.findall(text))


# --------------------------------------
# Check for WML validity.

def _check_wml(msg, cat, strict, hl):
    """
    Report invalid WML markup in translation, and links which
    differ between the original and the translation.

    Nothing is checked unless WML markup is detected in the message,
    nor when the original itself does not validate.

    @return: number of problems found
    """

    if _detect_markup(msg, cat) != "wml":
        return 0

    # Validate WML in original and collect links.
    # If the original is not valid, do not check translation.
    spans_orig, links_orig = _check_wml_text(msg.msgid)
    if spans_orig:
        return 0

    nproblems = 0
    for i, text in enumerate(msg.msgstr):
        spans, links = _check_wml_text(text)
        if spans:
            hl.append(("msgstr", i, spans))
            nproblems += len(spans)
        elif links != links_orig:
            links_missing = links_orig.difference(links)
            links_unknown = links.difference(links_orig)
            # Sets are sorted before formatting to get stable reports.
            if links_missing:
                vfmt = format_item_list(sorted(links_missing))
                hl.append(("msgstr", i,
                           [(None, None,
                             _("@info",
                               "Missing links: %(linklist)s.",
                               linklist=vfmt))]))
                nproblems += 1
            elif links_unknown:
                vfmt = format_item_list(sorted(links_unknown))
                hl.append(("msgstr", i,
                           [(None, None,
                             _("@info",
                               "Unknown links: %(linklist)s.",
                               linklist=vfmt))]))
                nproblems += 1

    return nproblems


_any_ws = re.compile(r"\s")

def _is_tag(tag):
    """
    Return True if the string between angle brackets can be a tag name,
    i.e. if it contains no whitespace.
    """

    return not _any_ws.search(tag)


# Known WML tags, each with its known attributes;
# the value tells whether the attribute is mandatory.
_known_tags = {
    "bold": {"text": True},
    "format": {"bold": False, "color": False, "font_size": False,
               "italic": False, "text": True},
    "header": {"text": True},
    "img": {"align": False, "float": False, "src": True},
    "italic": {"text": True},
    "jump": {"amount": False, "to": False},
    "ref": {"dst": True, "force": False, "text": True},
}
_bool_vals = set(["no", "yes"])
# Validators of attribute values, by attribute name.
_att_val_check = {
    "align" : lambda x: x in ["here", "left", "middle", "right"],
    "amount" : lambda x: x.isdigit(),
    "bold" : lambda x: x in _bool_vals,
    "color" : lambda x: x in ["black", "green", "red", "white", "yellow"],
    "dst" : lambda x: len(x) > 0,
    "float" : lambda x: x in _bool_vals,
    "font_size" : lambda x: x.isdigit(),
    "force" : lambda x: x in _bool_vals,
    "italic" : lambda x: x in _bool_vals,
    "src" : lambda x: len(x) > 0,
    "text" : lambda x: True,
    "to" : lambda x: bool(re.match(r"^[+-]\d+$", x)),
}
# Attributes whose values are collected as links.
_link_atts = set(["dst", "src"])


def _check_wml_text(text):
    """
    Validate WML markup in the text.

    @return: list of problem spans C{(start, end, note)} and
        set of link values found in valid tags
    """

    spans = []
    links = set()
    p = 0
    while True:
        p = text.find("<", p)
        if p < 0:
            break
        p2 = text.find(">", p)
        if p2 < 0:
            spans.append((p, len(text),
                          _("@info", "End of string within tag.")))
            break
        tag = text[p + 1:p2]
        if not _is_tag(tag):
            spans.append((p, p2, _("@info", "Invalid tag syntax.")))
            break
        if tag not in _known_tags:
            spans.append((p, p2, _("@info", "Unknown tag.")))
            break
        p3 = text.find("</", p2 + 1)
        if p3 < 0:
            # NOTE(review): this span is wider than the tag itself
            # (and starts at -1 for a tag at position 0);
            # kept as it was, confirm whether it is intentional.
            spans.append((p - 1, p2 + 10, _("@info", "Unclosed tag.")))
            break
        p4 = text.find(">", p3)
        if p4 < 0:
            spans.append((p3, len(text),
                          _("@info", "Unterminated closing tag.")))
            break
        tag2 = text[p3 + 2:p4]
        # Any further errors do not terminate checking.
        p = p4 + 1 # start position for next loop
        if tag2 != tag:
            spans.append((p3, p4,
                          _("@info", "Mismatched opening and closing tags.")))
            continue
        spans_att, links_att = _check_wml_att(tag, text[p2 + 1:p3])
        # Spans are reported relative to tag content, shift them to text.
        spans.extend([(p2 + 1 + pi1, p2 + 1 + pi2, note)
                      for pi1, pi2, note in spans_att])
        links.update(links_att)

    return spans, links


def _check_wml_att(tag, content):
    """
    Validate attributes given in the content of a WML tag.

    The content is a sequence of C{name=value} or C{name='value'}
    items separated by whitespace.

    @param tag: tag name, a key of C{_known_tags}
    @param content: text between the opening and the closing tag
    @return: list of problem spans C{(start, end, note)} relative to
        the content, and set of values of link attributes
    """

    spans = []
    links = set()
    have_atts = set()
    lenc = len(content)
    p = 0
    while True:
        while p < lenc and content[p].isspace():
            p += 1
        if p >= lenc:
            break
        # Parse attribute.
        # Underscore must be accepted as well, or else known attributes
        # such as 'font_size' would get cut short and reported as invalid.
        p2 = p
        while p2 < lenc and (content[p2].isalpha() or content[p2] == "_"):
            p2 += 1
        if p2 >= lenc:
            spans.append((p, lenc,
                          _("@info", "End of tag content within attribute.")))
            break
        att = content[p:p2]
        if att not in _known_tags[tag]:
            spans.append((p, p2 + 1,
                          _("@info",
                            "'%(attr)s' is not an attribute of "
                            "tag '%(tag)s'.", attr=att, tag=tag)))
            break
        if content[p2] != "=":
            spans.append((p, p2 + 1,
                          _("@info", "No equal sign after attribute.")))
            break
        if att in have_atts:
            spans.append((p, p2 + 1,
                          _("@info",
                            "Attribute '%(attr)s' repeated.", attr=att)))
            break
        have_atts.add(att)
        # Parse value.
        # Quoted value runs to the closing quote, unquoted to first space.
        p3 = p2 + 1
        if content[p3:p3 + 1] == "'":
            terminator = "'"
            p3 += 1
        else:
            terminator = " "
        p4 = p3
        while p4 < lenc and content[p4] != terminator:
            if content[p4] == "\\": # an escape
                p4 += 1
            p4 += 1
        val = content[p3:p4]
        if not _att_val_check[att](val):
            spans.append((p3, p4,
                          _("@info",
                            "Invalid value to attribute '%(attr)s'.",
                            attr=att)))
        if att in _link_atts:
            links.add(val)
        # Prepare next loop.
        p = p4 + 1

    # Mandatory attributes are checked only if nothing else went wrong.
    if not spans:
        for att, mandatory in _known_tags[tag].items():
            if mandatory and att not in have_atts:
                spans.append((0, 0,
                              _("@info",
                                "Missing mandatory attribute '%(attr)s'.",
                                attr=att)))

    return spans, links


# --------------------------------------
# Check for Pango markup.

def _check_pango(msg, cat, strict, hl):
    """
    Report invalid Pango markup in translation.

    Nothing is checked unless Pango markup is detected in the message,
    nor when the original itself does not validate.

    @return: number of problems found
    """

    if _detect_markup(msg, cat) != "pango":
        return 0

    # If the original is not valid, do not check translation.
    spans_orig = validate_pango_l1(msg.msgid)
    if spans_orig:
        return 0

    nproblems = 0
    for i, text in enumerate(msg.msgstr):
        spans = validate_pango_l1(text)
        if spans:
            hl.append(("msgstr", i, spans))
            nproblems += len(spans)

    return nproblems


# --------------------------------------
# Check for congruence of spaces.

_langs_w_outspc = (
    "sr", "sr@latin", "de", "lt", "fr", "ru", "sk", "is",
)

def _check_space(msg, cat, strict, hl):
    """
    Report leading and trailing spaces which differ between
    the original and the translation.

    The check is performed only for languages in C{_langs_w_outspc}.

    @return: number of problems found
    """

    # Check only for explicitly listed languages.
    if (cat.language() or cat.name) not in _langs_w_outspc:
        return 0

    # Check if explicitly stated in extracted comment
    # that outer space in original is significant.
    kw_outspcsig = "outer-space-significant"
    outspcsig = any(kw_outspcsig in x.lower() for x in msg.auto_comment)

    nproblems = 0
    haslead_o = msg.msgid.startswith(" ")
    hastail_o = msg.msgid.endswith(" ")
    tailnspc_o = msg.msgid.strip()[-1:]
    for i, text in enumerate(msg.msgstr):
        haslead_t = text.startswith(" ")
        hastail_t = text.endswith(" ")

        # Consider trailing space in original significant
        # if explicitly stated so, if it is preceded by colon,
        # or there was a leading space.
        if (    hastail_o and not hastail_t
            and (outspcsig or haslead_o or tailnspc_o in ":")
        ):
            hl.append(("msgstr", i, [(-1, -1,
                       _("@info", "Missing trailing space."))]))
            nproblems += 1

        # Consider leading space always significant.
        if haslead_o and not haslead_t:
            hl.append(("msgstr", i, [(0, 0,
                       _("@info", "Missing leading space."))]))
            nproblems += 1

        # Extra trailing space in translation (when the original
        # has none) is deliberately not reported:
        # usually invisible and yet frequent.

        # If original has no leading space,
        # translation should also have none.
        if not haslead_o and haslead_t:
            hl.append(("msgstr", i, [(0, 0,
                       _("@info", "Extra leading space."))]))
            nproblems += 1

    return nproblems


# --------------------------------------
# Check for Docbook markup.

# Single-element list holding the Docbook checker once it is created.
_check_dbmarkup_pt = [None]

def _check_dbmarkup(msg, cat, strict, hl):
    """
    Report invalid Docbook markup in translation.

    The underlying checker is created on first call and reused later.

    @return: number of problems found
    """

    # NOTE(review): strict is taken into account only on the first call;
    # later calls reuse the checker created then.
    if not _check_dbmarkup_pt[0]:
        _check_dbmarkup_pt[0] = check_docbook4_msg(strict=strict, entities=None)

    hl1 = _check_dbmarkup_pt[0](msg, cat)
    hl.extend(hl1)
    nproblems = sum(len(x[2]) for x in hl1)

    return nproblems


# --------------------------------------
# Check for man markup.

def _check_man(msg, cat, strict, hl):
    """
    Placeholder for checking man markup; reports nothing at present.

    @return: always 0
    """

    # TODO.

    return 0


# --------------------------------------
# Map of all existing checks.

_known_checks = {
    "ctxtsep": _check_ctxtsep,
    "interp": _check_interp,
    "wml": _check_wml,
    "pango": _check_pango,
    "space": _check_space,
    "docbook": _check_dbmarkup,
    "man": _check_man,
}

# --------------------------------------
# Utilities.

# Try to heuristically detect which type of markup is used in the message.
# Detection is conservative: better report no markup, than wrong markup.

_tags_wml = _known_tags
_specpath = os.path.join(datadir(), "spec", "pango.l1")
_tags_pango = list(collect_xml_spec_l1(_specpath).keys())

_first_tag_rx = re.compile(r"<\s*(\w+)[^>]*>", re.U)


# Return keyword of markup detected in the text.
def _detect_markup_in_text(text):
    """
    Return keyword of markup detected in the text.

    The decision is made by the first tag found in the text.

    @return: C{"wml"} or C{"pango"} if the first tag is known as such,
        C{"unknown"} if it is neither, and C{None} if there are no tags
    """

    found = _first_tag_rx.search(text)
    if not found:
        return None

    first_tag = found.group(1)
    if first_tag in _tags_wml:
        return "wml"
    if first_tag in _tags_pango:
        return "pango"
    return "unknown"


def _detect_markup(msg, cat):
    """
    Return keyword of markup detected in the message.

    @return: same as C{_detect_markup_in_text}
    """

    # First look into original text.
    detected = _detect_markup_in_text(msg.msgid)
    if detected is not None:
        return detected

    # If no markup determined from there, look into translation.
    return _detect_markup_in_text(msg.msgstr[0])