File indexing completed on 2024-05-12 05:47:03

0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Various checks for translations into Serbian.
0005 
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009 
0010 import re
0011 
0012 from pology import _, n_
0013 from pology.diff import adapt_spans
0014 from pology.msgreport import warning_on_msg
0015 
# ----------------------------------------
# Checks for presence of naked Latin segments.
#
# The pattern tables below are consumed by the hook worker: every match of
# every "no-check" pattern is deleted from the translation, and whatever
# Latin letters remain afterwards are reported as naked.

# Directive head for manual GUI references in docs.
_gui_man_dir = "~%"

# Directive head for explicit wrapping to ignore Latin.
_lat_wrap_dir = "~!"

# - segments not to be considered naked
# The patterns are applied one after another in the order listed here,
# each on the output of the previous one, so the order is significant
# (see the "must come after" notes).
_no_check_lat_rxs = (
    # - explicitly wrapped to ignore; the character right after the
    # directive head is the delimiter that also closes the segment
    re.compile(_lat_wrap_dir + r"(.)(.*?)\1", re.U|re.I|re.S),
    # - format directives
    re.compile(r"%(\(\w+\))? ?\d?\$?[\d.]*[a-z]+", re.U|re.I), # C-like
    re.compile(r"%\d+", re.U|re.I), # Qt-like, must come after C-like
    # - text within these tags
    re.compile(r"<\s*(%s)\b.*?\b\1\s*>" % "|".join("""
        bcode command envar filename icode shortcut placeholder style code tt
        literal screen option keycap userinput systemitem prompt function
        foreignphrase varname programlisting token markup parameter keysym
        methodname replaceable sgmltag arg classname type package errorcode
    """.split()), re.U|re.I|re.S),
    # - some tags are requested without attributes, as otherwise
    # Latin-content is allowed inside attributes only.
    re.compile(r"<\s*(%s)\s*>\b.*?\b\1\s*>" % "|".join("""
        email link
    """.split()), re.U|re.I|re.S),
    # - all tags (must come after the above text removed by tags)
    re.compile(r"<.*?>", re.U|re.I),
    # - entities: named (&foo;) as well as numeric character references,
    # decimal (&#160;) and hexadecimal (&#x00a0;); without the numeric
    # alternatives the letters inside e.g. "&#x00a0;" would be left behind
    # and reported as naked Latin
    re.compile(r"&(?:#x[\da-f]+|#\d+|[\w_:][\w\d._:-]*);", re.U|re.I),
    # - command line options
    re.compile(r"(?<!\w)--?\w[\w-]*", re.U|re.I),
    # - hex numbers
    re.compile(r"0x[\dabcdef]*", re.U|re.I),
    # - alternatives directives
    re.compile(r"~@(.)(.*?)\1(.*?)\1", re.U|re.I|re.S),
    # - extension filter, e.g. "*.png|PNG files"
    re.compile(r"^.*\*\..*\|", re.U|re.I),
    # - URLs and web links
    re.compile(r"\S+://\S*[\w&=]", re.U),
    re.compile(r"\w{3,}(\.[\w-]{2,})+", re.U),
    # - wiki stuff
    re.compile(r"\[\[[^\]]*(\||\])", re.U|re.I),
    re.compile(r"\[[^\s]*", re.U|re.I),
    re.compile(r"\{\{.*?(\||\}\})", re.U|re.I),
    # - double escapings
    re.compile(r"\\[nt]"),
)

# - additional segments not to be considered naked when original UI
# references are allowed; the worker applies these before the general
# table above, so that the wrapping tags are still present to match on
_no_check_lat_origui_rxs = (
    # - automatic by tags
    re.compile(r"<\s*(gui[a-z]+)\b.*?\b\1\s*>", re.U|re.I|re.S), # Docbook
    re.compile(r"<\s*(interface)\b.*?\b\1\s*>", re.U|re.I|re.S), # KUIT
    # - manually wrapped
    re.compile(_gui_man_dir + r"(.)(.*?)\1", re.U|re.I|re.S),
)

# Warn on naked-Latin if this matches.
# A match starts at a Latin letter and extends over following Latin letters
# and non-word characters, so a run of Latin words separated by spaces or
# punctuation is reported as a single segment.
_naked_latin_rx = re.compile(r"[a-z][a-z\W]*", re.U|re.I)

# Messages to skip by tags in auto comments.
# Matches an auto comment consisting solely of "Tag: <name>" for one of
# the listed tag names.
_auto_cmnt_tag_rx = re.compile(r"^\s*Tag:\s*(%s)\s*$" % "|".join("""
    filename envar programlisting screen command option userinput cmdsynopsis
    email errorcode
""".split()), re.U|re.I)
0082 
# The hook worker.
def _naked_latin_w (msgstr, msg, cat, origui=False, sideeffect=False):
    """
    Common implementation behind the naked-Latin hooks.

    All segments in which Latin is sanctioned are deleted from the
    translation, and every run of Latin letters still left is treated
    as an offence.

    @param msgstr: translation text to check
    @param msg: message the text belongs to; its C{msgctxt} and
        C{auto_comment} attributes are examined
    @param cat: catalog the message belongs to; its C{name} attribute
        is examined
    @param origui: whether original UI references are sanctioned too
    @param sideeffect: if true, each offence is reported through
        C{warning_on_msg} and the number of offences is returned;
        otherwise the offending spans are returned, after being passed
        through C{adapt_spans} together with the original and
        the stripped text

    @return: number of offences or list of spans, depending on
        C{sideeffect}; zero or empty list for messages exempt from checking
    """

    # What an exempt message yields in the current mode.
    exempt_result = 0 if sideeffect else []

    # Meta-messages are never checked.
    ctxt = msg.msgctxt
    is_meta = ctxt in ("EMAIL OF TRANSLATORS",) or (
        cat.name.endswith(".desktop") and ctxt in ("Keywords", "Query"))
    if is_meta:
        return exempt_result

    # Neither are messages carrying one of the special tags
    # in their auto comments.
    if any(_auto_cmnt_tag_rx.search(cmnt) for cmnt in msg.auto_comment):
        return exempt_result

    # Strip the sanctioned segments; original UI patterns rely on the tags
    # still being there, hence they go before the general patterns.
    rx_groups = [_no_check_lat_rxs]
    if origui:
        rx_groups.insert(0, _no_check_lat_origui_rxs)
    remainder = msgstr
    for rx_group in rx_groups:
        for rx in rx_group:
            remainder = rx.sub("", remainder)

    offences = list(_naked_latin_rx.finditer(remainder))

    if not sideeffect:
        # Hand the spans found in the stripped text to adapt_spans,
        # along with both versions of the text.
        found_spans = [offence.span() for offence in offences]
        return adapt_spans(msgstr, remainder, found_spans, merge=False)

    # Issue one warning per Latin segment that survived the stripping.
    for offence in offences:
        warning_on_msg(_("@info",
                         "Naked Latin segment '%(snippet)s'.",
                         snippet=offence.group(0)), msg, cat)
    return len(offences)
0125 
0126 
def naked_latin (msgstr, msg, cat):
    """
    Find Latin-script segments that appear outside of permitted contexts
    [type V3C hook].

    Contexts in which Latin text is tolerated:
      - content of certain XML-like tags, such as C{tt}, C{code},
        C{email}, C{envar}
      - XML-like entities, such as C{&foo;}
      - format directives introduced by the %-character
      - command line options, such as C{-o} or C{--foo-bar}
      - hexadecimal numbers introduced by C{0x}
      - extension filters, such as C{"*.png|ПНГ слике"}
      - alternatives directives {~@/.../.../}, such as C{~@/Делфин/Dolphin/}
      - wiki markup links, such as C{"...на [http://foo.org страни Фуа]"}
      - wiki markup templates, such as C{"{{note|Обавезно проверите...}}"}
      - explicitly wrapped text C{~!/.../}, such as C{"...наредбом ~!/grep/..."}

    @return: annotated spans
    """

    # Plain variant: no original UI allowance, spans instead of warnings.
    spans = _naked_latin_w(msgstr, msg, cat, origui=False, sideeffect=False)
    return spans
0148 
0149 
def naked_latin_origui (msgstr, msg, cat):
    """
    Variant of C{naked_latin} which also tolerates references to
    the original UI text, meant to be resolved automatically
    [type V3C hook].

    A reference to the original UI is recognized when it is:
      - enclosed in one of the XML-like tags C{gui*} (C{guilabel},
        C{guimenu}, etc.) or C{interface},
        as in C{"...кликните на <guibutton>Scramble Reactor</guibutton> да..."}
      - wrapped by hand with C{~%/.../},
        as in C{"...кликните на ~%/Scramble Reactor/ да..."}

    @return: annotated spans
    """

    # Same as the plain hook, with the original UI allowance switched on.
    spans = _naked_latin_w(msgstr, msg, cat, origui=True, sideeffect=False)
    return spans
0166 
0167 
def naked_latin_se (msgstr, msg, cat):
    """
    Variant of C{naked_latin} working by side effect: problems are
    issued as warnings instead of being returned as spans
    [type S3C hook].

    @return: number of errors
    """

    # Warnings are issued by the worker; only the count comes back.
    nerrors = _naked_latin_w(msgstr, msg, cat, origui=False, sideeffect=True)
    return nerrors
0177 
0178 
def naked_latin_origui_se (msgstr, msg, cat):
    """
    Variant of C{naked_latin_origui} working by side effect: problems are
    issued as warnings instead of being returned as spans
    [type S3C hook].

    @return: number of errors
    """

    # Original UI allowance on, warnings issued by the worker.
    nerrors = _naked_latin_w(msgstr, msg, cat, origui=True, sideeffect=True)
    return nerrors
0188