File indexing completed on 2024-04-14 05:37:50

0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Detect unwanted patterns in translation.
0005 
0006 @note: This module is deprecated.
0007 Use L{rules<pology.rules>} through C{check-rules} sieve instead.
0008 
0009 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0010 @license: GPLv3
0011 """
0012 
0013 import re
0014 import codecs
0015 
0016 from pology import _, n_
0017 from pology.comments import manc_parse_flag_list
0018 from pology.msgreport import report_on_msg, report_msg_content
0019 
0020 
def bad_patterns (rxmatch=False, casesens=True, patterns=None, fromfiles=None):
    """
    Detect unwanted patterns in text [hook factory].

    Patterns can be given both as list of strings, and as a list of file
    paths containing patterns (in each file: one pattern per line,
    strip leading and trailing whitespace, skip empty lines, #-comments).
    Patterns given directly come first, followed by those read from files,
    in the order of the files. Either source may be omitted.
    Each detected pattern is reported through C{report_on_msg}.

    If C{rxmatch} is C{False}, patterns are matched by plain substring search,
    otherwise as regular expressions.
    If C{casesens} is C{True}, matching is case sensitive.

    If the message has pipe flag C{no-bad-patterns}, check is skipped.

    @param rxmatch: whether to take pattern as regular expression
    @type rxmatch: bool
    @param casesens: whether the match should be case-sensitive
    @type casesens: bool
    @param patterns: patterns to match the text
    @type patterns: list of strings
    @param fromfiles: file paths from which to read patterns
    @type fromfiles: list of strings

    @return: type S3A hook
    @rtype: C{(text, msg, cat)->numerr}
    """

    patterns_str = list(patterns or [])
    # fromfiles defaults to None, which must behave like "no files".
    for fpath in fromfiles or []:
        patterns_str.extend(_load_patterns(fpath))

    patterns_cmp = _process_patterns(rxmatch=rxmatch, casesens=casesens,
                                     patterns=patterns_str)

    # For case-insensitive plain substring search _process_patterns
    # lower-cases the patterns, so the text has to be lower-cased as well
    # or only already-lowercase occurrences would ever be found.
    lower_text = not rxmatch and not casesens

    def hook (text, msg, cat):
        # Message explicitly opted out of this check.
        if _flag_no_bad_patterns in manc_parse_flag_list(msg, "|"):
            return 0

        if lower_text:
            text = text.lower()
        indspans = _match_patterns(text, patterns_cmp)
        for pind, _span in indspans:
            # Report the pattern as the user wrote it, not its compiled
            # or lower-cased form.
            pstr = patterns_str[pind]
            report_on_msg(_("@info",
                            "Bad pattern '%(pattern)s' detected.",
                            pattern=pstr), msg, cat)
        return len(indspans)

    return hook
0069 
0070 
def bad_patterns_msg (rxmatch=False, casesens=True,
                      patterns=None, fromfiles=None):
    """
    Detect unwanted patterns in translation [hook factory].

    Takes the same arguments as L{bad_patterns}, but the resulting hook
    examines every C{msgstr} field of the message instead of a single
    text, and reports on the message as a whole.

    @return: type S4A hook
    @rtype: C{(msg, cat)->numerr}
    """

    # Counting/reporting mode of the shared worker (no parts returned).
    hook = _bad_patterns_msg_w(rxmatch, casesens, patterns, fromfiles,
                               partrep=False)
    return hook
0084 
0085 
def bad_patterns_msg_sp (rxmatch=False, casesens=True,
                         patterns=None, fromfiles=None):
    """
    Detect unwanted patterns in translation, report spans [hook factory].

    Takes the same arguments as L{bad_patterns_msg}, but the resulting hook
    hands back the list of offending parts to the caller rather than
    reporting them itself.

    @return: type V4A hook
    @rtype: C{(msg, cat)->parts}
    """

    # Parts-returning mode of the shared worker.
    hook = _bad_patterns_msg_w(rxmatch, casesens, patterns, fromfiles,
                               partrep=True)
    return hook
0098 
0099 
# Worker for bad_patterns_msg* hooks.
def _bad_patterns_msg_w (rxmatch, casesens, patterns, fromfiles, partrep):
    """
    Build the hook shared by L{bad_patterns_msg} and L{bad_patterns_msg_sp}.

    @param rxmatch: whether to take pattern as regular expression
    @type rxmatch: bool
    @param casesens: whether the match should be case-sensitive
    @type casesens: bool
    @param patterns: patterns to match the text (may be C{None})
    @type patterns: list of strings
    @param fromfiles: file paths from which to read patterns (may be C{None})
    @type fromfiles: list of strings
    @param partrep: if C{True} the hook returns the list of parts,
        otherwise it reports through C{report_msg_content} and returns
        the number of detected patterns
    @type partrep: bool

    @return: hook taking C{(msg, cat)}
    """

    patterns_str = list(patterns or [])
    for fpath in fromfiles or []:
        patterns_str.extend(_load_patterns(fpath))

    patterns_cmp = _process_patterns(rxmatch=rxmatch, casesens=casesens,
                                     patterns=patterns_str)

    # For case-insensitive plain substring search _process_patterns
    # lower-cases the patterns, so the text has to be lower-cased as well
    # or only already-lowercase occurrences would ever be found.
    lower_text = not rxmatch and not casesens

    def hook (msg, cat):
        # Message explicitly opted out of this check; keep the return
        # type consistent with the mode (parts list vs. error count).
        if _flag_no_bad_patterns in manc_parse_flag_list(msg, "|"):
            return [] if partrep else 0

        parts = []
        nbad = 0
        for i, msgstr in enumerate(msg.msgstr):
            # NOTE: spans are computed on the lower-cased text; they line
            # up with the original except for the rare characters whose
            # lower-case form has a different length.
            text = msgstr.lower() if lower_text else msgstr
            spans = []
            for pind, span in _match_patterns(text, patterns_cmp):
                emsg = _("@info",
                         "Bad pattern '%(pattern)s' detected.",
                         pattern=patterns_str[pind])
                # Each highlight entry is (start, end, message).
                spans.append(span + (emsg,))
            nbad += len(spans)
            if spans:
                parts.append(("msgstr", i, spans))

        if partrep:
            return parts

        if parts:
            report_msg_content(msg, cat, highlight=parts, delim=("-" * 20))
        return nbad

    return hook
0136 
0137 
# Pipe flag used to manually prevent matching for a particular message.
# The hooks built in this module return early, without matching anything,
# when this flag is among those returned by manc_parse_flag_list(msg, "|").
_flag_no_bad_patterns = "no-bad-patterns"
0140 
0141 
# Load pattern strings from the file:
# one pattern per non-empty line in the file,
# leading and trailing whitespace stripped,
# #-comments possible.
def _load_patterns (filepath):
    """
    Read the list of pattern strings from a UTF-8 encoded file.

    Everything from the first C{#} to the end of a line is discarded,
    the remainder is stripped of surrounding whitespace, and lines that
    end up empty are skipped.

    @param filepath: path of the file to read
    @type filepath: string

    @return: patterns in the order they appear in the file
    @rtype: list of strings
    """

    rem_cmnt_rx = re.compile(r"#.*")
    patterns = []
    # The context manager makes sure the file is closed,
    # also when reading or decoding fails half way through.
    with codecs.open(filepath, "r", "UTF-8") as ifl:
        for line in ifl:
            line = rem_cmnt_rx.sub("", line).strip()
            if line:
                patterns.append(line)

    return patterns
0158 
0159 
# Prepare the given pattern strings for matching.
# With rxmatch set, every pattern is compiled into a regex (always with
# re.U, and additionally with re.I when casesens is False).
# Without rxmatch, patterns stay plain strings; they are lower-cased
# when casesens is False and passed through unchanged otherwise.
def _process_patterns (patterns, rxmatch=False, casesens=True):

    if rxmatch:
        flags = re.U if casesens else re.U | re.I
        return [re.compile(pstr, flags) for pstr in patterns]

    if casesens:
        # Still hand back a fresh list, never the caller's own object.
        return list(patterns)

    return [pstr.lower() for pstr in patterns]
0181 
0182 
# Run every pattern in the list against the text.
# A plain string pattern is looked up by substring search,
# anything else is treated as a compiled regular expression.
# For each pattern only its first occurrence is considered.
# Returns a list of (pattern_index, (start, end)) tuples,
# one for each pattern that matched, in pattern order.
def _match_patterns (text, patterns):

    hits = []
    for index, pat in enumerate(patterns):
        if isinstance(pat, str):
            start = text.find(pat)
            found = (start, start + len(pat)) if start >= 0 else None
        else:
            match = pat.search(text)
            found = match.span() if match else None

        if found:
            hits.append((index, found))

    return hits
0207