# File indexing completed on 2024-05-12 05:47:02

0001 # -*- coding: UTF-8 -*
0002 
0003 """
Catch unofficial orthography forms in Norwegian Bokmål translation.
0005 
0006 The check expects that the translation is plain text,
0007 i.e. that any markup has been removed from it beforehand;
0008 otherwise, problems masked by markup may not be reported.
0009 
0010 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0011 @license: GPLv3
0012 """
0013 
0014 import os
0015 import re
0016 import codecs
0017 
0018 from pology import datadir, _, n_
0019 from pology.fsops import collect_files_by_ext
0020 from pology.split import split_text
0021 
0022 
def exclude_forms (dictnames):
    """
    Check for excluded orthography forms in translation [hook factory].

    @param dictnames: base names of files from which to collect excluded forms;
        file paths will be assembled as
        C{<datadir>/lang/nb/exclusion/<dictname>.dat}
    @type dictnames: <string*>

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    phrases = _load_phrases(dictnames)
    # The longest excluded phrase (in words) bounds the sliding-window
    # size in the hook. default=0 keeps the hook a harmless no-op when
    # no phrases were loaded, instead of crashing on an empty sequence.
    maxwords = max((len(split_text(x)[0]) for x in phrases), default=0)

    def hook (msgstr, msg, cat):

        spans = []

        words, interps = split_text(msgstr)
        for phstart in range(len(words)):
            # Try longest candidate phrases first, so that the longest
            # matching excluded form wins at this starting position.
            for phlen in range(min(maxwords, len(words) - phstart), 0, -1):
                # Construct and test the following phrases:
                # - with inner and trailing intersections
                # - with leading and inner intersections
                # - with inner intersections
                for off1, off2 in ((1, 1), (0, 0), (1, 0)):
                    parts = []
                    if off1 == 0:
                        parts.append(interps[phstart])
                    parts.append(words[phstart])
                    for i in range(1, phlen):
                        parts.append(interps[phstart + i])
                        parts.append(words[phstart + i])
                    if off2 == 1:
                        parts.append(interps[phstart + phlen])

                    phrase = _normph("".join(parts))
                    if phrase in phrases:
                        # Character offsets of the matched phrase
                        # within the original msgstr.
                        p1 = (  sum(map(len, words[:phstart]))
                            + sum(map(len, interps[:phstart + off1])))
                        p2 = (  sum(map(len, words[:phstart + phlen]))
                            + sum(map(len, interps[:phstart + phlen + off2])))
                        emsg = _("@info",
                                "Excluded form '%(word)s'.",
                                word=msgstr[p1:p2].strip())
                        spans.append((p1, p2, emsg))
                        break

        return spans

    return hook
0076 
0077 
def _load_phrases (dictnames):
    """
    Collect the set of normalized excluded phrases from dictionary files.

    Each dictionary file is expected at
    C{<datadir>/lang/nb/exclusion/<dictname>.dat}, as UTF-8 text
    with one phrase per line (a trailing empty line is ignored).

    @param dictnames: base names of dictionary files
    @type dictnames: <string*>

    @return: normalized phrases
    @rtype: set(string)
    """

    phrases = set()

    for dictname in dictnames:
        exfile = os.path.join(datadir(), "lang", "nb", "exclusion",
                              dictname + ".dat")

        # Use a context manager so the file handle is closed
        # deterministically instead of being leaked.
        with codecs.open(exfile, "r", "UTF-8") as exf:
            phrases1 = exf.read().split("\n")[:-1]
        phrases.update(map(_normph, phrases1))

    return phrases
0091 
0092 
0093 _wsseq_rx = re.compile(r"\s{2,}", re.U)
0094 
0095 def _normph (phrase):
0096 
0097     return _wsseq_rx.sub(r" ", phrase.lower().strip())
0098