# File indexing completed on 2024-05-12 05:47:02

0001 # -*- coding: UTF-8 -*
0002 
0003 """
Catch unofficial orthography forms in Norwegian Bokmål translation.
0005 
0006 The check expects that the translation is plain text,
0007 i.e. that any markup has been removed from it beforehand;
0008 otherwise, problems masked by markup may not be reported.
0009 
0010 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0011 @license: GPLv3
0012 """
0013 
0014 import os
0015 import re
0016 import codecs
0017 
0018 from pology import datadir, _, n_
0019 from pology.fsops import collect_files_by_ext
0020 from pology.split import split_text
0021 
0022 
def exclude_forms (dictnames):
    """
    Check for excluded orthography forms in translation [hook factory].

    @param dictnames: base names of files from which to collect excluded forms;
        file paths will be assembled as
        C{<datadir>/lang/nb/exclusion/<dictname>.dat}
    @type dictnames: <string*>

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    phrases = _load_phrases(dictnames)
    # The longest excluded phrase (in words) bounds the sliding-window
    # size in the hook. default=0 keeps the hook a harmless no-op when
    # no phrases were loaded, instead of crashing on an empty sequence.
    maxwords = max((len(split_text(x)[0]) for x in phrases), default=0)

    def hook (msgstr, msg, cat):

        spans = []

        words, interps = split_text(msgstr)
        for phstart in range(len(words)):
            # Try longest candidate phrases first, so that the longest
            # matching excluded form wins at this starting position.
            for phlen in range(min(maxwords, len(words) - phstart), 0, -1):
                # Construct and test the following phrases:
                # - with inner and trailing intersections
                # - with leading and inner intersections
                # - with inner intersections
                for off1, off2 in ((1, 1), (0, 0), (1, 0)):
                    parts = []
                    if off1 == 0:
                        parts.append(interps[phstart])
                    parts.append(words[phstart])
                    for i in range(1, phlen):
                        parts.append(interps[phstart + i])
                        parts.append(words[phstart + i])
                    if off2 == 1:
                        parts.append(interps[phstart + phlen])

                    phrase = _normph("".join(parts))
                    if phrase in phrases:
                        # Character offsets of the matched phrase
                        # within the original msgstr.
                        p1 = (  sum(map(len, words[:phstart]))
                            + sum(map(len, interps[:phstart + off1])))
                        p2 = (  sum(map(len, words[:phstart + phlen]))
                            + sum(map(len, interps[:phstart + phlen + off2])))
                        emsg = _("@info",
                                "Excluded form '%(word)s'.",
                                word=msgstr[p1:p2].strip())
                        spans.append((p1, p2, emsg))
                        break

        return spans

    return hook
0076 
0077 
def _load_phrases (dictnames):
    """
    Collect the set of normalized excluded phrases from dictionary files.

    Each dictionary file is expected at
    C{<datadir>/lang/nb/exclusion/<dictname>.dat}, as UTF-8 text
    with one phrase per line (a trailing empty line is ignored).

    @param dictnames: base names of dictionary files
    @type dictnames: <string*>

    @return: normalized phrases
    @rtype: set(string)
    """

    phrases = set()

    for dictname in dictnames:
        exfile = os.path.join(datadir(), "lang", "nb", "exclusion",
                              dictname + ".dat")

        # Use a context manager so the file handle is closed
        # deterministically instead of being leaked.
        with codecs.open(exfile, "r", "UTF-8") as exf:
            phrases1 = exf.read().split("\n")[:-1]
        phrases.update(map(_normph, phrases1))

    return phrases
0091 
0092 
0093 _wsseq_rx = re.compile(r"\s{2,}", re.U)
0094 
0095 def _normph (phrase):
0096 
0097     return _wsseq_rx.sub(r" ", phrase.lower().strip())
0098