File indexing completed on 2024-05-12 05:47:02
# -*- coding: UTF-8 -*-

"""
Catch unofficial orthography forms in Norwegian Bokmål translation.

The check expects that the translation is plain text,
i.e. that any markup has been removed from it beforehand;
otherwise, problems masked by markup may not be reported.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import os
import re
import codecs

from pology import datadir, _, n_
from pology.fsops import collect_files_by_ext
from pology.split import split_text


def exclude_forms (dictnames):
    """
    Check for excluded orthography forms in translation [hook factory].

    The excluded phrases are loaded once, when the hook is created.
    If no phrases are loaded (e.g. C{dictnames} is empty),
    the resulting hook reports nothing.

    @param dictnames: base names of files from which to collect excluded forms;
        file paths will be assembled as
        C{<datadir>/lang/nb/exclusion/<dictname>.dat}
    @type dictnames: <string*>

    @return: type V3C hook
    @rtype: C{(msgstr, msg, cat) -> spans}
    """

    phrases = _load_phrases(dictnames)

    # Longest excluded phrase, in words; bounds the window tried by the hook.
    # Guard against an empty phrase set, on which max() would raise.
    word_counts = [len(split_text(x)[0]) for x in phrases]
    maxwords = max(word_counts) if word_counts else 0

    def hook (msgstr, msg, cat):

        spans = []

        # The indexing below reads interps[phstart + phlen] with
        # phstart + phlen up to len(words), i.e. it relies on split_text
        # returning one more intersection than words
        # (assumed to be leading, inner..., trailing -- confirm in split_text).
        words, interps = split_text(msgstr)
        for phstart in range(len(words)):
            # Try longer candidate phrases before shorter ones.
            for phlen in range(min(maxwords, len(words) - phstart), 0, -1):
                # Construct and test the following phrases:
                # - with inner and trailing intersections
                # - with leading and inner intersections
                # - with inner intersections
                for off1, off2 in ((1, 1), (0, 0), (1, 0)):
                    parts = []
                    if off1 == 0:
                        parts.append(interps[phstart])
                    parts.append(words[phstart])
                    for i in range(1, phlen):
                        parts.append(interps[phstart + i])
                        parts.append(words[phstart + i])
                    if off2 == 1:
                        parts.append(interps[phstart + phlen])

                    phrase = _normph("".join(parts))
                    if phrase in phrases:
                        # Character offsets of the match in msgstr:
                        # total length of all words and intersections
                        # preceding the start (p1) and the end (p2).
                        p1 = (  sum(map(len, words[:phstart]))
                              + sum(map(len, interps[:phstart + off1])))
                        p2 = (  sum(map(len, words[:phstart + phlen]))
                              + sum(map(len, interps[:phstart + phlen + off2])))
                        emsg = _("@info",
                                 "Excluded form '%(word)s'.",
                                 word=msgstr[p1:p2].strip())
                        spans.append((p1, p2, emsg))
                        # Only the remaining intersection variants
                        # for this window are skipped.
                        break

        return spans

    return hook


def _load_phrases (dictnames):
    """
    Collect normalized excluded phrases from the given dictionary files.

    Each file is read as UTF-8, one phrase per line.
    Lines which are empty after normalization are ignored.

    @param dictnames: base names of dictionary files under
        C{<datadir>/lang/nb/exclusion/}, without the C{.dat} extension
    @type dictnames: <string*>

    @return: normalized phrases
    @rtype: set of strings
    """

    phrases = set()

    for dictname in dictnames:
        exfile = os.path.join(datadir(), "lang", "nb", "exclusion",
                              dictname + ".dat")

        # Close the file deterministically instead of leaking the handle.
        with codecs.open(exfile, "r", "UTF-8") as ifl:
            lines = ifl.read().split("\n")

        # Do not blindly drop the last element: when the file lacks
        # a trailing newline, it is a real phrase. Empty lines
        # (including the one after a trailing newline) are skipped instead.
        for line in lines:
            phrase = _normph(line)
            if phrase:
                phrases.add(phrase)

    return phrases


# Any run of whitespace, including a single tab or newline, so that
# a phrase wrapped across lines compares equal to its one-line form.
_wsseq_rx = re.compile(r"\s+", re.U)

def _normph (phrase):
    """
    Normalize a phrase for comparison.

    The phrase is lowercased, stripped of surrounding whitespace,
    and every inner whitespace sequence is replaced by a single space.

    @param phrase: phrase to normalize
    @type phrase: string

    @return: normalized phrase
    @rtype: string
    """

    return _wsseq_rx.sub(r" ", phrase.lower().strip())