# File indexing completed on 2024-10-27 11:34:21
# -*- coding: utf-8 -*-

"""
Spell-check translation using Enchant (U{http://www.abisource.com/projects/enchant/}).

Documented in C{doc/user/sieving.docbook}.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import codecs
from functools import cmp_to_key
from locale import strcoll
import os
from pathlib import Path
import re
import tempfile
from time import strftime

from pology import PologyError, datadir, _, n_
from pology.spell import flag_no_check_spell, elist_well_spelled
from pology.colors import cjoin
from pology.comments import manc_parse_list, manc_parse_flag_list
import pology.config as cfg
from pology.getfunc import get_hook_ireq
from pology.msgreport import report_on_msg
from pology.msgreport import report_msg_to_lokalize
from pology.msgreport import spell_xml_error
from pology.report import report, warning, format_item_list
from pology.sieve import SieveError, SieveCatalogError
from pology.split import proper_words
from pology.sieve import add_param_spellcheck, add_param_poeditors


def setup_sieve (p):
    """
    Declare the description and the parameters of this sieve.

    Adds the C{provider} and C{xml} parameters directly and delegates
    the rest to C{add_param_spellcheck}.

    @param p: sieve parameter collector; only its C{set_desc} and
        C{add_param} methods are used here
    """

    p.set_desc(_("@info sieve description",
    "Spell-check translation using Enchant."
    ))

    p.add_param("provider", str, seplist=True,
                metavar=_("@info sieve parameter value placeholder", "NAME"),
                desc=_("@info sieve parameter description",
    "The spell-checking provider to use. "
    "Several provider can be given as comma-separated list."
    ))

    add_param_spellcheck(p)

    p.add_param("xml", str,
                metavar=_("@info sieve parameter value placeholder", "FILE"),
                desc=_("@info sieve parameter description",
    "Build XML report file at given path."
    ))


class Sieve (object):
    """
    Sieve which reports words in translations unknown to Enchant.

    C{process_header} selects (and lazily creates) the checker for
    the catalog at hand, C{process} checks one message, and
    C{finalize} prints the summary and closes the XML report, if any.
    """

    def __init__ (self, params):
        """
        Read sieve parameters and the C{enchant} configuration section.

        Explicit parameters take precedence over configuration fields
        for provider, language and environment.

        @param params: parsed sieve parameters; attributes read here are
            C{provider}, C{lang}, C{env}, C{accel}, C{markup}, C{skip},
            C{case}, C{filter}, C{suponly}, C{list}, C{lokalize}, C{xml}
        """

        cfgs = cfg.section("enchant")

        self.providers = (   ",".join(params.provider or "")
                          or cfgs.string("provider")
                          or None)

        self.lang = (   params.lang
                     or cfgs.string("language")
                     or None)

        self.envs = params.env
        if self.envs is None and cfgs.string("environment") is not None:
            self.envs = cfgs.string("environment").split(",")
        if self.envs is None:
            self.envs = []
        self.envs = [x.strip() for x in self.envs]

        self.accel = params.accel

        self.markup = params.markup

        self.skip_rx = None
        if params.skip is not None:
            flags = re.U
            if not params.case:
                flags |= re.I
            self.skip_rx = re.compile(params.skip, flags)

        self.pfilters = [[get_hook_ireq(x, abort=True), x]
                         for x in (params.filter or [])]

        self.suponly = params.suponly

        self.words_only = params.list
        self.lokalize = params.lokalize

        # File we are processing
        self.filename = ""
        # File used for the XML output, if requested
        self.xmlFile = None

        # Langenv-dependent elements built along the way.
        self.checkers = {}
        self.word_lists = {}

        # Tracking of unknown words.
        self.unknown_words = set()

        # Indicators to the caller:
        self.caller_sync = False # no need to sync catalogs
        self.caller_monitored = False # no need for monitored messages

        if params.xml:
            try:
                # TODO: create nice api to manage xml file in rules.py
                # The encoding is given explicitly so that the bytes
                # written agree with the XML declaration below,
                # whatever the locale's preferred encoding is.
                self.xmlFile = Path(params.xml).open("w", encoding="utf-8")
                self.xmlFile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                self.xmlFile.write('<pos date="%s">\n' % strftime('%c'))
            except Exception as exc:
                # The file may have been opened before the failure;
                # drop it so that XML output is really disabled.
                if self.xmlFile is not None:
                    try:
                        self.xmlFile.close()
                    except OSError:
                        pass
                    self.xmlFile = None
                warning(_("@info",
                          "Cannot open file '%(file)s': %(ex)s. XML output "
                          "disabled.", file=params.xml, ex=exc))


    def process_header (self, hdr, cat):
        """
        Select the spell-checker for the catalog and open its XML section.

        Checkers and word lists are cached per (language, environments)
        pair, so they are created only once per such pair.

        @param hdr: catalog header (not used here)
        @param cat: catalog being processed
        @raise SieveCatalogError: if the language cannot be determined
        @raise SieveError: if no checker could be created
        """

        # Check if the catalog itself states the language, and if yes,
        # create the language-dependent stuff if not already created
        # for this langenv.
        clang = self.lang or cat.language()
        if not clang:
            raise SieveCatalogError(
                _("@info",
                  "Cannot determine language for catalog '%(file)s'.",
                  file=cat.filename))
        cenvs = self.envs or cat.environment() or []
        ckey = (clang, tuple(cenvs))
        if ckey not in self.checkers:
            # Get Pology's internal word list for this langenv.
            # The cache is keyed by langenv, so that is what must be
            # looked up (not the bare language code).
            if ckey not in self.word_lists: # may be in but None
                self.word_lists[ckey] = _compose_word_list(clang, cenvs)
            # Create spell-checker object.
            # With suponly, no system dictionary is requested at all.
            clang_mod = None if self.suponly else clang
            checker = _create_checker(self.providers, clang_mod,
                                      self.word_lists[ckey])
            if not checker:
                raise SieveError(
                    _("@info",
                      "No spelling dictionary for language '%(lang)s' and "
                      "provider '%(prov)s'.",
                      lang=clang, prov=self.providers))
            self.checkers[ckey] = checker

        # Get language-dependent stuff.
        self.checker = self.checkers[ckey]

        # Force explicitly given accelerators and markup.
        if self.accel is not None:
            cat.set_accelerator(self.accel)
        if self.markup is not None:
            cat.set_markup(self.markup)

        # Close previous/open new XML section.
        if self.xmlFile:
            filename = Path(cat.filename).name
            # Close previous PO.
            if self.filename != "":
                self.xmlFile.write("</po>\n")
            self.filename = filename
            # Open new PO.
            poTag='<po name="%s">\n' % filename
            self.xmlFile.write(poTag) # Write to result


    def process (self, msg, cat):
        """
        Spell-check all translation texts of one message.

        Unknown words are collected in C{self.unknown_words}; unless only
        the word list was requested, each one is also reported on the
        message (and into the XML report, if open). With C{lokalize} set,
        the message is additionally reported to Lokalize.

        @param msg: message to check; skipped if not translated or if
            it carries the no-spell-check flag
        @param cat: catalog to which the message belongs
        @raise SieveError: if a precheck filter cannot be executed
        """

        if not msg.translated:
            return

        # Skip message if explicitly requested.
        # The flag is a property of the message as a whole, so it is
        # looked up once instead of once per msgstr.
        if flag_no_check_spell in manc_parse_flag_list(msg, "|"):
            return

        # Words explicitly listed as good in the message comments.
        locally_ignored = manc_parse_list(msg, elist_well_spelled, ",")

        # Suggestions are needed for any kind of per-message report.
        need_suggs = not self.words_only or self.lokalize

        # Entries are (word, suggestions, suggestions-were-truncated).
        failed_w_suggs = []

        for msgstr_cnt, msgstr in enumerate(msg.msgstr):

            # Apply precheck filters.
            for pfilter, pfname in self.pfilters:
                try: # try as type F1A hook
                    msgstr = pfilter(msgstr)
                except TypeError:
                    try: # try as type F3* hook
                        msgstr = pfilter(msgstr, msg, cat)
                    except TypeError:
                        raise SieveError(
                            _("@info",
                              "Cannot execute filter '%(filt)s'.",
                              filt=pfname))

            # Split text into words.
            # TODO: See to use markup types somehow.
            words = proper_words(msgstr, True, cat.accelerator(), msg.format)

            # Eliminate from checking words matching the skip regex.
            if self.skip_rx:
                words = [x for x in words if not self.skip_rx.search(x)]

            # Eliminate from checking words explicitly listed as good.
            words = [x for x in words if x not in locally_ignored]

            for word in words:
                if self.checker.check(word):
                    continue
                self.unknown_words.add(word)

                if need_suggs:
                    suggs = self.checker.suggest(word)
                    incmp = False
                    if len(suggs) > 5: # do not put out too many words
                        suggs = suggs[:5]
                        incmp = True
                    # Keep the truncation indicator with the word, so
                    # that the Lokalize report below marks exactly
                    # those lists which were cut.
                    failed_w_suggs.append((word, suggs, incmp))

                if not self.words_only:
                    if self.xmlFile:
                        xmlError = spell_xml_error(msg, cat, word, suggs,
                                                   msgstr_cnt)
                        self.xmlFile.writelines(xmlError)

                    if suggs:
                        fsuggs = format_item_list(suggs, incmp=incmp)
                        report_on_msg(_("@info",
                                        "Unknown word '%(word)s' "
                                        "(suggestions: %(wordlist)s).",
                                        word=word, wordlist=fsuggs),
                                      msg, cat)
                    else:
                        report_on_msg(_("@info",
                                        "Unknown word '%(word)s'.",
                                        word=word),
                                      msg, cat)

        if self.lokalize and failed_w_suggs:
            repls = [_("@label", "Spelling errors:")]
            for word, suggs, incmp in failed_w_suggs:
                if suggs:
                    fmtsuggs=format_item_list(suggs, incmp=incmp)
                    repls.append(_("@item",
                                   "%(word)s (suggestions: %(wordlist)s)",
                                   word=word, wordlist=fmtsuggs))
                else:
                    repls.append("%s" % (word))
            report_msg_to_lokalize(msg, cat, cjoin(repls, "\n"))


    def finalize (self):
        """
        Report the summary of unknown words and close the XML report.

        In word-list mode the collected words are printed one per line,
        sorted by locale collation; otherwise only their count is reported.
        """

        if self.unknown_words:
            if not self.words_only:
                nwords = len(self.unknown_words)
                msg = n_("@info:progress",
                         "Encountered %(num)d unknown word.",
                         "Encountered %(num)d unknown words.",
                         num=nwords)
                report("===== " + msg)
            else:
                wlist = list(self.unknown_words)
                wlist.sort(key=cmp_to_key(strcoll))
                report("\n".join(wlist))

        if self.xmlFile:
            try:
                # Close the last PO section only if one was opened,
                # otherwise the report would have an unmatched end tag.
                if self.filename != "":
                    self.xmlFile.write("</po>\n")
                self.xmlFile.write("</pos>\n")
            finally:
                self.xmlFile.close()
                self.xmlFile = None
# Get checker object from Enchant.
def _create_checker (providers, langtag, words):
    """
    Create an Enchant checker and add supplemental words to its session.

    @param providers: provider ordering handed to the Enchant broker,
        or C{None} to leave the ordering alone
    @param langtag: language of the dictionary to request; if C{None},
        a personal-word-list dictionary on a temporary path is
        requested instead
    @param words: words to add to the checker session (may be C{None})
    @return: the checker, or C{None} if the dictionary for C{langtag}
        could not be obtained
    @raise PologyError: if the C{enchant} module cannot be imported
    """

    try:
        import enchant
    except ImportError:
        pkgs = ["python-enchant"]
        raise PologyError(_("@info",
                            "Python wrapper for Enchant not found, "
                            "please install it (possible package names: "
                            "%(pkglist)s).",
                            pkglist=format_item_list(pkgs)))

    if langtag is not None:
        try:
            broker = enchant.Broker()
            if providers is not None:
                broker.set_ordering(langtag, providers)
            checker = broker.request_dict(langtag)
            # Probe the checker once, so that an unusable one fails here.
            checker.check(".")
        except Exception:
            # Any failure means "no dictionary"; the caller reports it.
            # Not a bare except, so that KeyboardInterrupt and SystemExit
            # still get through.
            checker = None
    else:
        # Only the name of the temporary file is wanted: closing
        # the file right away removes it again.
        tmpf = tempfile.NamedTemporaryFile()
        tmpf.close()
        checker = enchant.request_pwl_dict(tmpf.name)
        os.unlink(tmpf.name)

    if checker:
        pname = checker.provider.name.split()[0].lower()
        # NOTE(review): presumably these providers do not derive
        # capitalized forms from lowercase session words -- confirm.
        need_upcasing = (pname in ("personal", "myspell"))
        for word in words or []:
            checker.add_to_session(word)
            if need_upcasing:
                checker.add_to_session(word[0].upper() + word[1:])
                checker.add_to_session(word.upper())
    return checker


# Collect words from all internal word lists
# available for given language+environment.
def _compose_word_list (lang, envs):
    """
    Read all word list files applicable to the language and environments.

    @param lang: language code
    @param envs: environments; if empty or C{None}, only the
        environment-agnostic lists are collected
    @return: list of words, in sorted order of the files they came from
    """

    # Collect all applicable word list files.
    wlist_files = set()
    for env in (envs or [""]):
        wlist_files.update(_get_word_list_files(lang, env))

    # Read words.
    words = []
    for wlist_file in sorted(wlist_files):
        words.extend(_read_wlist_aspell(wlist_file))
    return words


def _get_word_list_files (lang, env):
    """
    Collect paths of C{*.aspell} files for the language and environment.

    The search starts in the environment's subdirectory of
    C{<datadir>/lang/<lang>/spell} and continues through its parent
    directories up to and including that spell root.

    @param lang: language code
    @param env: environment name, may be empty or C{None}
    @return: set of file paths
    """

    # Collect word list paths.
    wlist_files = set()
    spell_root = os.path.join(datadir(), "lang", lang, "spell")
    spell_subdir = os.path.join(".", (env or ""))
    # dirname() of "." is the empty string, which ends the climb.
    while spell_subdir:
        spell_dir = os.path.join(spell_root, spell_subdir)
        if os.path.isdir(spell_dir):
            for item in os.listdir(spell_dir):
                if item.endswith(".aspell"):
                    wlist_files.add(os.path.join(spell_dir, item))
        spell_subdir = os.path.dirname(spell_subdir)
    return wlist_files


# Read words from an Aspell word list.
def _read_wlist_aspell (fname):
    """
    Read words from an Aspell word list file.

    The first line is the header; its fourth whitespace-separated field
    names the encoding of the file. The header itself is never part
    of the result, whatever the encoding.

    @param fname: path to the word list file
    @return: list of non-empty, whitespace-stripped lines after the header;
        empty list (after issuing a warning) if the header is malformed
    """

    defenc = "UTF-8"

    # The file is read as bytes, because the encoding of the content
    # is known only after the header has been parsed.
    with open(fname, "rb") as fl:
        # Parse the header for encoding.
        # Undecodable bytes are replaced rather than raised on,
        # since only the header's field structure matters here.
        header = fl.readline().decode(defenc, "replace")
        m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
        if not m:
            warning(_("@info",
                      "Malformed header in dictionary file '%(file)s'.",
                      file=fname))
            return []
        enc = m.group(4)
        # The header line is consumed by now, so only the words remain.
        text = fl.read().decode(enc)

    # Read words.
    words = []
    for line in text.splitlines():
        word = line.strip()
        if word:
            words.append(word)
    return words