File indexing completed on 2024-10-27 08:25:07
# -*- coding: UTF-8 -*-

"""
Spell-check translation using GNU Aspell (U{http://aspell.net/}).

Documented in C{doc/user/sieving.docbook}.

@author: Sébastien Renard <sebastien.renard@digitalfox.org>
@license: GPLv3
"""

from codecs import open
from functools import cmp_to_key
import locale
import os
from os.path import abspath, basename, dirname, isfile, isdir, join
import re
import sys
from time import strftime

from pology import datadir, _, n_
from pology.spell import flag_no_check_spell, elist_well_spelled
from pology.colors import cjoin
from pology.comments import manc_parse_list, manc_parse_flag_list
import pology.config as cfg
from pology.getfunc import get_hook_ireq
from pology.msgreport import spell_error, spell_xml_error
from pology.msgreport import report_msg_to_lokalize
from pology.report import report, warning, format_item_list
from pology.sieve import SieveError, SieveCatalogError
from pology.split import proper_words
from pology.sieve import add_param_spellcheck


def setup_sieve (p):
    """
    Declare the description and the parameters of this sieve on C{p}.

    Besides the common spell-checking parameters added by
    C{add_param_spellcheck}, this registers C{enc}, C{var}, C{xml}
    and C{simsp}.

    @param p: sieve parameter collector on which C{set_desc} and
        C{add_param} are called
    """

    p.set_desc(_("@info sieve discription",
    "Spell-check translation using Aspell."
    ))

    add_param_spellcheck(p)

    p.add_param("enc", str,
                metavar=_("@info sieve parameter value placeholder",
                          "ENCODING"),
                desc=_("@info sieve parameter discription",
    "Encoding for text sent to Aspell."
    ))
    p.add_param("var", str,
                metavar=_("@info sieve parameter value placeholder",
                          "VARIETY"),
                desc=_("@info sieve parameter discription",
    "Variety of the Aspell dictionary."
    ))
    p.add_param("xml", str,
                metavar=_("@info sieve parameter value placeholder", "FILE"),
                desc=_("@info sieve parameter discription",
    "Build XML report file at given path."
    ))
    p.add_param("simsp", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Split text into words in a simpler way (deprecated,)."
    ))


class Sieve (object):
    """Process messages through the Aspell spell checker"""

    def __init__ (self, params):
        """
        Collect checker options from sieve parameters and configuration.

        Language, encoding, variety, environment, C{supplements-only} and
        C{simple-split} fall back to the C{aspell} configuration section
        when not given through C{params}.

        @param params: parsed sieve parameters (the attributes read are
            C{lang}, C{enc}, C{var}, C{list}, C{xml}, C{accel}, C{markup},
            C{skip}, C{case}, C{filter}, C{env}, C{suponly}, C{simsp}
            and C{lokalize})
        """

        self.nmatch = 0 # Number of match for finalize
        self.filename = "" # File name we are processing
        self.xmlFile = None # File handle to write XML output

        # Build Aspell options.
        self.aspellOptions = {}

        # - assume markup in messages (provide option to disable?)
        self.aspellOptions["mode"] = "sgml"
        # FIXME: In fact not needed? The words are sent parsed to checker.

        self.lang = params.lang
        self.encoding = params.enc
        self.variety = params.var

        cfgs = cfg.section("aspell")
        if not self.lang:
            self.lang = cfgs.string("language")
        if not self.encoding:
            self.encoding = cfgs.string("encoding")
        if not self.variety:
            self.variety = cfgs.string("variety")

        # The locale may report no encoding at all; without a fallback
        # every .encode(self.loc_encoding) below would raise TypeError.
        self.loc_encoding = locale.getlocale()[1] or "UTF-8"
        if not self.encoding:
            self.encoding = self.loc_encoding

        # Normalize the *selected* encoding (not the locale one), so that
        # an explicitly requested encoding is not silently discarded.
        self.encoding = self._encoding_for_aspell(self.encoding)
        self.aspellOptions["lang"] = (
            self.lang.encode(self.loc_encoding) if self.lang else None)
        self.aspellOptions["encoding"] = self.encoding.encode(self.loc_encoding)
        if self.variety:
            self.aspellOptions["variety"] = (
                self.variety.encode(self.loc_encoding))

        # If not None, only the list of faulty words is displayed
        # (to ease copy/paste into personal dictionary).
        self.unknownWords = set() if params.list else None

        if params.xml:
            xmlPath = params.xml
            if os.access(dirname(abspath(xmlPath)), os.W_OK):
                #TODO: create nice api to manage xml file and move it to rules.py
                self.xmlFile = open(xmlPath, "w", "utf-8")
                self.xmlFile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                self.xmlFile.write('<pos date="%s">\n' % strftime('%c'))
            else:
                warning(_("@info",
                          "Cannot open file '%(file)s'. XML output disabled.",
                          file=xmlPath))

        self.accel = params.accel
        self.markup = params.markup

        self.skipRx = None
        if params.skip:
            flags = re.U
            if not params.case:
                flags |= re.I
            self.skipRx = re.compile(params.skip, flags)

        self.pfilters = [[get_hook_ireq(x, abort=True), x]
                         for x in (params.filter or [])]

        # Environments: the parameter wins over the configuration field.
        envs = params.env
        if envs is None:
            cfgenvs = cfgs.string("environment")
            if cfgenvs is not None:
                envs = cfgenvs.split(",")
        self.envs = [x.strip() for x in (envs or [])]

        self.suponly = params.suponly
        if not self.suponly:
            self.suponly = cfgs.boolean("supplements-only", False)

        # NOTE: Temporary hack, remove when word splitting becomes smarter.
        self.simsp = params.simsp
        if not self.simsp:
            self.simsp = cfgs.boolean("simple-split", False)

        self.lokalize = params.lokalize

        # Language-dependent elements built along the way.
        self.aspells = {}
        self.ignoredContexts = {}
        self.personalDicts = {}
        self.tmpDictFiles = {}

        # Indicators to the caller:
        self.caller_sync = False # no need to sync catalogs
        self.caller_monitored = False # no need for monitored messages


    def process_header (self, hdr, cat):
        """
        Select (creating on first use) the checker for the catalog.

        Checkers, personal dictionaries and ignored-context lists are
        cached per (language, environments) combination.

        @param hdr: catalog header (not used here)
        @param cat: catalog about to be processed
        @raise SieveCatalogError: if no language can be determined
        @raise SieveError: if the checker cannot be created
        """

        # Check if the catalog itself states the language, and if yes,
        # create the language-dependent stuff if not already created
        # for this language.
        clang = self.lang or cat.language()
        if not clang:
            raise SieveCatalogError(
                _("@info",
                  "Cannot determine language for catalog '%(file)s'.",
                  file=cat.filename))
        cenvs = self.envs or cat.environment() or []
        ckey = (clang, tuple(cenvs))
        if ckey not in self.aspells:
            # New language.
            self.aspellOptions["lang"] = clang.encode(self.loc_encoding)

            # Get Pology's internal personal dictionary for this langenv.
            if ckey not in self.personalDicts: # may be in but None
                self.personalDicts[ckey] = self._get_personal_dict(clang, cenvs)
            personalDict = self.personalDicts[ckey]
            if personalDict:
                self.aspellOptions["personal-path"] = (
                    personalDict.encode(self.loc_encoding))
            else:
                self.aspellOptions.pop("personal-path", None) # remove previous

            if not self.suponly:
                # Create Aspell object.
                import pology.external.pyaspell as A
                try:
                    self.aspells[ckey] = A.Aspell(list(self.aspellOptions.items()))
                except A.AspellConfigError as e:
                    raise SieveError(
                        _("@info",
                          "Aspell configuration error:\n%(msg)s",
                          msg=e))
                except A.AspellError as e:
                    raise SieveError(
                        _("@info",
                          "Cannot initialize Aspell:\n%(msg)s",
                          msg=e))
            else:
                # Create simple internal checker that only checks against
                # internal supplemental dictionaries.
                if not personalDict:
                    raise SieveError(_("@info",
                                       "No supplemental dictionaries found."))
                self.aspells[ckey] = _QuasiSpell(personalDict, self.encoding)

            # Load list of contexts by which to ignore messages.
            ignoredContexts = []
            ignoredContextFile = join(datadir(), "lang", clang, "spell",
                                      "ignoredContext")
            if isfile(ignoredContextFile):
                # Context manager so that the handle does not stay open.
                with open(ignoredContextFile, "r", "utf-8") as ctxFile:
                    for line in ctxFile:
                        line = line.strip()
                        if line and not line.startswith("#"):
                            ignoredContexts.append(line.lower())
            self.ignoredContexts[ckey] = ignoredContexts

        # Get language-dependent stuff.
        self.aspell = self.aspells[ckey]
        self.ignoredContext = self.ignoredContexts[ckey]

        # Force explicitly given accelerators and markup.
        if self.accel is not None:
            cat.set_accelerator(self.accel)
        if self.markup is not None:
            cat.set_markup(self.markup)

        # Close previous/open new XML section.
        if self.xmlFile:
            filename = os.path.basename(cat.filename)
            # Close previous PO.
            if self.filename != "":
                self.xmlFile.write("</po>\n")
            self.filename = filename
            # Open new PO.
            poTag = '<po name="%s">\n' % filename
            self.xmlFile.write(poTag) # Write to result


    def process (self, msg, cat):
        """
        Spell-check all C{msgstr} fields of a translated message.

        Unknown words are either collected (list mode), written to the
        XML report, or reported through C{spell_error}; they may also be
        sent to Lokalize.

        @param msg: message to check
        @param cat: catalog to which the message belongs
        @raise SieveError: if a precheck filter cannot be executed
        """

        if not msg.translated:
            return

        # The following exclusions depend on the message only, not on the
        # particular msgstr, so they are decided once up front.

        # Skip message with context in the ignoredContext list.
        if self._has_ignored_context(msg):
            return

        # Skip message if explicitly requested.
        if flag_no_check_spell in manc_parse_flag_list(msg, "|"):
            return

        # Words explicitly listed as good in manual comments.
        locally_ignored = manc_parse_list(msg, elist_well_spelled, ",")

        failedSuggs = [] # pairs of wrong words and suggestions

        # The index counts msgstr plural forms.
        for msgstrIndex, msgstr in enumerate(msg.msgstr):

            # Apply precheck filters.
            for pfilter, pfname in self.pfilters:
                try: # try as type F1A hook
                    msgstr = pfilter(msgstr)
                except TypeError:
                    try: # try as type F3* hook
                        msgstr = pfilter(msgstr, msg, cat)
                    except TypeError:
                        raise SieveError(
                            _("@info",
                              "Cannot execute filter '%(filt)s'.",
                              filt=pfname))

            # Split text into words.
            if not self.simsp:
                words = proper_words(msgstr, True, cat.accelerator(), msg.format)
            else:
                # NOTE: Temporary, remove when proper_words becomes smarter.
                words = msgstr.split()

            # Eliminate from checking words matching the skip regex.
            if self.skipRx:
                words = [x for x in words if not self.skipRx.search(x)]

            # Eliminate from checking words explicitly listed as good.
            words = [x for x in words if x not in locally_ignored]

            for word in words:
                try:
                    # Encode word for Aspell. This is the call that raises
                    # UnicodeEncodeError for words not representable in
                    # the selected encoding, so it must be inside the try.
                    encodedWord = word.encode(self.encoding)
                    if self.aspell.check(encodedWord) is not False:
                        continue
                    self.nmatch += 1
                    if self.unknownWords is not None:
                        self.unknownWords.add(word)
                        continue
                    encodedSuggestions = self.aspell.suggest(encodedWord)
                    suggestions = [i.decode(self.encoding)
                                   for i in encodedSuggestions]
                    failedSuggs.append((word, suggestions))
                    if self.xmlFile:
                        xmlError = spell_xml_error(msg, cat, word,
                                                   suggestions, msgstrIndex)
                        self.xmlFile.writelines(xmlError)
                    else:
                        spell_error(msg, cat, word, suggestions)
                except UnicodeEncodeError:
                    warning(_("@info",
                              "Cannot encode word '%(word)s' in "
                              "selected encoding '%(enc)s'.",
                              word=word, enc=self.encoding))

        if failedSuggs and self.lokalize:
            repls = [_("@label", "Spelling errors:")]
            for word, suggs in failedSuggs:
                if suggs:
                    fmtsuggs = format_item_list(suggs)
                    repls.append(_("@item",
                                   "%(word)s (suggestions: %(wordlist)s)",
                                   word=word, wordlist=fmtsuggs))
                else:
                    repls.append("%s" % (word))
            report_msg_to_lokalize(msg, cat, cjoin(repls, "\n"))


    def finalize (self):
        """
        Remove temporary dictionaries, report totals, close the XML report.
        """

        # Remove composited personal dictionaries.
        for tmpDictFile in list(self.tmpDictFiles.values()):
            if isfile(tmpDictFile):
                os.unlink(tmpDictFile)

        if self.unknownWords is not None:
            slist = list(self.unknownWords)
            if slist:
                # list.sort() takes the comparison only through key=
                # on Python 3; cmp_to_key keeps the locale collation.
                slist.sort(key=cmp_to_key(
                    lambda x, y: locale.strcoll(x.lower(), y.lower())))
                report("\n".join(slist))
        else:
            if self.nmatch:
                msg = n_("@info:progress",
                         "Encountered %(num)d unknown word.",
                         "Encountered %(num)d unknown words.",
                         num=self.nmatch)
                report("===== " + msg)

        if self.xmlFile:
            # A <po> element is open only if some catalog was processed.
            if self.filename != "":
                self.xmlFile.write("</po>\n")
            self.xmlFile.write("</pos>\n")
            self.xmlFile.close()


    def _has_ignored_context (self, msg):
        # Whether any ignored context appears in the message context
        # or in one of its automatic comments (case-insensitively).

        msgctxt = (msg.msgctxt or "").lower()
        for context in self.ignoredContext:
            if context in msgctxt:
                return True
            for comment in msg.auto_comment:
                if context in comment.lower():
                    return True
        return False


    def _encoding_for_aspell (self, enc):
        # Map any spelling of UTF-8 to the exact name "UTF-8",
        # return all other encoding names unchanged.

        if re.search(r"utf.*8", enc, re.I):
            return "UTF-8"

        return enc


    def _get_personal_dict (self, lang, envs):
        # Collect all personal dictionaries found for given
        # language/environment and composit them into one to pass to Aspell.
        # Returns the path of the dictionary to use, or None if none found.

        dictFiles = set()
        for env in (envs or [""]):
            dictFiles.update(self._get_word_list_files(lang, env))
        dictFiles = sorted(dictFiles)

        if not dictFiles:
            return None

        # If only one, Aspell can just use it.
        if len(dictFiles) < 2:
            return dictFiles[0]

        # Composite all dictionary files into one temporary.
        words = []
        for dictFile in dictFiles:
            words.extend(_read_dict_file(dictFile))
        # The running count keeps composites of different
        # language/environment combinations in separate files, and keying
        # by path makes sure finalize() removes every one of them.
        tmpDictFile = ("compdict-%d-%d.aspell"
                       % (os.getpid(), len(self.tmpDictFiles)))
        self.tmpDictFiles[tmpDictFile] = tmpDictFile
        with open(tmpDictFile, "w", "UTF-8") as outFile:
            outFile.write("personal_ws-1.1 %s %d UTF-8\n" % (lang, len(words)))
            outFile.writelines([x + "\n" for x in words])
        return tmpDictFile


    def _get_word_list_files (self, lang, env):
        # Collect all applicable dictionaries: *.aspell files in the
        # environment's subdirectory of the language's spell directory
        # and in each of its parent directories up to the spell directory.

        dictFiles = set()
        spellRoot = join(datadir(), "lang", lang, "spell")
        spellSub = join(".", (env or ""))
        while spellSub:
            spellDir = join(spellRoot, spellSub)
            if isdir(spellDir):
                for item in os.listdir(spellDir):
                    if item.endswith(".aspell"):
                        dictFiles.add(join(spellDir, item))
            spellSub = dirname(spellSub)
        return dictFiles


# Read words from an Aspell personal dictionary.
def _read_dict_file (fname):
    """
    Return the list of words in an Aspell personal dictionary file.

    The fourth field of the header line names the file encoding.
    A warning is issued and an empty list returned if the header
    cannot be parsed.

    @param fname: path to the dictionary file
    @return: words, one per non-empty line after the header
    """

    # Parse the header for encoding.
    encDefault = "UTF-8"
    with open(fname, "r", encDefault) as dictFile:
        header = dictFile.readline()
        m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
        if not m:
            warning(_("@info",
                      "Malformed header in dictionary file '%(file)s'.",
                      file=fname))
            return []
        enc = m.group(4)
        if enc.lower() == encDefault.lower():
            # Header already consumed, the rest are words.
            return [line.strip() for line in dictFile if line.strip()]

    # Reopen in correct encoding if not the default.
    with open(fname, "r", enc) as dictFile:
        dictFile.readline() # skip the header, it is not a word
        return [line.strip() for line in dictFile if line.strip()]


# Simple spell checker which reads Aspell's personal dictionary file.
class _QuasiSpell (object):
    """
    Minimal checker that accepts only words from one dictionary file.

    It offers the C{check} and C{suggest} calls, so that it can stand in
    for an Aspell object where only supplemental dictionaries are wanted.
    """

    def __init__ (self, dictfile, encoding="UTF-8"):
        """
        @param dictfile: path to an Aspell personal dictionary file
        @param encoding: encoding of the raw text sent in for checking
        """

        # A set, because the words are only ever tested for membership.
        self.validWords = set(_read_dict_file(dictfile))
        self.encoding = encoding # of the raw text sent in for checking


    def check (self, encWord):
        """
        Tell whether the word, as given or lowercased, is in the dictionary.

        @param encWord: word to check, encoded in C{self.encoding}
            (already decoded text is accepted as well)
        @return: C{True} if the word is known, C{False} otherwise
        """

        # str has no decode() on Python 3; decode only real byte strings.
        if isinstance(encWord, bytes):
            word = encWord.decode(self.encoding)
        else:
            word = encWord
        return word in self.validWords or word.lower() in self.validWords


    def suggest (self, encWord):
        """
        Return an empty list: this checker offers no suggestions.

        @param encWord: misspelled word (ignored)
        """

        return []