pology/sieve/check_spell.py

0001 # -*- coding: UTF-8 -*-
0002
0003 """
0004 Spell-check translation using GNU Aspell (U{http://aspell.net/}).
0005
0006 Documented in C{doc/user/sieving.docbook}.
0007
0008 @author: Sébastien Renard <sebastien.renard@digitalfox.org>
0009 @license: GPLv3
0010 """
0011
0012 from codecs import open
0013 import locale
0014 import os
0015 from os.path import abspath, basename, dirname, isfile, isdir, join
0016 import re
0017 import sys
0018 from time import strftime
0019
0020 from pology import datadir, _, n_
0021 from pology.spell import flag_no_check_spell, elist_well_spelled
0022 from pology.colors import cjoin
0023 from pology.comments import manc_parse_list, manc_parse_flag_list
0024 import pology.config as cfg
0025 from pology.getfunc import get_hook_ireq
0026 from pology.msgreport import spell_error, spell_xml_error
0027 from pology.msgreport import report_msg_to_lokalize
0028 from pology.report import report, warning, format_item_list
0029 from pology.sieve import SieveError, SieveCatalogError
0030 from pology.split import proper_words
0031 from pology.sieve import add_param_spellcheck
0032
0033
def setup_sieve (p):
    """
    Declare the description and the parameters of this sieve.

    Sets the sieve description, adds the shared spell-checking parameters
    through C{add_param_spellcheck}, and then adds the parameters specific
    to this sieve: C{enc}, C{var}, C{xml} and C{simsp}.

    @param p: sieve parameter collector to register into
    """

    # Note: the _() calls are kept with literal arguments on purpose,
    # so that message extraction keeps seeing them.
    sieveDesc = _("@info sieve discription",
    "Spell-check translation using Aspell."
    )
    p.set_desc(sieveDesc)

    # Parameters common to all spell-checking sieves.
    add_param_spellcheck(p)

    # Encoding of the text handed over to Aspell.
    encMetavar = _("@info sieve parameter value placeholder",
                   "ENCODING")
    encDesc = _("@info sieve parameter discription",
    "Encoding for text sent to Aspell."
    )
    p.add_param("enc", str, metavar=encMetavar, desc=encDesc)

    # Variety of the dictionary.
    varMetavar = _("@info sieve parameter value placeholder",
                   "VARIETY")
    varDesc = _("@info sieve parameter discription",
    "Variety of the Aspell dictionary."
    )
    p.add_param("var", str, metavar=varMetavar, desc=varDesc)

    # Path of the optional XML report.
    xmlMetavar = _("@info sieve parameter value placeholder", "FILE")
    xmlDesc = _("@info sieve parameter discription",
    "Build XML report file at given path."
    )
    p.add_param("xml", str, metavar=xmlMetavar, desc=xmlDesc)

    # Fallback word splitting switch.
    simspDesc = _("@info sieve parameter discription",
    "Split text into words in a simpler way (deprecated,)."
    )
    p.add_param("simsp", bool, defval=False, desc=simspDesc)
0063
0064
class Sieve (object):
    """
    Process messages through the Aspell spell checker.

    Translated messages are split into words, every word is sent to
    the checker selected for the catalog's language and environments,
    and unknown words are either reported one by one (to the terminal
    or into an XML report) or collected and listed at the end.
    """

    def __init__ (self, params):
        """
        Set up the sieve from the parsed sieve parameters.

        Values not given as parameters (language, encoding, variety,
        environments, supplements-only and simple-split switches) are
        looked up in the C{aspell} section of the user configuration.

        @param params: sieve parameters; the attributes read here are
            C{lang}, C{enc}, C{var}, C{list}, C{xml}, C{accel},
            C{markup}, C{skip}, C{case}, C{filter}, C{env}, C{suponly},
            C{simsp} and C{lokalize}
        """

        self.nmatch = 0 # number of unknown words, reported in finalize
        self.filename = "" # base name of the catalog open in XML report
        self.xmlFile = None # file handle to write XML output

        # Build Aspell options.
        self.aspellOptions = {}

        # - assume markup in messages (provide option to disable?)
        self.aspellOptions["mode"] = "sgml"
        # FIXME: In fact not needed? The words are sent parsed to checker.

        # Explicit parameters win, the configuration fills the gaps.
        cfgs = cfg.section("aspell")
        self.lang = params.lang or cfgs.string("language")
        self.encoding = params.enc or cfgs.string("encoding")
        self.variety = params.var or cfgs.string("variety")

        # The locale may not state an encoding at all; without a fallback
        # every .encode() on option values below would raise TypeError.
        self.loc_encoding = locale.getlocale()[1] or "UTF-8"
        if not self.encoding:
            self.encoding = self.loc_encoding

        # Normalize the encoding that was actually selected
        # (not the locale one, which would discard the user's choice).
        self.encoding = self._encoding_for_aspell(self.encoding)

        self.aspellOptions["lang"] = (
            self.lang.encode(self.loc_encoding) if self.lang else None)
        self.aspellOptions["encoding"] = (
            self.encoding.encode(self.loc_encoding))
        if self.variety:
            self.aspellOptions["variety"] = (
                self.variety.encode(self.loc_encoding))

        # If not None, unknown words are only collected and listed at
        # the end (to ease copy/paste into personal dictionary).
        self.unknownWords = set() if params.list else None

        if params.xml:
            xmlPath = params.xml
            if os.access(dirname(abspath(xmlPath)), os.W_OK):
                #TODO: create nice api to manage xml file and move it to rules.py
                self.xmlFile = open(xmlPath, "w", "utf-8")
                self.xmlFile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                self.xmlFile.write('<pos date="%s">\n' % strftime('%c'))
            else:
                warning(_("@info",
                          "Cannot open file '%(file)s'. XML output disabled.",
                          file=xmlPath))

        self.accel = params.accel
        self.markup = params.markup

        # Words matching this regex are not checked.
        self.skipRx = None
        if params.skip:
            flags = re.U
            if not params.case:
                flags |= re.I
            self.skipRx = re.compile(params.skip, flags)

        # Pairs of (filter function, filter name) applied before checking.
        self.pfilters = [[get_hook_ireq(x, abort=True), x]
                         for x in (params.filter or [])]

        # Environments: parameter first, then configuration, else none.
        envs = params.env
        if envs is None:
            cfgEnvs = cfgs.string("environment")
            if cfgEnvs is not None:
                envs = cfgEnvs.split(",")
        self.envs = [x.strip() for x in (envs or [])]

        self.suponly = params.suponly
        if not self.suponly:
            self.suponly = cfgs.boolean("supplements-only", False)

        # NOTE: Temporary hack, remove when word splitting becomes smarter.
        self.simsp = params.simsp
        if not self.simsp:
            self.simsp = cfgs.boolean("simple-split", False)

        self.lokalize = params.lokalize

        # Language-dependent elements built along the way,
        # keyed by (language, environments).
        self.aspells = {}
        self.ignoredContexts = {}
        self.personalDicts = {}
        # Temporary composited dictionaries, removed in finalize.
        self.tmpDictFiles = {}

        # Indicators to the caller:
        self.caller_sync = False # no need to sync catalogs
        self.caller_monitored = False # no need for monitored messages


    def process_header (self, hdr, cat):
        """
        Select the checker and ignored contexts to use for a catalog.

        The language is the one set on the sieve, else the one the
        catalog states; likewise for environments. The checker and
        the list of ignored contexts for that combination are created
        on first use and cached. Explicitly given accelerator and markup
        are forced onto the catalog, and if an XML report is being
        written, a new C{<po>} section is opened in it.

        @param hdr: catalog header (not used)
        @param cat: catalog about to be processed

        @raise SieveCatalogError: if the language cannot be determined
        @raise SieveError: if Aspell cannot be configured or initialized,
            or if only supplemental dictionaries are to be used
            but none were found
        """

        # Check if the catalog itself states the language, and if yes,
        # create the language-dependent stuff if not already created
        # for this language.
        clang = self.lang or cat.language()
        if not clang:
            raise SieveCatalogError(
                _("@info",
                  "Cannot determine language for catalog '%(file)s'.",
                  file=cat.filename))
        cenvs = self.envs or cat.environment() or []
        ckey = (clang, tuple(cenvs))
        if ckey not in self.aspells:
            # New language.
            self.aspellOptions["lang"] = clang.encode(self.loc_encoding)

            # Get Pology's internal personal dictonary for this langenv.
            if ckey not in self.personalDicts: # may be in but None
                self.personalDicts[ckey] = self._get_personal_dict(clang, cenvs)
            personalDict = self.personalDicts[ckey]
            if personalDict:
                self.aspellOptions["personal-path"] = (
                    personalDict.encode(self.loc_encoding))
            else:
                self.aspellOptions.pop("personal-path", None) # remove previous

            if not self.suponly:
                # Create Aspell object.
                import pology.external.pyaspell as A
                try:
                    self.aspells[ckey] = A.Aspell(list(self.aspellOptions.items()))
                except A.AspellConfigError as e:
                    raise SieveError(
                        _("@info",
                          "Aspell configuration error:\n%(msg)s",
                          msg=e))
                except A.AspellError as e:
                    raise SieveError(
                        _("@info",
                          "Cannot initialize Aspell:\n%(msg)s",
                          msg=e))
            else:
                # Create simple internal checker that only checks against
                # internal supplemental dictionaries.
                if not personalDict:
                    raise SieveError(_("@info",
                                       "No supplemental dictionaries found."))
                self.aspells[ckey] = _QuasiSpell(personalDict, self.encoding)

            # Load list of contexts by which to ignore messages
            # (one per line, lowercased; blank and #-lines are skipped).
            ignored = []
            ignoredContextFile = join(datadir(), "lang", clang, "spell",
                                      "ignoredContext")
            if isfile(ignoredContextFile):
                with open(ignoredContextFile, "r", "utf-8") as ctxFile:
                    for line in ctxFile:
                        line = line.strip()
                        if line and not line.startswith("#"):
                            ignored.append(line.lower())
            self.ignoredContexts[ckey] = ignored

        # Get language-dependent stuff.
        self.aspell = self.aspells[ckey]
        self.ignoredContext = self.ignoredContexts[ckey]

        # Force explicitly given accelerators and markup.
        if self.accel is not None:
            cat.set_accelerator(self.accel)
        if self.markup is not None:
            cat.set_markup(self.markup)

        # Close previous/open new XML section.
        if self.xmlFile:
            filename = os.path.basename(cat.filename)
            # Close previous PO.
            if self.filename != "":
                self.xmlFile.write("</po>\n")
            self.filename = filename
            # Open new PO.
            self.xmlFile.write('<po name="%s">\n' % filename)


    def process (self, msg, cat):
        """
        Spell-check all translation texts of one message.

        Untranslated messages, messages matching an ignored context and
        messages flagged as not to be spell-checked are passed over.
        Each unknown word increases the match counter and is either
        collected (list mode) or reported together with suggestions,
        into the XML report if there is one, otherwise through
        C{spell_error}. If requested, the problems are also sent
        to Lokalize.

        @param msg: message to check
        @param cat: catalog the message belongs to

        @raise SieveError: if a precheck filter cannot be executed
        """

        if not msg.translated:
            return

        # These exclusions depend on the message alone, so they are
        # decided once rather than for each plural form.
        if self._has_ignored_context(msg):
            return
        if flag_no_check_spell in manc_parse_flag_list(msg, "|"):
            return

        # Words explicitly listed as good in the message itself.
        locallyIgnored = manc_parse_list(msg, elist_well_spelled, ",")

        failedSuggs = [] # pairs of wrong words and suggestions

        # The index of the plural form is needed for the XML report.
        for formIndex, msgstr in enumerate(msg.msgstr):

            # Apply precheck filters.
            for pfilter, pfname in self.pfilters:
                try: # try as type F1A hook
                    msgstr = pfilter(msgstr)
                except TypeError:
                    try: # try as type F3* hook
                        msgstr = pfilter(msgstr, msg, cat)
                    except TypeError:
                        raise SieveError(
                            _("@info",
                              "Cannot execute filter '%(filt)s'.",
                              filt=pfname))

            # Split text into words.
            if not self.simsp:
                words = proper_words(msgstr, True, cat.accelerator(),
                                     msg.format)
            else:
                # NOTE: Temporary, remove when proper_words becomes smarter.
                words = msgstr.split()

            # Eliminate from checking words matching the skip regex.
            if self.skipRx:
                words = [x for x in words if not self.skipRx.search(x)]

            # Eliminate from checking words explicitly listed as good.
            words = [x for x in words if x not in locallyIgnored]

            for word in words:
                # The encoding step is inside the try block too: a word
                # that cannot be represented in the selected encoding
                # should produce the warning below, not abort the run.
                try:
                    encodedWord = word.encode(self.encoding)
                    if self.aspell.check(encodedWord) is not False:
                        continue
                    self.nmatch += 1
                    if self.unknownWords is not None:
                        self.unknownWords.add(word)
                        continue
                    encodedSuggestions = self.aspell.suggest(encodedWord)
                    suggestions = [i.decode(self.encoding)
                                   for i in encodedSuggestions]
                    failedSuggs.append((word, suggestions))
                    if self.xmlFile:
                        xmlError = spell_xml_error(msg, cat, word,
                                                   suggestions, formIndex)
                        self.xmlFile.writelines(xmlError)
                    else:
                        spell_error(msg, cat, word, suggestions)
                except UnicodeEncodeError:
                    warning(_("@info",
                              "Cannot encode word '%(word)s' in "
                              "selected encoding '%(enc)s'.",
                              word=word, enc=self.encoding))

        if failedSuggs and self.lokalize:
            repls = [_("@label", "Spelling errors:")]
            for word, suggs in failedSuggs:
                if suggs:
                    fmtsuggs = format_item_list(suggs)
                    repls.append(_("@item",
                                   "%(word)s (suggestions: %(wordlist)s)",
                                   word=word, wordlist=fmtsuggs))
                else:
                    repls.append("%s" % (word))
            report_msg_to_lokalize(msg, cat, cjoin(repls, "\n"))


    def finalize (self):
        """
        Clean up and write out the final summary.

        Removes the temporary composited dictionaries, then either lists
        the collected unknown words (list mode) or reports how many
        unknown words were encountered, and finally closes
        the XML report if one was being written.
        """

        # Remove composited personal dictionaries.
        for tmpDictFile in list(self.tmpDictFiles.values()):
            if isfile(tmpDictFile):
                os.unlink(tmpDictFile)

        if self.unknownWords is not None:
            slist = self._sorted_unknown_words()
            if slist:
                report("\n".join(slist))
        elif self.nmatch:
            msg = n_("@info:progress",
                     "Encountered %(num)d unknown word.",
                     "Encountered %(num)d unknown words.",
                     num=self.nmatch)
            report("===== " + msg)

        if self.xmlFile:
            # Only close a <po> section if one was ever opened.
            if self.filename != "":
                self.xmlFile.write("</po>\n")
            self.xmlFile.write("</pos>\n")
            self.xmlFile.close()


    def _has_ignored_context (self, msg):
        """
        Check whether the message is to be skipped by its context.

        @param msg: message to examine

        @return: C{True} if any ignored context is a substring of
            the lowercased C{msgctxt} or of any lowercased
            automatic comment of the message
        @rtype: bool
        """

        msgctxt = (msg.msgctxt or "").lower()
        comments = [x.lower() for x in msg.auto_comment]
        for context in self.ignoredContext:
            if context in msgctxt:
                return True
            if any(context in x for x in comments):
                return True
        return False


    def _sorted_unknown_words (self):
        """
        Get the collected unknown words in locale collation order.

        Words are compared case-insensitively. A key function is used
        because C{list.sort} no longer accepts a comparison function
        in Python 3; C{locale.strxfrm} keys order the same
        as C{locale.strcoll} comparisons.

        @return: sorted unknown words
        @rtype: list of strings
        """

        return sorted(self.unknownWords,
                      key=lambda x: locale.strxfrm(x.lower()))


    def _encoding_for_aspell (self, enc):
        """
        Normalize the encoding name for Aspell.

        @param enc: encoding name, possibly C{None}

        @return: C{"UTF-8"} for anything matching C{utf.*8}
            case-insensitively, otherwise the given value unchanged
        """

        if enc and re.search(r"utf.*8", enc, re.I):
            return "UTF-8"

        return enc


    def _get_personal_dict (self, lang, envs):
        """
        Get a single personal dictionary for the language/environments.

        All dictionaries found for the given language and environments
        are collected. A single one is used directly; several are
        composited into one temporary file in the current working
        directory, which is registered for removal in L{finalize}.

        @param lang: language code
        @param envs: environments to look into (may be empty)

        @return: path to the dictionary file, or C{None} if there is none
        """

        dictFiles = set()
        for env in (envs or [""]):
            dictFiles.update(self._get_word_list_files(lang, env))
        dictFiles = sorted(dictFiles)

        if not dictFiles:
            return None

        # If only one, Aspell can just use it.
        if len(dictFiles) < 2:
            return dictFiles[0]

        # Composite all dictionary files into one temporary.
        words = []
        for dictFile in dictFiles:
            words.extend(_read_dict_file(dictFile))
        # NOTE(review): the name depends only on the process ID, so
        # a later composite overwrites an earlier one; presumably fine
        # since checkers are created right after -- confirm.
        tmpDictFile = ("compdict-%d.aspell" % os.getpid())
        self.tmpDictFiles[lang] = tmpDictFile
        with open(tmpDictFile, "w", "UTF-8") as tmpFile:
            tmpFile.write("personal_ws-1.1 %s %d UTF-8\n" % (lang, len(words)))
            tmpFile.writelines([x + "\n" for x in words])
        return tmpDictFile


    def _get_word_list_files (self, lang, env):
        """
        Collect all dictionary files applicable to the environment.

        Starting from the environment's subdirectory of the language's
        C{spell} directory and walking up to that directory itself,
        every file ending in C{.aspell} is collected.

        @param lang: language code
        @param env: environment name, or empty for none

        @return: paths of dictionary files
        @rtype: set of strings
        """

        dictFiles = set()
        spellRoot = join(datadir(), "lang", lang, "spell")
        # Starts as "./<env>"; dirname() strips one level per step
        # and finally yields the empty string, which ends the loop.
        spellSub = join(".", (env or ""))
        while spellSub:
            spellDir = join(spellRoot, spellSub)
            if isdir(spellDir):
                for item in os.listdir(spellDir):
                    if item.endswith(".aspell"):
                        dictFiles.add(join(spellDir, item))
            spellSub = dirname(spellSub)
        return dictFiles
0418
0419
0420 # Read words from an Aspell personal dictionary.
0421 def _read_dict_file (fname):
0422
0423     # Parse the header for encoding.
0424     encDefault="UTF-8"
0425     file=open(fname, "r", encDefault)
0426     header=file.readline()
0427     m=re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
0428     if not m:
0429         warning(_("@info",
0430                   "Malformed header in dictionary file '%(file)s'.",
0431                   file=filepath))
0432         return []
0433     enc=m.group(4)
0434     # Reopen in correct encoding if not the default.
0435     if enc.lower() != encDefault.lower():
0436         file.close()
0437         file=open(fname, "r", enc)
0438
0439     # Read words.
0440     words=[]
0441     for line in file:
0442         word=line.strip()
0443         if word:
0444             words.append(word)
0445     return words
0446
0447
class _QuasiSpell (object):
    """
    Simple spell checker which reads Aspell's personal dictionary file.

    It offers the C{check} and C{suggest} methods that the sieve calls
    on a checker, but knows only the words from the given dictionary
    and never makes suggestions.
    """

    def __init__ (self, dictfile, encoding="UTF-8"):
        """
        @param dictfile: path to the Aspell personal dictionary to load
        @param encoding: encoding of the raw text sent in for checking
        """

        # A set, since membership is tested for every checked word.
        self.validWords = set(_read_dict_file(dictfile))
        self.encoding = encoding # of the raw text sent in for checking


    def check (self, encWord):
        """
        Check whether the word is in the dictionary.

        The word is valid if either the word itself or its lowercased
        form is in the dictionary.

        @param encWord: word to check, as bytes in the encoding given
            at construction (already decoded text is accepted as is)

        @return: C{True} if the word is valid, C{False} otherwise
        @rtype: bool
        """

        # Decode through the object's own method: str.decode() does not
        # exist in Python 3, where the encoded word arrives as bytes.
        if isinstance(encWord, bytes):
            word = encWord.decode(self.encoding)
        else:
            word = encWord
        return word in self.validWords or word.lower() in self.validWords


    def suggest (self, encWord):
        """
        Get spelling suggestions for the word.

        @param encWord: word to get suggestions for (not used)

        @return: always an empty list, this checker cannot suggest
        @rtype: list
        """

        return []
0470