pology/sieve/check_spell_ec.py

0001 # -*- coding: utf-8 -*-
0002
0003 """
0004 Spell-check translation using Enchant (U{http://www.abisource.com/projects/enchant/}).
0005
0006 Documented in C{doc/user/sieving.docbook}.
0007
0008 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0009 @license: GPLv3
0010 """
0011
0012 import codecs
0013 from functools import cmp_to_key
0014 from locale import strcoll
0015 import os
0016 from pathlib import Path
0017 import re
0018 import tempfile
0019 from time import strftime
0020
0021 from pology import PologyError, datadir, _, n_
0022 from pology.spell import flag_no_check_spell, elist_well_spelled
0023 from pology.colors import cjoin
0024 from pology.comments import manc_parse_list, manc_parse_flag_list
0025 import pology.config as cfg
0026 from pology.getfunc import get_hook_ireq
0027 from pology.msgreport import report_on_msg
0028 from pology.msgreport import report_msg_to_lokalize
0029 from pology.msgreport import spell_xml_error
0030 from pology.report import report, warning, format_item_list
0031 from pology.sieve import SieveError, SieveCatalogError
0032 from pology.split import proper_words
0033 from pology.sieve import add_param_spellcheck, add_param_poeditors
0034
0035
def setup_sieve (p):
    """
    Declare the description and the parameters of this sieve.

    Registers the sieve description, the C{provider} parameter
    (a separated list of strings), the C{xml} parameter (a string),
    and whatever parameters C{add_param_spellcheck} contributes.

    @param p: parameter collector on which C{set_desc} and C{add_param}
        are called
    """

    sieve_desc = _("@info sieve description",
    "Spell-check translation using Enchant."
    )
    p.set_desc(sieve_desc)

    # Enchant-specific: which backend(s) to ask for a dictionary.
    provider_metavar = _("@info sieve parameter value placeholder", "NAME")
    provider_desc = _("@info sieve parameter description",
    "The spell-checking provider to use. "
    "Several provider can be given as comma-separated list."
    )
    p.add_param("provider", str, seplist=True,
                metavar=provider_metavar, desc=provider_desc)

    # Parameters shared between spell-checking sieves
    # (presumably those read later in Sieve.__init__: lang, env, etc.).
    add_param_spellcheck(p)

    # Optional machine-readable report.
    xml_metavar = _("@info sieve parameter value placeholder", "FILE")
    xml_desc = _("@info sieve parameter description",
    "Build XML report file at given path."
    )
    p.add_param("xml", str, metavar=xml_metavar, desc=xml_desc)
0056
0057
class Sieve (object):
    """
    Sieve which spell-checks translated messages using Enchant.

    Unknown words are either reported per message (optionally with
    suggestions, to the terminal, to Lokalize and to an XML report file),
    or collected and listed at the end when only the word list
    is requested.
    """

    def __init__ (self, params):
        """
        Set up the sieve from sieve parameters and user configuration.

        Values given through C{params} take precedence over those
        found in the C{enchant} section of the user configuration.

        @param params: sieve parameters, as declared by C{setup_sieve}
        """

        cfgs = cfg.section("enchant")

        # Providers are handed to Enchant as a single comma-separated string.
        self.providers = (   ",".join(params.provider or "")
                          or cfgs.string("provider")
                          or None)

        self.lang = (   params.lang
                     or cfgs.string("language")
                     or None)

        self.envs = params.env
        if self.envs is None and cfgs.string("environment") is not None:
            self.envs = cfgs.string("environment").split(",")
        if self.envs is None:
            self.envs = []
        self.envs = [x.strip() for x in self.envs]

        self.accel = params.accel

        self.markup = params.markup

        self.skip_rx = None
        if params.skip is not None:
            flags = re.U
            if not params.case:
                flags |= re.I
            self.skip_rx = re.compile(params.skip, flags)

        # Pairs of [filter hook, filter name as requested].
        self.pfilters = [[get_hook_ireq(x, abort=True), x]
                         for x in (params.filter or [])]

        self.suponly = params.suponly

        self.words_only = params.list
        self.lokalize = params.lokalize

        # File we are processing
        # (non-empty only while a <po> element is open in the XML report).
        self.filename = ""
        # File used for the XML output, if requested
        self.xmlFile = None

        # Langenv-dependent elements built along the way,
        # keyed by (language, tuple of environments).
        self.checkers = {}
        self.word_lists = {}

        # Tracking of unknown words.
        self.unknown_words = set()

        # Indicators to the caller:
        self.caller_sync = False # no need to sync catalogs
        self.caller_monitored = False # no need for monitored messages

        if params.xml:
            xml_file = None
            try:
                # TODO: create nice api to manage xml file in rules.py
                # The XML declaration states UTF-8, so the file must be
                # written as UTF-8 regardless of the locale's encoding.
                xml_file = Path(params.xml).open("w", encoding="utf-8")
                xml_file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                xml_file.write('<pos date="%s">\n' % strftime('%c'))
            except Exception as exc:
                # Best effort: the sieve keeps working without XML output.
                # Make sure a half-initialized file is not used later.
                if xml_file is not None:
                    try:
                        xml_file.close()
                    except OSError:
                        pass # nothing more can be done with this file
                warning(_("@info",
                          "Cannot open file '%(file)s': %(ex)s. XML output "
                          "disabled.", file=params.xml, ex=exc))
            else:
                self.xmlFile = xml_file


    def process_header (self, hdr, cat):
        """
        Prepare the spell-checker for the given catalog.

        Selects (creating it on first use) the checker for the catalog's
        language and environments, forces explicitly given accelerator
        and markup onto the catalog, and opens a new C{<po>} element
        in the XML report if one is being written.

        @param hdr: catalog header (not used here)
        @param cat: catalog about to be processed

        @raises SieveCatalogError: if the language cannot be determined
        @raises SieveError: if no spell-checker could be created
        """

        # Check if the catalog itself states the language, and if yes,
        # create the language-dependent stuff if not already created
        # for this langenv.
        clang = self.lang or cat.language()
        if not clang:
            raise SieveCatalogError(
                _("@info",
                  "Cannot determine language for catalog '%(file)s'.",
                  file=cat.filename))
        cenvs = self.envs or cat.environment() or []
        ckey = (clang, tuple(cenvs))
        if ckey not in self.checkers:
            # Get Pology's internal word list for this langenv.
            # The cache is keyed by langenv, so it must be probed by it.
            if ckey not in self.word_lists: # may be in but None
                self.word_lists[ckey] = _compose_word_list(clang, cenvs)
            # Create spell-checker object.
            # With suponly, no system dictionary is requested,
            # only the internal word list is used.
            clang_mod = None if self.suponly else clang
            checker = _create_checker(self.providers, clang_mod,
                                      self.word_lists[ckey])
            if not checker:
                raise SieveError(
                    _("@info",
                      "No spelling dictionary for language '%(lang)s' and "
                      "provider '%(prov)s'.",
                      lang=clang, prov=self.providers))
            self.checkers[ckey] = checker

        # Get language-dependent stuff.
        self.checker = self.checkers[ckey]

        # Force explicitly given accelerators and markup.
        if self.accel is not None:
            cat.set_accelerator(self.accel)
        if self.markup is not None:
            cat.set_markup(self.markup)

        # Close previous/open new XML section.
        if self.xmlFile:
            from xml.sax.saxutils import escape

            filename = Path(cat.filename).name
            # Close previous PO.
            if self.filename != "":
                self.xmlFile.write("</po>\n")
            self.filename = filename
            # Open new PO. The name goes into a double-quoted attribute,
            # so markup characters and double quotes must be escaped.
            poTag = '<po name="%s">\n' % escape(filename, {'"': "&quot;"})
            self.xmlFile.write(poTag) # Write to result


    def process (self, msg, cat):
        """
        Spell-check the translation of one message.

        Every unknown word is added to C{self.unknown_words}. Unless only
        the word list was requested, each unknown word is also reported
        on the message and written to the XML report (if open).
        If requested, all unknown words of the message are sent
        to Lokalize in a single report.

        @param msg: message to check; untranslated messages are skipped
        @param cat: catalog to which the message belongs
        """

        if not msg.translated:
            return

        # Skip message if explicitly requested.
        # (Message-level property, so checked once rather than per msgstr.)
        if flag_no_check_spell in manc_parse_flag_list(msg, "|"):
            return

        # Words explicitly listed as good for this message.
        locally_ignored = manc_parse_list(msg, elist_well_spelled, ",")

        # Triplets of (word, suggestions, whether suggestions were cut).
        failed_w_suggs = []

        # msgstr_cnt is the index of the msgstr within the message.
        for msgstr_cnt, msgstr in enumerate(msg.msgstr):

            # Apply precheck filters.
            for pfilter, pfname in self.pfilters:
                try: # try as type F1A hook
                    msgstr = pfilter(msgstr)
                except TypeError:
                    try: # try as type F3* hook
                        msgstr = pfilter(msgstr, msg, cat)
                    except TypeError:
                        raise SieveError(
                            _("@info",
                              "Cannot execute filter '%(filt)s'.",
                              filt=pfname))

            # Split text into words.
            # TODO: See to use markup types somehow.
            words = proper_words(msgstr, True, cat.accelerator(), msg.format)

            # Eliminate from checking words matching the skip regex.
            if self.skip_rx:
                words = [x for x in words if not self.skip_rx.search(x)]

            # Eliminate from checking words explicitly listed as good.
            words = [x for x in words if x not in locally_ignored]

            for word in words:
                if self.checker.check(word):
                    continue

                self.unknown_words.add(word)

                # Suggestions are needed for any kind of per-message report.
                suggs = []
                incmp = False
                if not self.words_only or self.lokalize:
                    suggs = self.checker.suggest(word)
                    if len(suggs) > 5: # do not put out too many words
                        suggs = suggs[:5]
                        incmp = True
                    failed_w_suggs.append((word, suggs, incmp))

                if not self.words_only:
                    if self.xmlFile:
                        xmlError = spell_xml_error(msg, cat, word, suggs,
                                                   msgstr_cnt)
                        self.xmlFile.writelines(xmlError)

                    if suggs:
                        fsuggs = format_item_list(suggs, incmp=incmp)
                        report_on_msg(_("@info",
                                        "Unknown word '%(word)s' "
                                        "(suggestions: %(wordlist)s).",
                                        word=word, wordlist=fsuggs),
                                      msg, cat)
                    else:
                        report_on_msg(_("@info",
                                        "Unknown word '%(word)s'.",
                                        word=word),
                                      msg, cat)

        if self.lokalize and failed_w_suggs:
            repls = [_("@label", "Spelling errors:")]
            # Each word carries its own truncation indicator, so that
            # the list of suggestions is marked incomplete only when
            # it was really cut for that word.
            for word, suggs, incmp in failed_w_suggs:
                if suggs:
                    fmtsuggs = format_item_list(suggs, incmp=incmp)
                    repls.append(_("@item",
                                   "%(word)s (suggestions: %(wordlist)s)",
                                   word=word, wordlist=fmtsuggs))
                else:
                    repls.append("%s" % (word))
            report_msg_to_lokalize(msg, cat, cjoin(repls, "\n"))


    def finalize (self):
        """
        Report the summary of unknown words and close the XML report.

        When only the word list was requested, the collected unknown
        words are output one per line, sorted by locale collation;
        otherwise only their count is reported.
        """

        if self.unknown_words:
            if not self.words_only:
                nwords = len(self.unknown_words)
                msg = n_("@info:progress",
                         "Encountered %(num)d unknown word.",
                         "Encountered %(num)d unknown words.",
                         num=nwords)
                report("===== " + msg)
            else:
                wlist = list(self.unknown_words)
                wlist.sort(key=cmp_to_key(strcoll))
                report("\n".join(wlist))

        if self.xmlFile:
            try:
                # Close the last PO only if one was actually opened,
                # otherwise the report would not be well-formed.
                if self.filename != "":
                    self.xmlFile.write("</po>\n")
                self.xmlFile.write("</pos>\n")
            finally:
                self.xmlFile.close()
0281
0282
0283 # Get checker object from Enchant.
def _create_checker (providers, langtag, words):
    """
    Create an Enchant spell-checker and feed it additional words.

    @param providers: comma-separated provider names giving the order
        in which Enchant providers are tried, or C{None} for default order
    @param langtag: language of the dictionary to request, or C{None}
        to create a checker backed only by an (empty) personal word list
    @param words: additional words to accept in this session
        (may be C{None} or empty)

    @returns: the checker object, or C{None} if the dictionary
        for the given language could not be obtained

    @raises PologyError: if the Python wrapper for Enchant is not installed
    """

    try:
        import enchant
    except ImportError:
        pkgs = ["python-enchant"]
        raise PologyError(_("@info",
                            "Python wrapper for Enchant not found, "
                            "please install it (possible package names: "
                            "%(pkglist)s).",
                            pkglist=format_item_list(pkgs)))

    if langtag is not None:
        try:
            broker = enchant.Broker()
            if providers is not None:
                broker.set_ordering(langtag, providers)
            checker = broker.request_dict(langtag)
            # Probe the checker, so that a broken one fails here
            # and not in the middle of sieving.
            checker.check(".")
        except Exception:
            # Any failure means no usable dictionary; the caller reports it.
            # (Not a bare except, so that interrupts are not swallowed.)
            checker = None
    else:
        # Obtain an unused temporary path: closing the named temporary
        # file removes it, leaving only the name to hand over to Enchant.
        tmpf = tempfile.NamedTemporaryFile()
        tmpf.close()
        checker = enchant.request_pwl_dict(tmpf.name)
        # Words are added to the session only, the file is not needed.
        try:
            os.unlink(tmpf.name)
        except FileNotFoundError:
            pass # the backend did not create the file, nothing to remove

    if checker:
        pname = checker.provider.name.split()[0].lower()
        # NOTE(review): presumably these providers do not accept
        # capitalized and upper-case forms of session words on their own.
        need_upcasing = (pname in ("personal", "myspell"))
        for word in words or []:
            if not word:
                continue # nothing to add, and word[0] below would fail
            checker.add_to_session(word)
            if need_upcasing:
                checker.add_to_session(word[0].upper() + word[1:])
                checker.add_to_session(word.upper())
    return checker
0320
0321
0322 # Collect words from all internal word lists
0323 # available for given language+environment.
def _compose_word_list (lang, envs):
    """
    Gather the words of all internal word lists for a language
    and a set of environments.

    @param lang: language code
    @param envs: environments to collect word lists for;
        if empty or C{None}, only the empty environment is considered

    @returns: words from all applicable word list files, in the order
        of sorted file paths (a file found through several environments
        is read only once)
    """

    # Union of the word list files over all requested environments.
    per_env_files = [_get_word_list_files(lang, env)
                     for env in (envs or [""])]
    ordered_files = sorted(set().union(*per_env_files))

    # Concatenate the contents, file by file.
    return [word
            for path in ordered_files
            for word in _read_wlist_aspell(path)]
0338
0339
def _get_word_list_files (lang, env):
    """
    Find internal Aspell word list files for a language and an environment.

    Looks into the environment's subdirectory of the language's
    C{spell} directory and into each of its parent directories
    up to the C{spell} directory itself, collecting all files
    whose name ends with C{.aspell}.

    @param lang: language code
    @param env: environment name (relative subdirectory path),
        or empty/C{None} for no environment

    @returns: set of paths of the word list files found
    """

    # Collect word list paths.
    wlist_files = set()
    spell_root = os.path.join(datadir(), "lang", lang, "spell")
    # Start with "./<env>", so that walking up by dirname() passes
    # through "." (the spell root itself) before becoming empty.
    spell_subdir = os.path.join(".", (env or ""))
    while spell_subdir:
        spell_dir = os.path.join(spell_root, spell_subdir)
        if os.path.isdir(spell_dir):
            for item in os.listdir(spell_dir):
                if item.endswith(".aspell"):
                    wlist_files.add(os.path.join(spell_dir, item))
        parent_subdir = os.path.dirname(spell_subdir)
        if parent_subdir == spell_subdir:
            # dirname() reached a fixed point (filesystem root, which
            # happens if an absolute path was given as environment);
            # without this the loop would never terminate.
            break
        spell_subdir = parent_subdir
    return wlist_files
0354
0355
0356 # Read words from an Aspell word list.
0357 def _read_wlist_aspell (fname):
0358
0359     # Parse the header for encoding.
0360
0361     defenc = "UTF-8"
0362     fl = codecs.open(fname, "r", defenc)
0363     header = fl.readline()
0364     m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
0365     if not m:
0366         warning(_("@info",
0367                   "Malformed header in dictionary file '%(file)s'.",
0368                   file=fname))
0369         return []
0370     enc = m.group(4)
0371     # Reopen in correct encoding if not the default.
0372     if enc.lower() != defenc.lower():
0373         fl.close()
0374         fl = codecs.open(fname, "r", enc)
0375
0376     # Read words.
0377     words = []
0378     for line in fl:
0379         word = line.strip()
0380         if word:
0381             words.append(word)
0382     return words