File indexing completed on 2024-04-21 16:29:17

0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Check spelling in text using different spell checkers.
0005 
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 author: Javier Vinal (Javier Viñal) <fjvinal@gmail.com>
0008 @license: GPLv3
0009 """
0010 
0011 import os
0012 import codecs
0013 import re
0014 import tempfile
0015 
0016 from pology import PologyError, datadir, _, n_
0017 from pology.comments import manc_parse_flag_list, manc_parse_list
0018 import pology.config
0019 from pology.msgreport import report_on_msg
0020 from pology.report import warning, format_item_list
0021 
0022 
# Pipe flag to manually prevent spellcheck for a particular message.
# The checking hook returns early when this string is found among
# the flags obtained by manc_parse_flag_list(msg, "|").
flag_no_check_spell = "no-check-spell"

# Embedded list of words manually declared valid for a particular message.
# This is the prefix handed to manc_parse_list(), together with "," as
# the separator, to collect the words that should not be checked.
elist_well_spelled = "well-spelled:"
0028 
0029 
def check_spell (lang=None, encoding="UTF-8", variety=None, extopts=None,
                 envs=None, suponly=False, maxsugg=5):
    """
    Check spelling using Aspell [hook factory].

    Aspell language is selected by the C{lang} parameter, which should be
    a language code of one of the installed spelling dictionaries.
    Text encoding used by the dictionary is provided by the C{encoding}
    parameter. If the dictionary comes in several varieties, a non-default
    one is selected using the C{variety} parameter.
    Any additional options from the set of Aspell configuration fields can
    be passed in as (name, value) dictionary by the C{extopts} parameter;
    if it is C{None}, no additional options are sent.

    Pology may contain internal supplemental dictionaries for selected
    language in C{lang/<lang>/spell/} directory, and these are automatically
    picked up. Any subdirectories in C{lang/<lang>/spell/} are considered
    as to contain supplemental dictionaries in special "environments"
    (e.g. jargon, certain projects, etc.), and are not included by default.
    Such environments can be included by the C{envs} parameter, which
    is a list of relative paths added to C{lang/<lang>/spell/} directory.
    All supplemental dictionaries from such paths are included, as well as
    from all their parent directories up to C{lang/<lang>/spell/}
    (this makes supplemental dictionaries hierarchical, e.g.
    environment C{foo/bar} is a child of C{foo}, and thus when C{foo/bar}
    is requested, both its and supplements of C{foo} are used).

    If C{lang} is C{None}, then automatic detection of the language based
    on the catalog of the message is attempted
    (see catalog L{language()<catalog.Catalog.language>} method).
    Similar is attempted for environments if C{envs} is C{None}
    (see catalog L{environment()<catalog.Catalog.environment>} method).

    Aspell's system dictionary can be completely excluded from the check
    by the C{suponly} parameter, when the check will use only internal
    supplemental dictionaries.

    Misspelled words are reported to stdout, with suggestions if available.
    Maximum number of suggestions to display is selected by the C{maxsugg}
    parameter; if negative, all suggestions are shown.

    Spell checking is performed by internally splitting text into words, and
    querying Aspell word by word. Splitting is performed in a simple fashion;
    it is assumed that text has been appropriately filtered down to plain text,
    e.g. that any XML-like markup and other literals have been removed
    (see L{pology.remove} for filtering possibilities).

    Spell checking can be skipped entirely on a message by issuing
    the C{no-check-spell} L{sieve flag<sieve.parse_sieve_flags>}.
    Alternatively, only certain words may be declared well spelled
    by adding a manual comment starting with C{well-spelled:}
    and followed by comma-separated list of words. Example::

        # |, no-check-spell
        msgid "Aaaargh, gahhh, khh..."
        msgstr ""

        # well-spelled: Aaaargh, kh
        msgid "Aaaargh, kh, kh... I have been defeated...!"
        msgstr ""

    @param lang: language of spelling dictionary
    @type lang: string
    @param encoding: encoding used by the dictionary
    @type encoding: string
    @param variety: variety of dictionary
    @type variety: string
    @param extopts: additional options to send to Aspell
    @type extopts: dict or C{None}
    @param envs: environments for supplemental dictionaries
    @type envs: list of strings
    @param suponly: whether to use only supplemental dictionaries
    @type suponly: bool
    @param maxsugg: maximum number of suggestions to show for misspelled word
    @type maxsugg: int

    @return: type S3A hook
    @rtype: C{(text, msg, cat) -> numerr}
    """

    # None stands for "no extra options"; a fresh dictionary is made on
    # each call so that no mutable default is shared between hooks.
    if extopts is None:
        extopts = {}

    provider = "aspell-raw"
    return _check_spell_w(provider, lang, encoding, variety, extopts,
                          envs, suponly, maxsugg, False)
0112 
0113 
def check_spell_sp (lang=None, encoding="UTF-8", variety=None, extopts=None,
                    envs=None, suponly=False, maxsugg=5):
    """
    Like L{check_spell}, except that erroneous spans are returned
    instead of reporting problems to stdout [hook factory].

    Parameters have the same meaning as in L{check_spell};
    C{extopts} set to C{None} means that no additional options
    are sent to Aspell.

    @return: type V3A hook
    @rtype: C{(text, msg, cat) -> spans}
    """

    # None stands for "no extra options"; a fresh dictionary is made on
    # each call so that no mutable default is shared between hooks.
    if extopts is None:
        extopts = {}

    provider = "aspell-raw"
    return _check_spell_w(provider, lang, encoding, variety, extopts,
                          envs, suponly, maxsugg, True)
0127 
0128 
def _check_spell_w (provider, lang, encoding, variety, extopts,
                    envs, suponly, maxsugg, spanrep):
    """
    Worker for C{check_spell*} hook factories.

    Builds and returns the hook C{spcheck(text, msg, cat)}. The hook
    splits the text into words, drops the words declared well spelled
    in the message, and queries the checker for every remaining word.
    Checkers are constructed on first need and cached per combination
    of language and environments.

    @param provider: C{"aspell-raw"} to use Aspell directly; any other
        value selects Enchant, where a false value means that the provider
        is read from the C{[enchant]/provider} configuration field
    @type provider: string
    @param lang: language of spelling dictionary; if C{None}, the language
        of the catalog is used, and for Enchant finally
        the C{[enchant]/language} configuration field
    @type lang: string
    @param encoding: encoding in which words are sent to the checker
        and in which suggestions are received
    @type encoding: string
    @param variety: variety of dictionary
    @type variety: string
    @param extopts: additional options for Aspell (not used for Enchant)
    @type extopts: dict
    @param envs: environments for supplemental dictionaries; if C{None},
        the environment of the catalog is used, and for Enchant finally
        the C{[enchant]/environment} configuration field
    @type envs: list of strings
    @param suponly: whether to use only supplemental dictionaries
    @type suponly: bool
    @param maxsugg: maximum number of suggestions to show for misspelled
        word; zero shows none, negative shows all
    @type maxsugg: int
    @param spanrep: whether the hook returns erroneous spans instead of
        reporting them and returning their number
    @type spanrep: bool

    @return: the checking hook
    @rtype: C{(text, msg, cat) -> spans} or C{(text, msg, cat) -> numerr}

    @raise PologyError: if Enchant is requested but no provider
        is given nor configured
    """

    # FIXME: It is said that no fancy word-splitting is done on the text,
    # but still, best to split it assuming plain text?
    # A word is a run of word characters other than digits and underscore.
    wsplit_rx = re.compile(r"[^\W\d_]+", re.U)
    def wsplit (text, msg, cat):
        word_spans = []
        for m in wsplit_rx.finditer(text):
            word, span = m.group(0), m.span()
            word_spans.append((word, span))
        # ...could have been a single comprehension, but may need expansion.
        return word_spans

    # Resolve provider.
    if provider != "aspell-raw":
        enchant_cfg = pology.config.section("enchant")
        if not provider:
            provider = enchant_cfg.string("provider")
            if not provider:
                raise PologyError(_("@info", "Enchant provider not set."))

    # Cache for constructed checkers.
    checkers = {}

    # The checker itself.
    def spcheck (text, msg, cat):

        # Check if new spell checker should be constructed.
        if lang is not None:
            clang = lang
        elif cat.language() is not None:
            clang = cat.language()
        elif provider != "aspell-raw":
            clang = enchant_cfg.string("language")
        else:
            clang = None
        if not clang:
            raise PologyError(
                _("@info",
                  "Cannot determine language for catalog '%(file)s'.",
                  file=cat.filename))
        if envs is not None:
            cenvs = envs
        elif cat.environment() is not None:
            cenvs = cat.environment()
        elif provider != "aspell-raw":
            envs_str = enchant_cfg.string("environment")
            cenvs = envs_str.split(",") if envs_str else []
        else:
            cenvs = []
        ckey = (clang, tuple(cenvs))
        if ckey not in checkers:
            if provider != "aspell-raw":
                checkers[ckey] = _construct_enchant(provider, clang, cenvs,
                                                    encoding, variety, suponly)
            else:
                checkers[ckey] = _construct_aspell(clang, cenvs, encoding,
                                                   variety, extopts, suponly)

        checker = checkers[ckey]

        # Prepare shortcut reports.
        if spanrep: defret = []
        else: defret = 0

        # Skip message if explicitly requested.
        if flag_no_check_spell in manc_parse_flag_list(msg, "|"):
            return defret

        # Split text into words and spans: [(word, (start, end)), ...]
        word_spans = wsplit(text, msg, cat)

        # Ignore words explicitly listed as good.
        ignored_words = set(manc_parse_list(msg, elist_well_spelled, ","))
        word_spans = [x for x in word_spans if x[0] not in ignored_words]

        spans = []
        for word, span in word_spans:
            encword = word.encode(encoding)
            if not checker.check(encword):
                encsuggs = checker.suggest(encword)
                # The limit is the maxsugg given to the factory; it must not
                # be assigned to in this function, or it would become a local
                # name and the caller's value would be lost.
                incmp = False
                if maxsugg > 0 and len(encsuggs) > maxsugg:
                    encsuggs = encsuggs[:maxsugg]
                    incmp = True
                suggs = [x.decode(encoding) for x in encsuggs]
                if maxsugg != 0 and suggs:
                    fmtsuggs = format_item_list(suggs, incmp=incmp)
                    snote = _("@info",
                              "Unknown word '%(word)s' "
                              "(suggestions: %(wordlist)s).",
                              word=word, wordlist=fmtsuggs)
                else:
                    snote = _("@info",
                              "Unknown word '%(word)s'.",
                              word=word)
                # Span format: (start, end, note).
                spans.append(span + (snote,))

        if spanrep:
            return spans
        else:
            for span in spans:
                if span[2:]:
                    report_on_msg(span[2], msg, cat)
            return len(spans)

    return spcheck
0241 
0242 
# Construct Aspell checker for given langenv.
def _construct_aspell (lang, envs, encoding, variety, extopts, suponly):
    """
    Construct the checker used by Aspell-based hooks.

    @param lang: language of spelling dictionary
    @param envs: environments for supplemental dictionaries
    @param encoding: encoding passed to Aspell, also used to encode
        the values of all options sent to it
    @param variety: variety of dictionary, if any
    @param extopts: additional Aspell options as (name, value) dictionary;
        they override the options composed here
    @param suponly: if true, Aspell is not used at all and a L{_QuasiSpell}
        reading only the supplemental dictionaries is returned

    @return: object providing C{check()} and C{suggest()} methods

    @raise PologyError: if Aspell cannot be configured or initialized,
        or if C{suponly} is requested but there are no supplemental
        dictionaries
    """

    # Get Pology's internal personal dictionary for this language.
    dictpath, temporary = _compose_personal_dict(lang, envs)

    try:
        if not suponly:
            # Prepare Aspell options.
            aopts = {}
            aopts["lang"] = lang
            aopts["encoding"] = encoding
            if variety:
                aopts["variety"] = variety
            if dictpath:
                aopts["personal-path"] = dictpath
            if extopts:
                aopts.update(extopts)

            aopts = dict((x, y.encode(encoding)) for x, y in aopts.items())

            # Create Aspell object.
            import pology.external.pyaspell as A
            try:
                checker = A.Aspell(list(aopts.items()))
            except A.AspellConfigError as e:
                raise PologyError(
                    _("@info",
                      "Aspell configuration error:\n%(msg)s",
                      msg=e))
            except A.AspellError as e:
                raise PologyError(
                    _("@info",
                      "Cannot initialize Aspell:\n%(msg)s",
                      msg=e))
        else:
            # Create simple internal checker that only checks against
            # internal supplemental dictionaries.
            if not dictpath:
                raise PologyError(
                    _("@info",
                      "No supplemental dictionaries found."))
            checker = _QuasiSpell(dictpath, encoding)
    finally:
        # Composited dictionary read by now, remove if temporary file.
        # Done in the finally clause so that the file does not stay behind
        # when construction of the checker fails.
        if temporary:
            os.unlink(dictpath)

    return checker
0291 
0292 
# Collect all personal dictionaries found for given language/environment
# and composite them into one file to pass to Aspell.
# Environment is given as a relative subpath into the language directory;
# a dictionary belongs to that environment if it is in the directory
# pointed by the subpath, or any of the parent directories.
# Return the path to composited file or None if there were no dictionaries,
# and whether the file is really a temporary composition or not.
def _compose_personal_dict (lang, envs):
    """
    Compose one personal dictionary out of all applicable C{*.aspell} files.

    @param lang: language code, names the directory under C{lang/}
        in Pology's data directory
    @param envs: list of environment subpaths; if empty or C{None},
        only the top C{spell/} directory is searched

    @return: C{(path, temporary)}, where C{path} is C{None} if no
        dictionary was found, the found file itself if there was exactly
        one (C{temporary} false), or a newly written temporary file
        which the caller should remove (C{temporary} true)

    @raise PologyError: if the temporary file cannot be written
    """

    # Collect all applicable dictionary files
    # (for a given environment, in its subdirectory and all above).
    dictpaths = set()
    spell_root = os.path.join(datadir(), "lang", lang, "spell")
    for env in (envs or [""]):
        spell_sub = os.path.join(".", env)
        while spell_sub:
            spell_dir = os.path.join(spell_root, spell_sub)
            if os.path.isdir(spell_dir):
                for item in os.listdir(spell_dir):
                    if item.endswith(".aspell"):
                        dictpaths.add(os.path.join(spell_dir, item))
            parent = os.path.dirname(spell_sub)
            # For an absolute subpath the parent of the root is the root
            # itself; stop there instead of looping forever.
            if parent == spell_sub:
                break
            spell_sub = parent
    dictpaths = sorted(dictpaths)

    if not dictpaths:
        return None, False

    # If only one dictionary found, Aspell can use it as-is.
    if len(dictpaths) == 1:
        return dictpaths[0], False

    # Composite all dictionary files into one temporary.
    words = []
    for dictpath in dictpaths:
        words.extend(_read_dict_file(dictpath))
    tmppath = None
    try:
        # mkstemp() creates and opens the file in one step, so the name
        # cannot be taken by someone else between creation and writing.
        fd, tmppath = tempfile.mkstemp()
        with os.fdopen(fd, "wb") as tmpf:
            lines = ["personal_ws-1.1 %s %d UTF-8\n" % (lang, len(words))]
            lines.extend(x + "\n" for x in words)
            tmpf.write("".join(lines).encode("UTF-8"))
    except Exception as e:
        # Do not leave a partially written file behind.
        if tmppath is not None:
            try:
                os.unlink(tmppath)
            except OSError:
                # Best effort only, the original problem is reported below.
                pass
        raise PologyError(
            _("@info",
              "Cannot create composited spelling dictionary "
              "in current working directory:\n%(msg)s",
              msg=e))

    return tmppath, True
0344 
0345 
0346 # Read words from Aspell personal dictionary.
0347 def _read_dict_file (filepath):
0348 
0349     # Parse the header for encoding.
0350     enc_def = "UTF-8"
0351     file = codecs.open(filepath, "r", enc_def)
0352     header = file.readline()
0353     m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
0354     if not m:
0355         raise PologyError(
0356             _("@info",
0357               "Malformed header in dictionary file '%(file)s'.",
0358               file=filepath))
0359     enc = m.group(4)
0360     # Reopen in correct encoding if not the default.
0361     if enc.lower() != enc_def.lower():
0362         file.close()
0363         file = codecs.open(filepath, "r", enc)
0364 
0365     # Read words.
0366     words = []
0367     for line in file:
0368         word = line.strip()
0369         if word:
0370             words.append(word)
0371     return words
0372 
0373 
# Simple spell checker which reads Aspell's personal dictionary file.
class _QuasiSpell (object):
    """
    Checker which knows only the words of one personal dictionary file.

    It provides the C{check()} and C{suggest()} methods that
    the checking hook calls on a checker, and never offers suggestions.
    """

    def __init__ (self, dictpath, enc="UTF-8"):
        """
        @param dictpath: path to the personal dictionary file
        @param enc: encoding of the raw words sent in for checking
        """

        # Kept as a set, since only membership is ever tested.
        self._words = set(_read_dict_file(dictpath))
        self._enc = enc # of the raw text sent in for checking


    def check (self, encword):
        """
        Check whether the word is in the dictionary.

        The word is accepted if it is found either as given
        or when lowercased.

        @param encword: the word, as bytes in the encoding given
            at construction, or as already decoded text
        @return: whether the word is known
        @rtype: bool
        """

        # Decode through the method of the value itself; going through
        # str.decode() fails where str is the text type.
        if isinstance(encword, bytes):
            word = encword.decode(self._enc)
        else:
            word = encword
        return (   word in self._words
                or word.lower() in self._words)


    def suggest (self, encword):
        """
        Return suggestions for the word, which is always an empty list.
        """

        return []
0393 
0394 
def check_spell_ec (provider=None, lang=None, encoding="UTF-8", variety=None,
                    envs=None, suponly=False, maxsugg=5):
    """
    Check spelling using Enchant [hook factory].

    The Enchant provider is given by the C{provider} parameter; when it
    is not given, it is looked up in the C{[enchant]/provider} user
    configuration field. The spelling dictionary is selected by the
    C{lang} parameter, which should be a language code of one of
    the installed spelling dictionaries, and a non-default variety of
    the dictionary can be requested by the C{variety} parameter.
    The C{encoding} parameter provides the text encoding used
    by the dictionary.

    Pology may contain internal supplemental dictionaries for the selected
    language in the C{lang/<lang>/spell/} directory, which are picked up
    automatically. Subdirectories of C{lang/<lang>/spell/} hold
    supplemental dictionaries for special "environments" (e.g. jargon,
    certain projects, etc.) and are not included by default.
    They are included through the C{envs} parameter, a list of paths
    relative to C{lang/<lang>/spell/}. For each such path, dictionaries
    are taken from it and from all its parent directories up to
    C{lang/<lang>/spell/}; environments are thus hierarchical, e.g.
    requesting C{foo/bar} also brings in the supplements of C{foo}.

    If C{lang} is C{None}, the language is taken from the catalog
    of the message (see catalog L{language()<catalog.Catalog.language>}
    method), and if that does not succeed either, from
    the C{[enchant]/language} user configuration field.
    Likewise, if C{envs} is C{None}, environments are taken from
    the catalog (see catalog
    L{environment()<catalog.Catalog.environment>} method),
    and then from the C{[enchant]/environment} field.

    With the C{suponly} parameter set, the system dictionary of
    the provider is left out entirely, and only the internal supplemental
    dictionaries are used for checking.

    Misspelled words are reported on the message
    (see L{report_on_msg()<msgreport.report_on_msg>}),
    with suggestions if available. The C{maxsugg} parameter selects
    the maximum number of suggestions to display;
    if negative, all suggestions are shown.

    The text is split into words internally, and the provider is queried
    word by word. Splitting is simple, assuming that the text has already
    been filtered down to plain text, e.g. that any XML-like markup
    and other literals have been removed
    (see L{pology.remove} for filtering possibilities).

    A message is skipped entirely if it has
    the C{no-check-spell} L{sieve flag<sieve.parse_sieve_flags>}.
    Single words can be declared well spelled by a manual comment
    which starts with C{well-spelled:} and continues with
    comma-separated list of words. Example::

        # |, no-check-spell
        msgid "Aaaargh, gahhh, khh..."
        msgstr ""

        # well-spelled: Aaaargh, kh
        msgid "Aaaargh, kh, kh... I have been defeated...!"
        msgstr ""

    @param provider: the spell-checking provider to use
    @type provider: string
    @param lang: language of spelling dictionary
    @type lang: string
    @param encoding: encoding used by the dictionary
    @type encoding: string
    @param variety: variety of dictionary
    @type variety: string
    @param envs: environments for supplemental dictionaries
    @type envs: list of strings
    @param suponly: whether to use only supplemental dictionaries
    @type suponly: bool
    @param maxsugg: maximum number of suggestions to show for misspelled word
    @type maxsugg: int

    @return: type S3A hook
    @rtype: C{(text, msg, cat) -> numerr}
    """

    # There are no extra backend options on this path, hence the empty
    # dictionary; the final False asks for reporting instead of spans.
    return _check_spell_w(provider, lang, encoding, variety, {},
                          envs, suponly, maxsugg, False)
0480 
0481 
def check_spell_ec_sp (provider=None, lang=None, encoding="UTF-8", variety=None,
                       envs=None, suponly=False, maxsugg=5):
    """
    Span-returning variant of L{check_spell_ec} [hook factory].

    Takes the same parameters, but the produced hook hands back
    the erroneous spans rather than reporting the problems itself.

    @return: type V3A hook
    @rtype: C{(text, msg, cat) -> spans}
    """

    # There are no extra backend options on this path, hence the empty
    # dictionary; the final True asks for spans instead of reporting.
    return _check_spell_w(provider, lang, encoding, variety, {},
                          envs, suponly, maxsugg, True)
0495 
0496 
# Construct Enchant checker for given langenv.
def _construct_enchant (provider, lang, envs, encoding, variety, suponly):
    """
    Construct the checker used by Enchant-based hooks.

    @param provider: Enchant provider to put first for the language
    @param lang: language of spelling dictionary
    @param envs: environments for supplemental dictionaries
    @param encoding: encoding of words sent to the checker
        (used only by the supplemental-only checker)
    @param variety: variety of dictionary; if Enchant has a dictionary
        under this name it is preferred to that of C{lang}
    @param suponly: if true, Enchant is not used at all and
        a L{_QuasiSpell} reading only the supplemental dictionaries
        is returned

    @return: object providing C{check()} and C{suggest()} methods

    @raise PologyError: if Enchant is not available, cannot be set up for
        the given provider and language, or if C{suponly} is requested
        but there are no supplemental dictionaries
    """

    # Get Pology's internal personal dictionary for this language.
    dictpath, temporary = _compose_personal_dict(lang, envs)

    try:
        if not suponly:
            try:
                import enchant
            except ImportError:
                pkgs = ["python-enchant"]
                raise PologyError(_("@info",
                                    "Python wrapper for Enchant not found, "
                                    "please install it (possible package names: "
                                    "%(pkglist)s).",
                                    pkglist=format_item_list(pkgs)))

            # Create Enchant broker.
            try:
                broker = enchant.Broker()
            except Exception as e:
                raise PologyError(
                    _("@info",
                      "Cannot initialize Enchant:\n%(msg)s",
                      msg=e))

            # Find Enchant language.
            # Variety is tried first, then the plain language; candidates
            # which are not set are skipped instead of asking the broker
            # about a None tag.
            e_langs = [x for x in (variety, lang)
                       if x and broker.dict_exists(x)]
            if e_langs:
                e_lang = e_langs[0]
            else:
                if variety is not None:
                    raise PologyError(
                        _("@info",
                          "Language '%(lang)s' and variety '%(var)s' "
                          "not known to Enchant.",
                          lang=lang, var=variety))
                else:
                    raise PologyError(
                        _("@info",
                          "Language '%(lang)s' not known to Enchant.",
                          lang=lang))

            # Choose the provider for the selected language.
            try:
                broker.set_ordering((e_lang or "*"), provider)
            except Exception as e:
                raise PologyError(
                    _("@info",
                      "Cannot configure Enchant for provider '%(pvd)s':\n%(msg)s",
                      pvd=provider, msg=e))

            # Create checker and test functionality.
            try:
                if dictpath is None:
                    checker = enchant.Dict(e_lang, broker)
                else:
                    checker = enchant.DictWithPWL(e_lang, dictpath, None, broker)
                checker.check(".")
            except Exception:
                # Not a bare except, so that keyboard interrupt and
                # interpreter exit are not turned into a Pology error.
                raise PologyError(
                    _("@info",
                      "Enchant test check for language '%(lang)s' failed.",
                      lang=e_lang))
        else:
            # Create simple internal checker that only checks against
            # internal supplemental dictionaries.
            if not dictpath:
                raise PologyError(
                    _("@info",
                      "No supplemental dictionaries found."))
            checker = _QuasiSpell(dictpath, encoding)
    finally:
        # Composited dictionary read by now, remove if temporary file.
        # Done in the finally clause so that the file does not stay behind
        # when construction of the checker fails.
        if temporary:
            os.unlink(dictpath)

    return checker
0575 
0576