# -*- coding: UTF-8 -*-
"""
Check spelling in text using different spell checkers.
@author: Chusslove Illich (Часлав Илић) <>
@author: Javier Vinal (Javier Viñal) <>
@license: GPLv3
"""
import os
import codecs
import re
import tempfile

from pology import PologyError, datadir, _, n_
from pology.comments import manc_parse_flag_list, manc_parse_list
import pology.config
from pology.msgreport import report_on_msg
from pology.report import warning, format_item_list
# Pipe flag to manually prevent spellcheck for a particular message.
# It is looked up among the "|"-flags of the message with
# manc_parse_flag_list(); when present, no word of that message is checked.
flag_no_check_spell = "no-check-spell"
# Prefix of the embedded list of words manually declared valid for
# a particular message. The comma-separated words following the prefix
# are collected with manc_parse_list() and left out of the check.
elist_well_spelled = "well-spelled:"
def check_spell (lang=None, encoding="UTF-8", variety=None, extopts={},
                 envs=None, suponly=False, maxsugg=5):
    """
    Check spelling using Aspell [hook factory].

    The Aspell dictionary is chosen by the C{lang} parameter, which must be
    the language code of one of the installed spelling dictionaries.
    The C{encoding} parameter names the text encoding used by that
    dictionary, and the C{variety} parameter selects a non-default variety
    when the dictionary comes in several.
    Further options from the set of Aspell configuration fields can be
    supplied as a (name, value) dictionary through the C{extopts} parameter.

    Pology may contain internal supplemental dictionaries for the selected
    language in the C{lang/<lang>/spell/} directory, and these are picked up
    automatically. Subdirectories of C{lang/<lang>/spell/} are taken to hold
    supplemental dictionaries for special "environments"
    (e.g. jargon, certain projects, etc.), which are not included by default.
    Environments are requested through the C{envs} parameter, a list of
    paths relative to the C{lang/<lang>/spell/} directory.
    Supplemental dictionaries are hierarchical: all dictionaries in
    a requested path are used, together with those in each of its parent
    directories up to C{lang/<lang>/spell/} (e.g. environment C{foo/bar} is
    a child of C{foo}, so requesting C{foo/bar} also brings in
    the supplements of C{foo}).

    If C{lang} is C{None}, the language is detected from the catalog of
    the message (see catalog L{language()<catalog.Catalog.language>} method).
    Likewise, if C{envs} is C{None}, environments are taken from the catalog
    (see catalog L{environment()<catalog.Catalog.environment>} method).

    Aspell's system dictionary can be excluded from the check altogether
    by the C{suponly} parameter, in which case only the internal
    supplemental dictionaries are used.

    Misspelled words are reported to stdout, with suggestions if available.
    Maximum number of suggestions to display is selected by the C{maxsugg}
    parameter; if negative, all suggestions are shown.

    The text is split into words internally and Aspell is queried word
    by word. Splitting is done in a simple fashion; the text is expected to
    have been filtered down to plain text beforehand, e.g. with any XML-like
    markup and other literals removed
    (see L{pology.remove} for filtering possibilities).

    Spell checking can be skipped entirely on a message by issuing
    the C{no-check-spell} L{sieve flag<sieve.parse_sieve_flags>}.
    Alternatively, only certain words may be declared well spelled
    by adding a manual comment starting with C{well-spelled:}
    and followed by comma-separated list of words. Example::

        # |, no-check-spell
        msgid "Aaaargh, gahhh, khh..."
        msgstr ""

        # well-spelled: Aaaargh, kh
        msgid "Aaaargh, kh, kh... I have been defeated...!"
        msgstr ""

    @param lang: language of spelling dictionary
    @type lang: string
    @param encoding: encoding used by the dictionary
    @type encoding: string
    @param variety: variety of dictionary
    @type variety: string
    @param extopts: additional options to send to Aspell
    @type extopts: dict
    @param envs: environments for supplemental dictionaries
    @type envs: list of strings
    @param suponly: whether to use only supplemental dictionaries
    @type suponly: bool
    @param maxsugg: maximum number of suggestions to show for misspelled word
    @type maxsugg: int

    @return: type S3A hook
    @rtype: C{(text, msg, cat) -> numerr}
    """

    # "aspell-raw" makes the worker talk to Aspell directly;
    # the trailing False selects reporting instead of returning spans.
    return _check_spell_w(provider="aspell-raw", lang=lang,
                          encoding=encoding, variety=variety,
                          extopts=extopts, envs=envs, suponly=suponly,
                          maxsugg=maxsugg, spanrep=False)
def check_spell_sp (lang=None, encoding="UTF-8", variety=None, extopts={},
                    envs=None, suponly=False, maxsugg=5):
    """
    Span-returning counterpart of L{check_spell} [hook factory].

    Takes the same parameters as L{check_spell}, but the created hook
    returns the erroneous spans instead of reporting problems to stdout.

    @return: type V3A hook
    @rtype: C{(text, msg, cat) -> spans}
    """

    # Same Aspell-backed worker as check_spell, with span reporting enabled.
    return _check_spell_w(provider="aspell-raw", lang=lang,
                          encoding=encoding, variety=variety,
                          extopts=extopts, envs=envs, suponly=suponly,
                          maxsugg=maxsugg, spanrep=True)
def _check_spell_w (provider, lang, encoding, variety, extopts,
                    envs, suponly, maxsugg, spanrep):
    """
    Worker for C{check_spell*} hook factories.

    Builds and returns the actual checking hook. Spell checkers are
    constructed lazily, on the first message that needs them, and cached
    per (language, environments) combination.

    @param provider: C{"aspell-raw"} to query Aspell directly; any other
        value is taken as Enchant provider name, and if empty
        the C{[enchant]/provider} configuration field is consulted
    @type provider: string
    @param lang: language of spelling dictionary; if C{None}, the catalog
        language is used, then (Enchant only) C{[enchant]/language}
    @type lang: string
    @param encoding: encoding used to pass words to the checker
    @type encoding: string
    @param variety: variety of dictionary
    @type variety: string
    @param extopts: additional options to send to Aspell
    @type extopts: dict
    @param envs: environments for supplemental dictionaries; if C{None},
        the catalog environment is used, then (Enchant only)
        the comma-separated C{[enchant]/environment} field
    @type envs: list of strings
    @param suponly: whether to use only supplemental dictionaries
    @type suponly: bool
    @param maxsugg: maximum number of suggestions to show for misspelled
        word; zero shows none, negative shows all
    @type maxsugg: int
    @param spanrep: whether the hook returns spans (C{True}) or reports
        problems and returns their number (C{False})
    @type spanrep: bool

    @return: hook C{(text, msg, cat)}
    @raise PologyError: if an Enchant provider is needed but not set
        (the hook itself raises it when the language cannot be determined)
    """

    # FIXME: It is said that no fancy word-splitting is done on the text,
    # but still, best to split it assuming plain text?
    # A word is a run of characters which are neither non-word characters,
    # nor digits, nor underscores.
    wsplit_rx = re.compile(r"[^\W\d_]+", re.U)

    def wsplit (text, msg, cat):
        word_spans = []
        for m in wsplit_rx.finditer(text):
            word, span = m.group(0), m.span()
            word_spans.append((word, span))
        # ...could have been a single comprehension, but may need expansion.
        return word_spans

    # Resolve provider.
    if provider != "aspell-raw":
        enchant_cfg = pology.config.section("enchant")
        if not provider:
            provider = enchant_cfg.string("provider")
            if not provider:
                raise PologyError(_("@info", "Enchant provider not set."))

    # Cache for constructed checkers.
    checkers = {}

    # The checker itself.
    def spcheck (text, msg, cat):

        # Determine the language: explicit parameter, then catalog,
        # then (Enchant only) user configuration.
        clang = lang
        if clang is None:
            clang = cat.language()
        if clang is None and provider != "aspell-raw":
            clang = enchant_cfg.string("language")
        if not clang:
            raise PologyError(
                _("@info",
                  "Cannot determine language for catalog '%(file)s'.",
                  file=cat.filename))

        # Determine the environments in the same order of precedence.
        cenvs = envs
        if cenvs is None:
            cenvs = cat.environment()
        if cenvs is None:
            if provider != "aspell-raw":
                envs_str = enchant_cfg.string("environment")
                cenvs = envs_str.split(",") if envs_str else []
            else:
                cenvs = []

        # Check if new spell checker should be constructed.
        ckey = (clang, tuple(cenvs))
        if ckey not in checkers:
            if provider != "aspell-raw":
                checkers[ckey] = _construct_enchant(provider, clang, cenvs,
                                                    encoding, variety, suponly)
            else:
                checkers[ckey] = _construct_aspell(clang, cenvs, encoding,
                                                   variety, extopts, suponly)
        checker = checkers[ckey]

        # Prepare shortcut reports.
        if spanrep: defret = []
        else: defret = 0

        # Skip message if explicitly requested.
        if flag_no_check_spell in manc_parse_flag_list(msg, "|"):
            return defret

        # Split text into words and spans: [(word, (start, end)), ...]
        word_spans = wsplit(text, msg, cat)

        # Ignore words explicitly listed as good.
        ignored_words = set(manc_parse_list(msg, elist_well_spelled, ","))
        word_spans = [x for x in word_spans if x[0] not in ignored_words]

        spans = []
        for word, span in word_spans:
            # NOTE(review): the checker is fed the word encoded to
            # `encoding` and its suggestions are decoded back from it;
            # confirm this is what the Aspell/Enchant bindings in use expect.
            encword = word.encode(encoding)
            if not checker.check(encword):
                encsuggs = checker.suggest(encword)
                # Honor the maxsugg given to the factory: positive limits
                # the list, zero suppresses it, negative shows everything.
                incmp = False
                if maxsugg > 0 and len(encsuggs) > maxsugg:
                    encsuggs = encsuggs[:maxsugg]
                    incmp = True
                suggs = [x.decode(encoding) for x in encsuggs]
                if maxsugg != 0 and suggs:
                    fmtsuggs = format_item_list(suggs, incmp=incmp)
                    snote = _("@info",
                              "Unknown word '%(word)s' "
                              "(suggestions: %(wordlist)s).",
                              word=word, wordlist=fmtsuggs)
                else:
                    snote = _("@info",
                              "Unknown word '%(word)s'.",
                              word=word)
                # Span is (start, end, note).
                spans.append(span + (snote,))

        if spanrep:
            return spans
        else:
            for span in spans:
                if span[2:]:
                    report_on_msg(span[2], msg, cat)
            return len(spans)

    return spcheck
# Construct Aspell checker for given langenv.
def _construct_aspell (lang, envs, encoding, variety, extopts, suponly):
    """
    Create the checker object used for the Aspell-based hooks.

    @param lang: language of spelling dictionary
    @param envs: environments for supplemental dictionaries
    @param encoding: encoding passed to Aspell, also used to encode
        the option values sent to it
    @param variety: variety of dictionary, if any
    @param extopts: additional Aspell options, applied last so that
        they override the ones derived from other parameters
    @param suponly: if true, no Aspell object is created and only
        the internal supplemental dictionaries are checked against

    @return: object providing C{check(encword)} and C{suggest(encword)}
    @raise PologyError: on Aspell configuration or initialization error,
        or if C{suponly} is set but no supplemental dictionary exists
    """

    # Get Pology's internal personal dictionary for this language.
    dictpath, temporary = _compose_personal_dict(lang, envs)

    try:
        if not suponly:
            # Prepare Aspell options.
            aopts = {}
            aopts["lang"] = lang
            aopts["encoding"] = encoding
            if variety:
                aopts["variety"] = variety
            if dictpath:
                aopts["personal-path"] = dictpath
            if extopts:
                aopts.update(extopts)

            aopts = dict((x, y.encode(encoding)) for x, y in aopts.items())

            # Create Aspell object.
            import pology.external.pyaspell as A
            try:
                checker = A.Aspell(list(aopts.items()))
            except A.AspellConfigError as e:
                raise PologyError(
                    _("@info",
                      "Aspell configuration error:\n%(msg)s",
                      msg=e))
            except A.AspellError as e:
                raise PologyError(
                    _("@info",
                      "Cannot initialize Aspell:\n%(msg)s",
                      msg=e))
        else:
            # Create simple internal checker that only checks against
            # internal supplemental dictionaries.
            if not dictpath:
                raise PologyError(
                    _("@info",
                      "No supplemental dictionaries found."))
            checker = _QuasiSpell(dictpath, encoding)
    finally:
        # Composited dictionary is either read by now or construction
        # failed; in both cases remove it if it is a temporary file,
        # so that errors do not leave stray files behind.
        if temporary:
            os.unlink(dictpath)

    return checker
# Collect all personal dictionaries found for given language/environment
# and composit them into one file to pass to Aspell.
# Environment is given as a relative subpath into the language directory;
# a dictionary belongs to that environment if it is in the directory
# pointed by the subpath, or any of the parent directories.
# Return the path to composited file or None if there were no dictionaries,
# and whether the file is really a temporary composition or not.
def _compose_personal_dict (lang, envs):
    """
    Find the supplemental C{*.aspell} dictionaries and merge them.

    @param lang: language code, naming the C{lang/<lang>/spell/} directory
        under the Pology data directory
    @param envs: environment subpaths; if empty or C{None} only
        the top spelling directory is searched

    @return: C{(path, temporary)}: C{(None, False)} if nothing was found,
        C{(path, False)} for a single dictionary used as-is, and
        C{(path, True)} for a freshly written composite file which
        the caller must remove when done with it
    @raise PologyError: if the composite file cannot be written, or if one
        of the dictionaries has a malformed header
    """

    # Collect all applicable dictionary files
    # (for a given environment, in its subdirectory and all above).
    dictpaths = set()
    spell_root = os.path.join(datadir(), "lang", lang, "spell")
    for env in (envs or [""]):
        spell_sub = os.path.join(".", env)
        while spell_sub:
            spell_dir = os.path.join(spell_root, spell_sub)
            if os.path.isdir(spell_dir):
                for item in os.listdir(spell_dir):
                    if item.endswith(".aspell"):
                        dictpaths.add(os.path.join(spell_dir, item))
            parent_sub = os.path.dirname(spell_sub)
            if parent_sub == spell_sub:
                # Reached a filesystem root (absolute environment path);
                # dirname() would keep returning it forever.
                break
            spell_sub = parent_sub
    dictpaths = sorted(dictpaths)

    if not dictpaths:
        return None, False

    # If only one dictionary found, Aspell can use it as-is.
    if len(dictpaths) == 1:
        return dictpaths[0], False

    # Composit all dictionary files into one temporary.
    words = []
    for dictpath in dictpaths:
        words.extend(_read_dict_file(dictpath))
    # mkstemp() creates the file atomically and leaves it in place,
    # so the name cannot be taken by someone else before it is written to.
    tmpfd, tmppath = tempfile.mkstemp()
    os.close(tmpfd)
    try:
        tmpf = codecs.open(tmppath, "w", "UTF-8")
        try:
            tmpf.write("personal_ws-1.1 %s %d UTF-8\n" % (lang, len(words)))
            tmpf.writelines([x + "\n" for x in words])
        finally:
            tmpf.close()
    except Exception as e:
        # Do not leave a half-written file behind.
        try:
            os.unlink(tmppath)
        except OSError:
            pass
        raise PologyError(
            _("@info",
              "Cannot create composited spelling dictionary "
              "in current working directory:\n%(msg)s",
              msg=e))

    return tmppath, True
# Read words from Aspell personal dictionary.
def _read_dict_file (filepath):
    """
    Return the list of words stored in an Aspell personal dictionary file.

    The first line of the file is a header of four whitespace-separated
    fields, the last of which names the encoding of the file.
    Every following non-blank line holds one word; surrounding whitespace
    is stripped. The header itself is never returned as a word.

    @param filepath: path to the dictionary file
    @type filepath: string

    @return: words in order of appearance
    @rtype: list of strings
    @raise PologyError: if the header line is malformed
    """

    # Read raw bytes and decode afterwards: the encoding is known only
    # once the header has been parsed, and the words may not be decodable
    # in the encoding assumed for the header.
    with open(filepath, "rb") as dictfile:
        raw_header = dictfile.readline()
        raw_body = dictfile.read()

    # Parse the header for encoding.
    enc_def = "UTF-8"
    header = raw_header.decode(enc_def, "replace")
    m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
    if not m:
        raise PologyError(
            _("@info",
              "Malformed header in dictionary file '%(file)s'.",
              file=filepath))
    enc = m.group(4)

    # Read words.
    words = []
    for line in raw_body.decode(enc).splitlines():
        word = line.strip()
        if word:
            words.append(word)
    return words
# Simple spell checker which reads Aspell's personal dictionary file.
class _QuasiSpell (object):
    """
    Minimal stand-in for a spell checker, backed only by the word list
    of one Aspell personal dictionary file.

    It offers the C{check} and C{suggest} methods used by the checking
    hooks, but never produces suggestions.
    """

    def __init__ (self, dictpath, enc="UTF-8"):
        """
        @param dictpath: path to the Aspell personal dictionary file
        @type dictpath: string
        @param enc: encoding of the raw words sent in for checking
        @type enc: string
        """

        # Only membership is ever tested, so keep the words in a set.
        self._words = frozenset(_read_dict_file(dictpath))
        self._enc = enc # of the raw text sent in for checking


    def check (self, encword):
        """
        Tell whether the word, or its lowercase form, is in the dictionary.

        @param encword: word to check, either as raw bytes in the encoding
            given at construction or as already decoded text
        @return: C{True} if the word is known
        @rtype: bool
        """

        # Decode through the bytes object itself; str has no decode()
        # in Python 3, and already decoded text needs no decoding.
        if isinstance(encword, bytes):
            word = encword.decode(self._enc)
        else:
            word = encword
        return (   word in self._words
                or word.lower() in self._words)


    def suggest (self, encword):
        """
        Return suggestions for the word, which is always an empty list.
        """

        return []
def check_spell_ec (provider=None, lang=None, encoding="UTF-8", variety=None,
                    envs=None, suponly=False, maxsugg=5):
    """
    Check spelling using Enchant [hook factory].

    The Enchant provider is selected by the C{provider} parameter and
    the dictionary by the C{lang} parameter, which should be the language
    code of one of the installed spelling dictionaries. The C{encoding}
    parameter names the text encoding used by the dictionary, and
    the C{variety} parameter selects a non-default variety when
    the dictionary comes in several.
    If C{provider} is not given, it will be attempted to fetch it from
    C{[enchant]/provider} user configuration field.

    Pology may contain internal supplemental dictionaries for the selected
    language in the C{lang/<lang>/spell/} directory, and these are picked up
    automatically. Subdirectories of C{lang/<lang>/spell/} are taken to hold
    supplemental dictionaries for special "environments"
    (e.g. jargon, certain projects, etc.), which are not included by default.
    Environments are requested through the C{envs} parameter, a list of
    paths relative to the C{lang/<lang>/spell/} directory.
    Supplemental dictionaries are hierarchical: all dictionaries in
    a requested path are used, together with those in each of its parent
    directories up to C{lang/<lang>/spell/} (e.g. environment C{foo/bar} is
    a child of C{foo}, so requesting C{foo/bar} also brings in
    the supplements of C{foo}).

    If C{lang} is C{None}, the language is detected from the catalog of
    the message (see catalog L{language()<catalog.Catalog.language>} method).
    Likewise, if C{envs} is C{None}, environments are taken from the catalog
    (see catalog L{environment()<catalog.Catalog.environment>} method).
    If automatic detection of language does not succeed, finally
    C{[enchant]/language} user configuration field is consulted;
    for environments, C{[enchant]/environment} field is consulted.

    Provider's system dictionary can be excluded from the check altogether
    by the C{suponly} parameter, in which case only the internal
    supplemental dictionaries are used.

    Misspelled words are reported to stdout, with suggestions if available.
    Maximum number of suggestions to display is selected by the C{maxsugg}
    parameter; if negative, all suggestions are shown.

    The text is split into words internally and the provider is queried
    word by word. Splitting is done in a simple fashion; the text is
    expected to have been filtered down to plain text beforehand, e.g. with
    any XML-like markup and other literals removed
    (see L{pology.remove} for filtering possibilities).

    Spell checking can be skipped entirely on a message by issuing
    the C{no-check-spell} L{sieve flag<sieve.parse_sieve_flags>}.
    Alternatively, only certain words may be declared well spelled
    by adding a manual comment starting with C{well-spelled:}
    and followed by comma-separated list of words. Example::

        # |, no-check-spell
        msgid "Aaaargh, gahhh, khh..."
        msgstr ""

        # well-spelled: Aaaargh, kh
        msgid "Aaaargh, kh, kh... I have been defeated...!"
        msgstr ""

    @param provider: the spell-checking provider to use
    @type provider: string
    @param lang: language of spelling dictionary
    @type lang: string
    @param encoding: encoding used by the dictionary
    @type encoding: string
    @param variety: variety of dictionary
    @type variety: string
    @param envs: environments for supplemental dictionaries
    @type envs: list of strings
    @param suponly: whether to use only supplemental dictionaries
    @type suponly: bool
    @param maxsugg: maximum number of suggestions to show for misspelled word
    @type maxsugg: int

    @return: type S3A hook
    @rtype: C{(text, msg, cat) -> numerr}
    """

    # There are no extra options to pass on for Enchant, hence the empty
    # dictionary; the trailing False selects reporting over spans.
    return _check_spell_w(provider=provider, lang=lang, encoding=encoding,
                          variety=variety, extopts={}, envs=envs,
                          suponly=suponly, maxsugg=maxsugg, spanrep=False)
def check_spell_ec_sp (provider=None, lang=None, encoding="UTF-8", variety=None,
                       envs=None, suponly=False, maxsugg=5):
    """
    Span-returning counterpart of L{check_spell_ec} [hook factory].

    Takes the same parameters as L{check_spell_ec}, but the created hook
    returns the erroneous spans instead of reporting problems to stdout.

    @return: type V3A hook
    @rtype: C{(text, msg, cat) -> spans}
    """

    # Same Enchant-backed worker as check_spell_ec, with span reporting
    # enabled and no extra options.
    return _check_spell_w(provider=provider, lang=lang, encoding=encoding,
                          variety=variety, extopts={}, envs=envs,
                          suponly=suponly, maxsugg=maxsugg, spanrep=True)
# Construct Enchant checker for given langenv.
def _construct_enchant (provider, lang, envs, encoding, variety, suponly):
    """
    Create the checker object used for the Enchant-based hooks.

    @param provider: name of the Enchant provider to prefer
    @param lang: language of spelling dictionary
    @param envs: environments for supplemental dictionaries
    @param encoding: encoding of raw words, used only by the internal
        checker created when C{suponly} is set
    @param variety: variety of dictionary; when Enchant knows it,
        it takes precedence over C{lang}
    @param suponly: if true, Enchant is not used at all and only
        the internal supplemental dictionaries are checked against

    @return: object providing C{check(word)} and C{suggest(word)}
    @raise PologyError: if Enchant is missing or cannot be set up for
        the requested language and provider, or if C{suponly} is set
        but no supplemental dictionary exists
    """

    # Get Pology's internal personal dictionary for this language.
    dictpath, temporary = _compose_personal_dict(lang, envs)

    try:
        if not suponly:
            try:
                import enchant
            except ImportError:
                pkgs = ["python-enchant"]
                raise PologyError(_("@info",
                                    "Python wrapper for Enchant not found, "
                                    "please install it (possible package names: "
                                    "%(pkglist)s).",
                                    pkglist=format_item_list(pkgs)))

            # Create Enchant broker.
            try:
                broker = enchant.Broker()
            except Exception as e:
                raise PologyError(
                    _("@info",
                      "Cannot initialize Enchant:\n%(msg)s",
                      msg=e))

            # Find Enchant language.
            # An unset variety cannot name a dictionary, so it is not
            # even put to the broker (NOTE(review): some binding versions
            # presumably reject a None tag -- confirm).
            e_langs = [x for x in (variety, lang)
                       if x and broker.dict_exists(x)]
            if e_langs:
                e_lang = e_langs[0]
            else:
                if variety is not None:
                    raise PologyError(
                        _("@info",
                          "Language '%(lang)s' and variety '%(var)s' "
                          "not known to Enchant.",
                          lang=lang, var=variety))
                else:
                    raise PologyError(
                        _("@info",
                          "Language '%(lang)s' not known to Enchant.",
                          lang=lang))

            # Choose the provider for the selected language.
            try:
                broker.set_ordering((e_lang or "*"), provider)
            except Exception as e:
                raise PologyError(
                    _("@info",
                      "Cannot configure Enchant for provider '%(pvd)s':\n%(msg)s",
                      pvd=provider, msg=e))

            # Create checker and test functionality.
            try:
                if dictpath is None:
                    checker = enchant.Dict(e_lang, broker)
                else:
                    checker = enchant.DictWithPWL(e_lang, dictpath, None, broker)
                checker.check(".")
            except Exception:
                # Not a bare except, so that keyboard interrupt and
                # interpreter exit are not turned into a checker failure.
                raise PologyError(
                    _("@info",
                      "Enchant test check for language '%(lang)s' failed.",
                      lang=e_lang))
        else:
            # Create simple internal checker that only checks against
            # internal supplemental dictionaries.
            if not dictpath:
                raise PologyError(
                    _("@info",
                      "No supplemental dictionaries found."))
            checker = _QuasiSpell(dictpath, encoding)
    finally:
        # Composited dictionary is either read by now or construction
        # failed; in both cases remove it if it is a temporary file,
        # so that errors do not leave stray files behind.
        if temporary:
            os.unlink(dictpath)

    return checker