File indexing completed on 2024-03-24 05:47:40
# -*- coding: UTF-8 -*-

"""
Check spelling in text using different spell checkers.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@author: Javier Vinal (Javier Viñal) <fjvinal@gmail.com>
@license: GPLv3
"""

import os
import codecs
import re
import tempfile

from pology import PologyError, datadir, _, n_
from pology.comments import manc_parse_flag_list, manc_parse_list
import pology.config
from pology.msgreport import report_on_msg
from pology.report import warning, format_item_list


# Pipe flag to manually prevent spellcheck for a particular message.
flag_no_check_spell = "no-check-spell"

# Embedded list of words manually declared valid for a particular message.
elist_well_spelled = "well-spelled:"


def check_spell (lang=None, encoding="UTF-8", variety=None, extopts=None,
                 envs=None, suponly=False, maxsugg=5):
    """
    Check spelling using Aspell [hook factory].

    Aspell language is selected by the C{lang} parameter, which should be
    a language code of one of the installed spelling dictionaries.
    Text encoding used by the dictionary is provided by the C{encoding}
    parameter. If the dictionary comes in several varieties, a non-default
    one is selected using the C{variety} parameter.
    Any additional options from the set of Aspell configuration fields can
    be passed in as (name, value) dictionary by the C{extopts} parameter.

    Pology may contain internal supplemental dictionaries for selected
    language in C{lang/<lang>/spell/} directory, and these are automatically
    picked up. Any subdirectories in C{lang/<lang>/spell/} are considered
    as to contain supplemental dictionaries in special "environments"
    (e.g. jargon, certain projects, etc.), and are not included by default.
    Such environments can be included by the C{envs} parameter, which
    is a list of relative paths added to C{lang/<lang>/spell/} directory.
    All supplemental dictionaries from such paths are included, as well as
    from all their parent directories up to C{lang/<lang>/spell/}
    (this makes supplemental dictionaries hierarchical, e.g.
    environment C{foo/bar} is a child of C{foo}, and thus when C{foo/bar}
    is requested, both its and supplements of C{foo} are used).

    If C{lang} is C{None}, then automatic detection of the language based
    on the catalog of the message is attempted
    (see catalog L{language()<catalog.Catalog.language>} method).
    Similar is attempted for environments if C{envs} is C{None}
    (see catalog L{environment()<catalog.Catalog.environment>} method).

    Aspell's system dictionary can be completely excluded from the check
    by the C{suponly} parameter, when the check will use only internal
    supplemental dictionaries.

    Misspelled words are reported to stdout, with suggestions if available.
    Maximum number of suggestions to display is selected by the C{maxsugg}
    parameter; if negative, all suggestions are shown, and if zero,
    no suggestions are shown.

    Spell checking is performed by internally splitting text into words, and
    querying Aspell word by word. Splitting is performed in a simple fashion;
    it is assumed that text has been appropriately filtered down to plain text,
    e.g. that any XML-like markup and other literals have been removed
    (see L{pology.remove} for filtering possibilities).

    Spell checking can be skipped entirely on a message by issuing
    the C{no-check-spell} L{sieve flag<sieve.parse_sieve_flags>}.
    Alternatively, only certain words may be declared well spelled
    by adding a manual comment starting with C{well-spelled:}
    and followed by comma-separated list of words. Example::

        # |, no-check-spell
        msgid "Aaaargh, gahhh, khh..."
        msgstr ""

        # well-spelled: Aaaargh, kh
        msgid "Aaaargh, kh, kh... I have been defeated...!"
        msgstr ""

    @param lang: language of spelling dictionary
    @type lang: string
    @param encoding: encoding used by the dictionary
    @type encoding: string
    @param variety: variety of dictionary
    @type variety: string
    @param extopts: additional options to send to Aspell
        (C{None} is the same as no additional options)
    @type extopts: dict
    @param envs: environments for supplemental dictionaries
    @type envs: list of strings
    @param suponly: whether to use only supplemental dictionaries
    @type suponly: bool
    @param maxsugg: maximum number of suggestions to show for misspelled word
    @type maxsugg: int

    @return: type S3A hook
    @rtype: C{(text, msg, cat) -> numerr}
    """

    provider = "aspell-raw"
    if extopts is None:
        extopts = {}
    return _check_spell_w(provider, lang, encoding, variety, extopts,
                          envs, suponly, maxsugg, False)


def check_spell_sp (lang=None, encoding="UTF-8", variety=None, extopts=None,
                    envs=None, suponly=False, maxsugg=5):
    """
    Like L{check_spell}, except that erroneous spans are returned
    instead of reporting problems to stdout [hook factory].

    @return: type V3A hook
    @rtype: C{(text, msg, cat) -> spans}
    """

    provider = "aspell-raw"
    if extopts is None:
        extopts = {}
    return _check_spell_w(provider, lang, encoding, variety, extopts,
                          envs, suponly, maxsugg, True)


def _check_spell_w (provider, lang, encoding, variety, extopts,
                    envs, suponly, maxsugg, spanrep):
    """
    Worker for C{check_spell*} hook factories.

    The provider C{"aspell-raw"} selects the Aspell checker; any other
    value selects Enchant, where an empty provider is looked up in the
    C{[enchant]/provider} user configuration field.

    The returned hook constructs one checker per combination of language
    and environments and caches it for subsequent calls. With C{spanrep}
    true the hook returns a list of C{(start, end, note)} spans, otherwise
    it reports each note on the message and returns the number of
    misspelled words.

    @raise PologyError: if Enchant is requested but no provider is set;
        the hook itself raises it when the language cannot be determined
    """

    # FIXME: It is said that no fancy word-splitting is done on the text,
    # but still, best to split it assuming plain text?
    # Raw string, so that \W and \d reach the regex engine as written.
    wsplit_rx = re.compile(r"[^\W\d_]+", re.U)
    def wsplit (text, msg, cat):
        word_spans = []
        for m in wsplit_rx.finditer(text):
            word, span = m.group(0), m.span()
            word_spans.append((word, span))
            # ...could have been a single comprehension, but may need expansion.
        return word_spans

    # Resolve provider.
    if provider != "aspell-raw":
        enchant_cfg = pology.config.section("enchant")
        if not provider:
            provider = enchant_cfg.string("provider")
            if not provider:
                raise PologyError(_("@info", "Enchant provider not set."))

    # Cache for constructed checkers.
    checkers = {}

    # The checker itself.
    def spcheck (text, msg, cat):

        # Check if new spell checker should be constructed.
        # Language: explicit parameter, then catalog, then (Enchant only)
        # user configuration.
        if lang is not None:
            clang = lang
        elif cat.language() is not None:
            clang = cat.language()
        elif provider != "aspell-raw":
            clang = enchant_cfg.string("language")
        else:
            clang = None
        if not clang:
            raise PologyError(
                _("@info",
                  "Cannot determine language for catalog '%(file)s'.",
                  file=cat.filename))
        # Environments: same order of precedence as for the language.
        if envs is not None:
            cenvs = envs
        elif cat.environment() is not None:
            cenvs = cat.environment()
        elif provider != "aspell-raw":
            envs_str = enchant_cfg.string("environment")
            cenvs = envs_str.split(",") if envs_str else []
        else:
            cenvs = []
        ckey = (clang, tuple(cenvs))
        if ckey not in checkers:
            if provider != "aspell-raw":
                checkers[ckey] = _construct_enchant(provider, clang, cenvs,
                                                    encoding, variety, suponly)
            else:
                checkers[ckey] = _construct_aspell(clang, cenvs, encoding,
                                                   variety, extopts, suponly)

        checker = checkers[ckey]

        # Prepare shortcut reports.
        if spanrep:
            defret = []
        else:
            defret = 0

        # Skip message if explicitly requested.
        if flag_no_check_spell in manc_parse_flag_list(msg, "|"):
            return defret

        # Split text into words and spans: [(word, (start, end)), ...]
        word_spans = wsplit(text, msg, cat)

        # Ignore words explicitly listed as good.
        ignored_words = set(manc_parse_list(msg, elist_well_spelled, ","))
        word_spans = [x for x in word_spans if x[0] not in ignored_words]

        spans = []
        for word, span in word_spans:
            # NOTE(review): words are sent to the checker encoded; confirm
            # that the Enchant wrapper in use accepts encoded input.
            encword = word.encode(encoding)
            if not checker.check(encword):
                encsuggs = checker.suggest(encword)
                # Honor the maxsugg given to the hook factory:
                # positive limits the list, negative shows all.
                incmp = False
                if maxsugg > 0 and len(encsuggs) > maxsugg:
                    encsuggs = encsuggs[:maxsugg]
                    incmp = True
                # Checkers may hand back either encoded or text suggestions.
                suggs = [x.decode(encoding) if isinstance(x, bytes) else x
                         for x in encsuggs]
                if maxsugg != 0 and suggs:
                    fmtsuggs = format_item_list(suggs, incmp=incmp)
                    snote = _("@info",
                              "Unknown word '%(word)s' "
                              "(suggestions: %(wordlist)s).",
                              word=word, wordlist=fmtsuggs)
                else:
                    snote = _("@info",
                              "Unknown word '%(word)s'.",
                              word=word)
                spans.append(span + (snote,))

        if spanrep:
            return spans
        else:
            for span in spans:
                if span[2:]:
                    report_on_msg(span[2], msg, cat)
            return len(spans)

    return spcheck


# Construct Aspell checker for given langenv.
# With suponly set, a _QuasiSpell checker over the supplemental dictionaries
# is constructed instead, and PologyError is raised if there are none.
# PologyError is also raised if Aspell cannot be configured or initialized.
def _construct_aspell (lang, envs, encoding, variety, extopts, suponly):

    # Get Pology's internal personal dictionary for this language.
    dictpath, temporary = _compose_personal_dict(lang, envs)

    try:
        if not suponly:
            # Prepare Aspell options.
            aopts = {}
            aopts["lang"] = lang
            aopts["encoding"] = encoding
            if variety:
                aopts["variety"] = variety
            if dictpath:
                aopts["personal-path"] = dictpath
            if extopts:
                aopts.update(extopts)

            # Option values are handed to Aspell encoded.
            aopts = dict((x, y.encode(encoding)) for x, y in aopts.items())

            # Create Aspell object.
            import pology.external.pyaspell as A
            try:
                checker = A.Aspell(list(aopts.items()))
            except A.AspellConfigError as e:
                raise PologyError(
                    _("@info",
                      "Aspell configuration error:\n%(msg)s",
                      msg=e))
            except A.AspellError as e:
                raise PologyError(
                    _("@info",
                      "Cannot initialize Aspell:\n%(msg)s",
                      msg=e))
        else:
            # Create simple internal checker that only checks against
            # internal supplemental dictionaries.
            if not dictpath:
                raise PologyError(
                    _("@info",
                      "No supplemental dictionaries found."))
            checker = _QuasiSpell(dictpath, encoding)
    finally:
        # Composited dictionary read by now (or no longer needed on error),
        # remove if temporary file.
        if temporary:
            os.unlink(dictpath)

    return checker


# Collect all personal dictionaries found for given language/environment
# and composit them into one file to pass to Aspell.
# Environment is given as a relative subpath into the language directory;
# a dictionary belongs to that environment if it is in the directory
# pointed by the subpath, or any of the parent directories.
# Return the path to composited file or None if there were no dictionaries,
# and whether the file is really a temporary composition or not.
# Raise PologyError if the composited file cannot be written.
def _compose_personal_dict (lang, envs):

    # Collect all applicable dictionary files
    # (for a given environment, in its subdirectory and all above).
    dictpaths = set()
    spell_root = os.path.join(datadir(), "lang", lang, "spell")
    for env in (envs or [""]):
        spell_sub = os.path.join(".", env)
        while spell_sub:
            spell_dir = os.path.join(spell_root, spell_sub)
            if os.path.isdir(spell_dir):
                for item in os.listdir(spell_dir):
                    if item.endswith(".aspell"):
                        dictpaths.add(os.path.join(spell_dir, item))
            parent_sub = os.path.dirname(spell_sub)
            if parent_sub == spell_sub:
                # Reached a filesystem root (environment was given as
                # absolute path); going further up would loop forever.
                break
            spell_sub = parent_sub
    dictpaths = sorted(dictpaths)

    if not dictpaths:
        return None, False

    # If only one dictionary found, Aspell can use it as-is.
    if len(dictpaths) == 1:
        return dictpaths[0], False

    # Composit all dictionary files into one temporary.
    words = []
    for dictpath in dictpaths:
        words.extend(_read_dict_file(dictpath))
    tmppath = None
    try:
        # mkstemp creates the file and keeps it in place, so the name
        # cannot be taken by someone else before it is written to.
        tmpfd, tmppath = tempfile.mkstemp()
        os.close(tmpfd)
        tmpf = codecs.open(tmppath, "w", "UTF-8")
        try:
            tmpf.write("personal_ws-1.1 %s %d UTF-8\n" % (lang, len(words)))
            tmpf.writelines([x + "\n" for x in words])
        finally:
            tmpf.close()
    except Exception as e:
        # Do not leave a half-written file behind.
        if tmppath is not None and os.path.exists(tmppath):
            try:
                os.unlink(tmppath)
            except OSError:
                pass
        raise PologyError(
            _("@info",
              "Cannot create composited spelling dictionary "
              "in current working directory:\n%(msg)s",
              msg=e))

    return tmppath, True


# Read words from Aspell personal dictionary.
# The first line of the file is the header, whose fourth field names
# the encoding of the file; all following non-empty lines are words.
# Return the list of words, stripped of surrounding whitespace.
# Raise PologyError if the header is malformed.
def _read_dict_file (filepath):

    def read_words (dictfile):
        words = []
        for line in dictfile:
            word = line.strip()
            if word:
                words.append(word)
        return words

    # Parse the header for encoding.
    enc_def = "UTF-8"
    dictfile = codecs.open(filepath, "r", enc_def)
    try:
        header = dictfile.readline()
        m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
        if not m:
            raise PologyError(
                _("@info",
                  "Malformed header in dictionary file '%(file)s'.",
                  file=filepath))
        enc = m.group(4)
        if enc.lower() == enc_def.lower():
            # Header already consumed, the rest are words.
            return read_words(dictfile)
    finally:
        dictfile.close()

    # Reopen in correct encoding if not the default.
    dictfile = codecs.open(filepath, "r", enc)
    try:
        # Reading starts from the top again, so skip the header line.
        dictfile.readline()
        return read_words(dictfile)
    finally:
        dictfile.close()


# Simple spell checker which reads Aspell's personal dictionary file.
class _QuasiSpell (object):

    def __init__ (self, dictpath, enc="UTF-8"):

        # Set rather than list, for fast membership checks.
        self._words = set(_read_dict_file(dictpath))
        self._enc = enc # of the raw text sent in for checking


    # Return True if the word, as given or lowercased, is in the dictionary.
    def check (self, encword):

        # Accept both encoded and already decoded words.
        if isinstance(encword, bytes):
            word = encword.decode(self._enc)
        else:
            word = encword
        return (   word in self._words
                or word.lower() in self._words)


    # This checker has no suggestions to offer.
    def suggest (self, encword):

        return []


def check_spell_ec (provider=None, lang=None, encoding="UTF-8", variety=None,
                    envs=None, suponly=False, maxsugg=5):
    """
    Check spelling using Enchant [hook factory].

    Enchant provider is selected by the C{provider} parameter, and
    the language by the C{lang} parameter, which should be a language code
    of one of the installed spelling dictionaries.
    Text encoding used by the dictionary is provided by the
    C{encoding} parameter. If the dictionary comes in several varieties,
    a non-default one is selected using the C{variety} parameter.
    If C{provider} is not given, it will be attempted to fetch it from
    C{[enchant]/provider} user configuration field.

    Pology may contain internal supplemental dictionaries for selected
    language in C{lang/<lang>/spell/} directory, and these are automatically
    picked up. Any subdirectories in C{lang/<lang>/spell/} are considered
    as to contain supplemental dictionaries in special "environments"
    (e.g. jargon, certain projects, etc.), and are not included by default.
    Such environments can be included by the C{envs} parameter, which
    is a list of relative paths added to C{lang/<lang>/spell/} directory.
    All supplemental dictionaries from such paths are included, as well as
    from all their parent directories up to C{lang/<lang>/spell/}
    (this makes supplemental dictionaries hierarchical, e.g.
    environment C{foo/bar} is a child of C{foo}, and thus when C{foo/bar}
    is requested, both its and supplements of C{foo} are used).

    If C{lang} is C{None}, then automatic detection of the language based
    on the catalog of the message is attempted
    (see catalog L{language()<catalog.Catalog.language>} method).
    Similar is attempted for environments if C{envs} is C{None}
    (see catalog L{environment()<catalog.Catalog.environment>} method).
    If automatic detection of language does not succeed, finally
    C{[enchant]/language} user configuration field is consulted;
    for environments, C{[enchant]/environment} field is consulted.

    Provider's system dictionary can be completely excluded from the check
    by the C{suponly} parameter, when the check will use only internal
    supplemental dictionaries.

    Misspelled words are reported to stdout, with suggestions if available.
    Maximum number of suggestions to display is selected by the C{maxsugg}
    parameter; if negative, all suggestions are shown, and if zero,
    no suggestions are shown.

    Spell checking is performed by internally splitting text into words, and
    querying provider word by word. Splitting is performed in a simple fashion;
    it is assumed that text has been appropriately filtered down to plain text,
    e.g. that any XML-like markup and other literals have been removed
    (see L{pology.remove} for filtering possibilities).

    Spell checking can be skipped entirely on a message by issuing
    the C{no-check-spell} L{sieve flag<sieve.parse_sieve_flags>}.
    Alternatively, only certain words may be declared well spelled
    by adding a manual comment starting with C{well-spelled:}
    and followed by comma-separated list of words. Example::

        # |, no-check-spell
        msgid "Aaaargh, gahhh, khh..."
        msgstr ""

        # well-spelled: Aaaargh, kh
        msgid "Aaaargh, kh, kh... I have been defeated...!"
        msgstr ""

    @param provider: the spell-checking provider to use
    @type provider: string
    @param lang: language of spelling dictionary
    @type lang: string
    @param encoding: encoding used by the dictionary
    @type encoding: string
    @param variety: variety of dictionary
    @type variety: string
    @param envs: environments for supplemental dictionaries
    @type envs: list of strings
    @param suponly: whether to use only supplemental dictionaries
    @type suponly: bool
    @param maxsugg: maximum number of suggestions to show for misspelled word
    @type maxsugg: int

    @return: type S3A hook
    @rtype: C{(text, msg, cat) -> numerr}
    """

    extopts = {}
    return _check_spell_w(provider, lang, encoding, variety, extopts,
                          envs, suponly, maxsugg, False)


def check_spell_ec_sp (provider=None, lang=None, encoding="UTF-8", variety=None,
                       envs=None, suponly=False, maxsugg=5):
    """
    Like L{check_spell_ec}, except that erroneous spans are returned
    instead of reporting problems to stdout [hook factory].

    @return: type V3A hook
    @rtype: C{(text, msg, cat) -> spans}
    """

    extopts = {}
    return _check_spell_w(provider, lang, encoding, variety, extopts,
                          envs, suponly, maxsugg, True)


# Construct Enchant checker for given langenv.
# The variety, if given, is tried as Enchant dictionary tag before
# the language itself.
# With suponly set, a _QuasiSpell checker over the supplemental dictionaries
# is constructed instead, and PologyError is raised if there are none.
# PologyError is also raised if Enchant is not available, does not know
# the language, or cannot be set up for the provider.
def _construct_enchant (provider, lang, envs, encoding, variety, suponly):

    # Get Pology's internal personal dictionary for this language.
    dictpath, temporary = _compose_personal_dict(lang, envs)

    try:
        if not suponly:
            try:
                import enchant
            except ImportError:
                pkgs = ["python-enchant"]
                raise PologyError(_("@info",
                                    "Python wrapper for Enchant not found, "
                                    "please install it (possible package names: "
                                    "%(pkglist)s).",
                                    pkglist=format_item_list(pkgs)))

            # Create Enchant broker.
            try:
                broker = enchant.Broker()
            except Exception as e:
                raise PologyError(
                    _("@info",
                      "Cannot initialize Enchant:\n%(msg)s",
                      msg=e))

            # Find Enchant language.
            # Unset candidates are dropped before asking the broker,
            # since they cannot name a dictionary.
            e_langs = [x for x in (variety, lang)
                       if x and broker.dict_exists(x)]
            if e_langs:
                e_lang = e_langs[0]
            else:
                if variety is not None:
                    raise PologyError(
                        _("@info",
                          "Language '%(lang)s' and variety '%(var)s' "
                          "not known to Enchant.",
                          lang=lang, var=variety))
                else:
                    raise PologyError(
                        _("@info",
                          "Language '%(lang)s' not known to Enchant.",
                          lang=lang))

            # Choose the provider for the selected language.
            try:
                broker.set_ordering((e_lang or "*"), provider)
            except Exception as e:
                raise PologyError(
                    _("@info",
                      "Cannot configure Enchant for provider '%(pvd)s':\n%(msg)s",
                      pvd=provider, msg=e))

            # Create checker and test functionality.
            try:
                if dictpath is None:
                    checker = enchant.Dict(e_lang, broker)
                else:
                    checker = enchant.DictWithPWL(e_lang, dictpath, None, broker)
                checker.check(".")
            except Exception:
                # Not a bare except, so that keyboard interrupt and
                # system exit still get through.
                raise PologyError(
                    _("@info",
                      "Enchant test check for language '%(lang)s' failed.",
                      lang=e_lang))
        else:
            # Create simple internal checker that only checks against
            # internal supplemental dictionaries.
            if not dictpath:
                raise PologyError(
                    _("@info",
                      "No supplemental dictionaries found."))
            checker = _QuasiSpell(dictpath, encoding)
    finally:
        # Composited dictionary read by now (or no longer needed on error),
        # remove if temporary file.
        if temporary:
            os.unlink(dictpath)

    return checker