File indexing completed on 2024-12-01 13:47:52
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

"""
Organize dictionary file:
    - sort entries
    - remove duplicates
    - update header

This script is intended to be run standalone.

Usage::
    python <this script> [-r|--remove-invalid] <dict file>...

@author: Sébastien Renard <sebastien.renard@digitalfox.org>
@license: GPLv3
"""

import locale
from codecs import open
from functools import cmp_to_key
from os.path import abspath, basename
import re
import sys

try:
    import fallback_import_paths
except ImportError:
    # Best-effort import: the script carries on when the module is absent.
    # NOTE(review): presumably it only extends the import path -- confirm.
    pass
from pology import _, n_
from pology.report import report, error


def main():
    """
    Parse the command line and organize every dictionary file named on it.

    Arguments starting with C{-} are options; only C{-r} and
    C{--remove-invalid} are accepted, any other option is passed to
    C{error()}. All remaining arguments are taken as dictionary paths.
    When no path is given, the usage message is shown and the script exits.
    """

    # Use the user's locale so that sorting in organize() collates properly.
    locale.setlocale(locale.LC_ALL, "")

    # FIXME: Use pology.colors.ColorOptionParser.
    reminv = False
    paths = []
    for arg in sys.argv[1:]:
        if not arg.startswith("-"):
            paths.append(arg)
        elif arg in ("-r", "--remove-invalid"):
            reminv = True
        else:
            error(_("@info",
                    "Unknown option '%(opt)s' in command line.",
                    opt=arg))
    if not paths:
        usage()

    for path in paths:
        organize(path, reminv)


def organize(dictPath, reminv=False):
    """
    Rewrite a dictionary file in place: deduplicated, sorted, fresh header.

    The first line of the file is the header, made of four
    whitespace-separated fields: type, language, word count and encoding.
    The type is forced to C{personal_ws-1.1}, the word count is recomputed,
    and the file is written back in the encoding the header declares.

    @param dictPath: path of the dictionary file to organize
    @type dictPath: string
    @param reminv: when true, words containing characters outside the
        accepted set are dropped; otherwise they are reported and kept
    @type reminv: bool
    """

    report(dictPath)
    dictEncDefault = "UTF-8"

    # Parse the header for language and encoding.
    with open(dictPath, "r", dictEncDefault) as dictFile:
        header = dictFile.readline()
    m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
    if not m:
        error(_("@info",
                "Malformed header of the dictionary file '%(file)s'.",
                file=dictPath))
        # NOTE(review): error() is expected to abort the script; this return
        # only guards against it coming back, since there is no header
        # to work with.
        return
    # The word count stated in the header (group 3) is recomputed below,
    # so it is not extracted here.
    dictType, dictLang, dictEnc = m.group(1, 2, 4)

    expDictType = "personal_ws-1.1"
    if dictType != expDictType:
        dictType = expDictType
        report(" " + _("@item:inlist",
                       "dictionary type changed to '%(dtype)s'",
                       dtype=expDictType))

    # Read all words and eliminate duplicates.
    # The file is always re-read from the top in the encoding declared by
    # the header, so that reported line numbers are real file line numbers.
    words = set()
    # Accepted: word characters, digits, apostrophe, katakana middle dot
    # and hyphen.
    validCharacters = re.compile(r"^[\w\d\'・-]+$", re.UNICODE)
    with open(dictPath, "r", dictEnc) as dictFile:
        for lno, word in enumerate(dictFile, 1):
            word = word.strip()
            # Line 1 is the header, already handled above.
            if lno == 1 or not word or word.startswith("personal_ws"):
                continue
            if word in words:
                report(" " + _("@item:inlist",
                               "duplicate removed: %(word)s",
                               word=word))
            elif not validCharacters.match(word):
                if not reminv:
                    report(" " + _("@item:inlist",
                                   "*** invalid word at %(line)s: %(word)s",
                                   line=lno, word=word))
                    words.add(word)
                else:
                    report(" " + _("@item:inlist",
                                   "invalid word removed: %(word)s",
                                   word=word))
            else:
                words.add(word)

    # Sort the list according to current locale, ignoring case.
    # list.sort() no longer accepts a comparison function, hence cmp_to_key.
    # Words that collate equal when lowercased are ordered by their raw
    # text, so that the output does not depend on set iteration order.
    collationKey = cmp_to_key(locale.strcoll)
    words = sorted(words, key=lambda w: (collationKey(w.lower()), w))
    numWords = len(words)

    # Write back the updated dictionary.
    with open(dictPath, "w", dictEnc) as dictFile:
        dictFile.write("%s %s %d %s\n"
                       % (dictType, dictLang, numWords, dictEnc))
        dictFile.write("\n".join(words))
        dictFile.write("\n")
    report(" " + n_("@item:inlist",
                    "written %(num)d word",
                    "written %(num)d words",
                    num=len(words)))


def usage():
    """
    Report the command line synopsis and exit with status 1.
    """
    report(_("@info",
             "Usage: %(cmd)s [-r|--remove-invalid] DICTFILE...",
             cmd=basename(sys.argv[0])))
    sys.exit(1)


if __name__ == '__main__':
    main()