File indexing completed on 2024-12-01 13:47:52

0001 #!/usr/bin/env python3
0002 # -*- coding: UTF-8 -*-
0003 
0004 """
0005 Organize dictionary file:
0006   - sort entries
  - remove duplicates
0008   - update header
0009 
0010 This script is intended to be run standalone.
0011 
0012 Usage::
    python <this script> [-r|--remove-invalid] <dict file>...
0014 
0015 @author: Sébastien Renard <sebastien.renard@digitalfox.org>
0016 @license: GPLv3
0017 """
0018 
0019 import locale
0020 from codecs import open
0021 from os.path import abspath, basename
0022 import re
0023 import sys
0024 
0025 try:
0026     import fallback_import_paths
0027 except:
0028     pass
0029 from pology import _, n_
0030 from pology.report import report, error
0031 
0032 
def main():
    """
    Command line entry point.

    Activates the user's default locale, splits the command line into
    options and dictionary file paths, and runs L{organize} on each path
    in the order given. Shows the usage message when no path was supplied.
    """

    locale.setlocale(locale.LC_ALL, "")

    # FIXME: Use pology.colors.ColorOptionParser.
    remove_invalid = False
    dict_paths = []
    for arg in sys.argv[1:]:
        # Anything not starting with a dash is a dictionary file path.
        if not arg.startswith("-"):
            dict_paths.append(arg)
            continue
        if arg == "-r" or arg == "--remove-invalid":
            remove_invalid = True
            continue
        # Any other dash-prefixed argument is reported through error().
        error(_("@info",
                "Unknown option '%(opt)s' in command line.",
                opt=arg))

    if not dict_paths:
        usage()

    for dict_path in dict_paths:
        organize(dict_path, remove_invalid)
0055 
0056 
def organize(dictPath, reminv=False):
    """
    Sort, de-duplicate and rewrite one personal word list file in place.

    The first line of the file is the header, of the form
    C{<type> <language> <word count> <encoding>}. Every following
    non-empty line is one word. The file is rewritten with the dictionary
    type normalized to C{personal_ws-1.1}, the word count updated,
    duplicates dropped, and words sorted case-insensitively by the
    collation of the current locale.

    Words containing characters outside the accepted set are reported;
    they are kept unless C{reminv} is true.

    @param dictPath: path of the dictionary file to organize
    @param reminv: whether to remove words with invalid characters
        instead of only reporting them
    """

    # Local import: the file's import block does not provide cmp_to_key.
    from functools import cmp_to_key

    report(dictPath)
    dictEncDefault = "UTF-8"

    # Parse the header for language and encoding. Only the first line
    # is read with the default encoding; the words are read separately
    # below, once the real encoding is known.
    with open(dictPath, "r", dictEncDefault) as dictFile:
        header = dictFile.readline()
    m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
    if not m:
        error(_("@info",
                "Malformed header of the dictionary file '%(file)s'.",
                file=dictPath))
        # error() is presumably fatal; if it ever returns, stop here
        # rather than fail on the missing match below.
        return
    # The word count stated in the header (group 3) is not needed,
    # it is recomputed from the actual content.
    dictType, dictLang, dictEnc = m.group(1, 2, 4)

    expDictType = "personal_ws-1.1"
    if dictType != expDictType:
        dictType = expDictType
        report("  " + _("@item:inlist",
                        "dictionary type changed to '%(dtype)s'",
                        dtype=expDictType))

    # Read all words and eliminate duplicates.
    # The file is always read from its first line in the declared
    # encoding, so that reported line numbers are real file line numbers
    # and the header is skipped by position, whatever its type field.
    words = set()
    validCharacters = re.compile(r"^[\w\d\'・-]+$", re.UNICODE)
    with open(dictPath, "r", dictEnc) as dictFile:
        for lno, word in enumerate(dictFile, 1):
            word = word.strip()
            if lno == 1 or not word or word.startswith("personal_ws"):
                continue
            if word in words:
                report("  " + _("@item:inlist",
                                "duplicate removed: %(word)s",
                                word=word))
            elif not validCharacters.match(word):
                if not reminv:
                    report("  " + _("@item:inlist",
                                    "*** invalid word at %(line)s: %(word)s",
                                    line=lno, word=word))
                    words.add(word)
                else:
                    report("  " + _("@item:inlist",
                                    "invalid word removed: %(word)s",
                                    word=word))
            else:
                words.add(word)

    # Sort the list according to current locale, ignoring case.
    # The plain pre-sort fixes the relative order of words which compare
    # equal under the locale-aware comparison (the second sort is
    # stable), so the output does not depend on set iteration order.
    words = sorted(words)
    words.sort(key=cmp_to_key(
        lambda x, y: locale.strcoll(x.lower(), y.lower())))
    numWords = len(words)

    # Write back the updated dictionary.
    with open(dictPath, "w", dictEnc) as dictFile:
        dictFile.write("%s %s %d %s\n"
                       % (dictType, dictLang, numWords, dictEnc))
        dictFile.write("\n".join(words))
        dictFile.write("\n")
    report("  " + n_("@item:inlist",
                     "written %(num)d word",
                     "written %(num)d words",
                     num=len(words)))
0126 
0127 
def usage():
    """
    Report the command line synopsis and exit with status 1.

    The command name shown is the base name of the running script.
    """
    cmd_name = basename(sys.argv[0])
    synopsis = _("@info",
                 "Usage: %(cmd)s [-r|--remove-invalid] DICTFILE...",
                 cmd=cmd_name)
    report(synopsis)
    sys.exit(1)
0133 
0134 
# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()