Warning, /sdk/pology/bin/normalize-aspell-word-list is written in an unsupported language. File is not indexed.
0001 #!/usr/bin/env python3
0002 # -*- coding: UTF-8 -*-
0003
0004 """
0005 Organize dictionary file:
0006 - sort entries
0007 - remove duplicates
0008 - update header
0009
0010 This script is intended to be run standalone.
0011
0012 Usage::
0013 normalize-aspell-word-list [-r|--remove-invalid] <dict file>...
0014
0015 @author: Sébastien Renard <sebastien.renard@digitalfox.org>
0016 @license: GPLv3
0017 """
0018
0019 import locale
0020 from codecs import open
0021 from os.path import abspath, basename
0022 import re
0023 import sys
0024
# Optional helper module; presumably it extends sys.path so that pology can be
# found when running from a source tree (TODO confirm). Only its absence is
# tolerated: a bare "except:" would also hide KeyboardInterrupt, SystemExit
# and genuine errors raised while the helper itself is being imported.
try:
    import fallback_import_paths
except ImportError:
    pass
0029 from pology import _, n_
0030 from pology.report import report, error
0031
0032
def main():
    """
    Command line entry point.

    Sets the locale from the environment, scans the command line, and
    organizes every dictionary file named on it. C{-r}/C{--remove-invalid}
    turns on removal of invalid words; any other argument starting with a
    dash is passed to C{error()} as an unknown option. When no dictionary
    path is given, C{usage()} is called.
    """

    locale.setlocale(locale.LC_ALL, "")

    # FIXME: Use pology.colors.ColorOptionParser.
    remove_invalid = False
    dict_paths = []
    for arg in sys.argv[1:]:
        # Anything not starting with a dash is a dictionary path.
        if not arg.startswith("-"):
            dict_paths.append(arg)
            continue
        if arg in ("-r", "--remove-invalid"):
            remove_invalid = True
        else:
            error(_("@info",
                    "Unknown option '%(opt)s' in command line.",
                    opt=arg))
    if not dict_paths:
        usage()

    for dict_path in dict_paths:
        organize(dict_path, remove_invalid)
0055
0056
def organize(dictPath, reminv=False):
    """
    Normalize one aspell personal word list in place.

    The first line of the file is parsed as a header of four fields
    (dictionary type, language, word count, encoding). The remaining lines
    are read as words in the declared encoding, duplicates are dropped,
    words with unexpected characters are reported (and dropped too when
    C{reminv} is set), and the result is sorted case-insensitively by the
    collation of the current locale. The file is then rewritten with the
    dictionary type forced to C{personal_ws-1.1} and the word count
    recomputed.

    Line numbers in reports are 1-based positions in the file, header
    included.

    @param dictPath: path of the dictionary file to rewrite
    @type dictPath: string
    @param reminv: whether to remove words with invalid characters
        instead of only reporting them
    @type reminv: bool
    """

    report(dictPath)
    dictEncDefault = "UTF-8"
    expDictType = "personal_ws-1.1"

    # Parse the header for language and encoding.
    # The real encoding is not known yet, so undecodable bytes are replaced
    # rather than raising: only the whitespace-separated header fields are
    # needed from this first read.
    with open(dictPath, "r", encoding=dictEncDefault,
              errors="replace") as dictFile:
        header = dictFile.readline()
    m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
    if not m:
        error(_("@info",
                "Malformed header of the dictionary file '%(file)s'.",
                file=dictPath))
        # error() is expected to terminate the program (TODO confirm);
        # never continue with an unparsed header.
        return
    # The word count from the header (group 3) is recomputed below.
    dictType, dictLang, dictEnc = m.group(1, 2, 4)

    if dictType != expDictType:
        dictType = expDictType
        report(" " + _("@item:inlist",
                       "dictionary type changed to '%(dtype)s'",
                       dtype=expDictType))

    # Read all words in the declared encoding and eliminate duplicates.
    words = set()
    validCharacters = re.compile(r"^[\w\d\'・-]+$", re.UNICODE)
    with open(dictPath, "r", encoding=dictEnc) as dictFile:
        for lno, word in enumerate(dictFile, 1):
            word = word.strip()
            # Line 1 is the header parsed above, whatever its type field;
            # stray header lines further down are skipped as well.
            if lno == 1 or not word or word.startswith("personal_ws"):
                continue
            if word in words:
                report(" " + _("@item:inlist",
                               "duplicate removed: %(word)s",
                               word=word))
            elif validCharacters.match(word):
                words.add(word)
            elif reminv:
                report(" " + _("@item:inlist",
                               "invalid word removed: %(word)s",
                               word=word))
            else:
                report(" " + _("@item:inlist",
                               "*** invalid word at %(line)s: %(word)s",
                               line=lno, word=word))
                words.add(word)

    # Sort the list according to current locale, ignoring case.
    # list.sort() takes no comparison function in Python 3, so the
    # collation goes through a strxfrm() key instead of strcoll(). The word
    # itself breaks ties between entries differing only in case, so that
    # the output does not depend on set iteration order.
    words = sorted(words, key=lambda w: (locale.strxfrm(w.lower()), w))
    numWords = len(words)

    # Write back the updated dictionary.
    with open(dictPath, "w", encoding=dictEnc) as dictFile:
        dictFile.write("%s %s %d %s\n"
                       % (dictType, dictLang, numWords, dictEnc))
        # One newline per word: an empty list leaves only the header.
        dictFile.write("".join(word + "\n" for word in words))
    report(" " + n_("@item:inlist",
                    "written %(num)d word",
                    "written %(num)d words",
                    num=len(words)))
0126
0127
def usage():
    """
    Report the command line synopsis and exit with status 1.

    The command name shown is the base name of C{sys.argv[0]}.
    """
    command = basename(sys.argv[0])
    synopsis = _("@info",
                 "Usage: %(cmd)s [-r|--remove-invalid] DICTFILE...",
                 cmd=command)
    report(synopsis)
    sys.exit(1)
0133
0134
# Run the command line entry point only when executed as a script,
# not when the file is imported as a module.
if __name__ == "__main__":
    main()