File indexing completed on 2024-10-13 13:24:39
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

try:
    import fallback_import_paths
except ImportError:
    # The helper module is optional; carry on without it when it is missing.
    pass

import sys
import os
import re
import locale

from pology import PologyError, version, _, n_
from pology.lang.sr.wconv import ctol, hictoall
from pology.lang.sr.trapnakron import rootdir
from pology.lang.sr.trapnakron import trapnakron_ui
from pology.lang.sr.trapnakron import norm_pkey, norm_rtkey
from pology.lang.sr.trapnakron import _disamb_marker
from pology.colors import ColorOptionParser
from pology.fsops import str_to_unicode
from pology.normalize import identify
from pology.report import report, warning, format_item_list
from pology.vcs import VcsSubversion


def validate(tp, onlysrcs=None, onlykeys=None, demoexp=False, expwkeys=False):
    """Check the derivations of a trapnakron and warn about problems.

    For every selected derivation the following is checked:

    - every environment it defines is one of the known environments;
    - its normalized nominative forms do not collide with those of
      a previously seen derivation that has different properties;
    - if it defines the plain nominative, it also defines a known gender.

    Problems are issued through warning(). Requested sources and keys
    that matched nothing are reported at the end.

    Args:
        tp: the trapnakron object to query (in this file it is the
            result of trapnakron_ui()).
        onlysrcs: matchers accepted by _match_text() that select
            derivations by source name, or None for no restriction.
        onlykeys: matchers accepted by _match_text() that select
            derivations by key, or None for no restriction.
        demoexp: if true, report a sample of expanded properties for
            each derivation in which no problem was found.
        expwkeys: if true, each reported sample is preceded by
            a comment line listing all keys of the derivation.

    Returns:
        The number of problems found.
    """
    needed_pkeys = set()

    # Sets of nominative property keys, in order of priority.
    nom_pkeys = (
        ["н"],
        ["нм", "нж", "нс", "ну"],
    )
    needed_pkeys.update(sum(nom_pkeys, []))

    gender_pkey = "_род"
    needed_pkeys.add(gender_pkey)

    known_genders = set(("м", "ж", "с", "у"))
    # Materialize the mapped values first: the set cannot be updated
    # while it is being iterated over.
    known_genders.update(list(map(ctol, known_genders)))

    # Pairs of (derivation key suffix, environment suffix).
    known_alts = [
        ("_s", "сист"),
        ("_a", "алт"),
        ("_a2", "алт2"),
        ("_a3", "алт3"),
    ]
    base_envs = ["", "л", "иј", "ијл"]
    all_envs = set(base_envs)
    for aenv in [x[1] for x in known_alts]:
        all_envs.update(x + aenv for x in base_envs)

    if demoexp:
        demoexp_pkeys = ["н", "г", "д", "а", "в", "и",
                         "нк", "гк", "дк", "ак", "вк",
                         "нм", "нмп"]
        needed_pkeys.update(demoexp_pkeys)

    dkeys_by_rtkey = {}

    # Sort keys such that derivations are checked by file and position.
    dkeys = tp.dkeys(single=onlykeys is None)

    def sortkey(x):
        path, lno, cno = tp.source_pos(x)
        return path.count(os.path.sep), path, lno, cno

    dkeys = sorted(dkeys, key=sortkey)

    nproblems = 0
    unmatched_srcs = set(onlysrcs) if onlysrcs is not None else None
    unmatched_keys = set(onlykeys) if onlykeys is not None else None
    reported_fmtexps = set()

    for dkey in dkeys:
        srcname = tp.source_name(dkey)
        path, lno, cno = tp.source_pos(dkey)
        cnproblems = 0

        if ((onlysrcs is not None
             and not _match_text(srcname, onlysrcs, unmatched_srcs))
            or (onlykeys is not None
                and not _match_text(dkey, onlykeys, unmatched_keys))):
            continue

        try:
            aprops = []
            seenesuffs = set()
            cenvs = tp.envs(dkey)
            for cenv in cenvs:
                if cenv != "":
                    envmatched = False
                    for ksuff, esuff in known_alts:
                        if cenv in all_envs and cenv.endswith(esuff):
                            envmatched = True
                            break
                else:
                    envmatched = True
                    ksuff, esuff = "", ""
                if envmatched and esuff not in seenesuffs:
                    dkeym = dkey + ksuff
                    props = {x: tp.get2(dkeym, norm_pkey(x))
                             for x in needed_pkeys}
                    aprops.append((esuff, props))
                    seenesuffs.add(esuff)
                elif cenv not in all_envs:
                    warning(_("@info",
                              "Derivation at %(file)s:%(line)d:%(col)d "
                              "defines unknown environment '%(env)s'.",
                              file=path, line=lno, col=cno, env=cenv))
                    cnproblems += 1
        except Exception as e:
            warning(str_to_unicode(str(e)))
            cnproblems += 1
            # Account for the problems of this derivation before skipping
            # the rest of its checks; otherwise they would be lost from
            # the returned total.
            nproblems += cnproblems
            continue

        for esuff, props in aprops:
            # Assure all nominative forms are unique.
            for pkeys in nom_pkeys:  # select first nominative set by priority
                pvals = [props.get(x) for x in pkeys]
                noms = [x for x in pvals if x is not None]
                if noms:
                    break
            if noms:
                rtkeys = list(map(norm_rtkey, noms))
                for rtkey in rtkeys:
                    odkey = dkeys_by_rtkey.get(rtkey)
                    if odkey is not None and tp.props(dkey) != tp.props(odkey):
                        opath, olno, ocno = tp.source_pos(odkey)
                        warning(_("@info",
                                  "Derivation at %(file1)s:%(line1)d:%(col1)d "
                                  "has normalized nominative equal to "
                                  "derivation at %(file2)s:%(line2)d:%(col2)d; "
                                  "consider adding a disambiguation marker "
                                  "(%(dchar)s).",
                                  file1=path, line1=lno, col1=cno,
                                  file2=opath, line2=olno, col2=ocno,
                                  dchar=_disamb_marker))
                        cnproblems += 1
                for rtkey in rtkeys:  # must be in new loop
                    dkeys_by_rtkey[rtkey] = dkey

            # Assure presence of gender on noun derivations.
            if props.get(nom_pkeys[0][0]) is not None:
                gender = props.get(gender_pkey)
                if gender is None:
                    warning(_("@info",
                              "Derivation at %(file)s:%(line)d:%(col)d "
                              "does not define gender.",
                              file=path, line=lno, col=cno))
                    cnproblems += 1
                else:
                    for gender in hictoall(gender):
                        if gender not in known_genders:
                            warning(_("@info",
                                      "Derivation at %(file)s:%(line)d:%(col)d "
                                      "defines unknown gender '%(gen)s'.",
                                      file=path, line=lno, col=cno, gen=gender))
                            cnproblems += 1

            # Show selection of expanded properties if requested.
            if demoexp and not cnproblems:
                demoprops = [(x, props.get(x)) for x in demoexp_pkeys]
                demoprops = [x for x in demoprops if x[1] is not None]
                fmtprops = ["%s=%s" % (x[0], _escape_pval(x[1]))
                            for x in demoprops]
                fmtsyns = ["%s" % _escape_syn(x) for x in tp.syns(dkey)]
                fmtexp = ", ".join(fmtsyns) + ": " + ", ".join(fmtprops)
                if expwkeys:
                    fmtdkeys = ", ".join(sorted(tp.altdkeys(dkey)))
                    fmtexp = "# " + fmtdkeys + "\n" + fmtexp
                # An alternative environment is shown only when its
                # expansion differs from one already reported.
                if fmtexp not in reported_fmtexps:
                    if not esuff:
                        report(fmtexp)
                        reported_fmtexps.add(fmtexp)
                    else:
                        afmtexp = " @" + esuff + ": " + ", ".join(fmtprops)
                        report(afmtexp)

        nproblems += cnproblems
        tp.empty_pcache()

    if unmatched_srcs:
        fmtsrcs = format_item_list(sorted(getattr(x, "pattern", x)
                                          for x in unmatched_srcs))
        warning(_("@info",
                  "Sources requested by name not found: %(srclist)s.",
                  srclist=fmtsrcs))
    if unmatched_keys:
        fmtkeys = format_item_list(sorted(getattr(x, "pattern", x)
                                          for x in unmatched_keys))
        warning(_("@info",
                  "Derivations requested by key not found: %(keylist)s.",
                  keylist=fmtkeys))

    return nproblems


class _Wre(object):
    """A compiled regular expression kept together with its pattern text."""

    def __init__(self, pattern):
        """Compile pattern with the re.U flag and remember its text."""
        self.regex = re.compile(pattern, re.U)
        self.pattern = pattern


def _match_text(text, tests, unmatched_tests=None):
    """Tell whether text satisfies at least one of the given tests.

    A test can be a string (matches when equal to the text), a _Wre
    (matches when its regex is found in the text), or a callable
    (matches when it returns a true value for the text).
    Tests are tried in iteration order, stopping at the first match.

    Args:
        text: the string to test.
        tests: iterable of tests.
        unmatched_tests: optional set from which the test that matched
            is removed, if it is in there.

    Returns:
        True if some test matched, False otherwise.

    Raises:
        PologyError: if a test is of neither of the supported types.
    """
    match = False
    for test in tests:
        if isinstance(test, str):
            if test == text:
                match = True
                break
        elif isinstance(test, _Wre):
            if test.regex.search(text):
                match = True
                break
        elif callable(test):
            if test(text):
                match = True
                break
        else:
            raise PologyError(
                _("@info",
                  "Unknown matcher type '%(type)s'.",
                  type=type(test)))

    if unmatched_tests is not None and match:
        # On a match, `test` is the element at which the loop stopped.
        unmatched_tests.discard(test)

    return match
0234 0235 0236 def _escape_pval (pval): 0237 0238 pval = pval.replace(",", "\,") 0239 return pval 0240 0241 0242 def _escape_syn (pval): 0243 0244 pval = pval.replace(",", "\,") 0245 pval = pval.replace(":", "\:") 0246 return pval 0247 0248 0249 def _collect_mod_dkeys (tp, onlysrcs=None, onlykeys=None): 0250 0251 # Collect the unified diff of trapnakron root. 0252 vcs = VcsSubversion() 0253 udiff = vcs.diff(rootdir()) 0254 udiff = _elim_moved_blocks(udiff) 0255 0256 # Collect key syntagmas related to added lines. 0257 asyns = set() 0258 skip_file = True 0259 prev_syns = None 0260 for tag, data in udiff: 0261 if tag == "@": 0262 continue 0263 0264 fpath = data 0265 if tag == ":": 0266 if not fpath.endswith(".sd"): 0267 skip_file = True 0268 else: 0269 srcname = os.path.splitext(os.path.basename(fpath))[0] 0270 if onlysrcs is None: 0271 skip_file = False 0272 else: 0273 skip_file = not _match_text(srcname, onlysrcs) 0274 if skip_file: 0275 continue 0276 0277 line = data.strip() 0278 if line.startswith(("#", ">")) or not line: 0279 continue 0280 if tag == " ": 0281 if not line.startswith("@"): 0282 prev_syns = _parse_syns(line) 0283 elif tag == "+": 0284 if not line.startswith("@"): 0285 syns = _parse_syns(line) 0286 elif prev_syns: 0287 syns = prev_syns 0288 asyns.update(syns) 0289 prev_syns = [] 0290 0291 # Collect derivation keys from syntagmas. 0292 onlykeys_mod = set() 0293 dkeys_in_tp = set(tp.dkeys(single=True)) 0294 for syn in asyns: 0295 dkey = identify(syn) 0296 if ( dkey and dkey in dkeys_in_tp 0297 and (onlykeys is None or _match_text(dkey, onlykeys)) 0298 ): 0299 onlykeys_mod.add(dkey) 0300 0301 return None, onlykeys_mod 0302 0303 0304 # Eliminate difference blocks due to pure moving between and within files. 
0305 def _elim_moved_blocks (udiff): 0306 0307 segcnt_ad = {} 0308 segcnt_rm = {} 0309 ctag = "" 0310 cseg = [] 0311 for tag, data in udiff + [("@", None)]: # sentry 0312 if tag == "@": 0313 if ctag in ("+", "-"): 0314 cskey = "".join(cseg) 0315 segcnt = segcnt_ad if ctag == "+" else segcnt_rm 0316 if cskey not in segcnt: 0317 segcnt[cskey] = 0 0318 segcnt[cskey] += 1 0319 ctag = "" 0320 cseg = [] 0321 elif tag in ("+", "-"): 0322 if ctag and ctag != tag: 0323 ctag = "xxx" 0324 else: 0325 ctag = tag 0326 cseg.append(data) 0327 0328 udiff_mod = [] 0329 subdiff = [] 0330 ctag = "" 0331 cseg = [] 0332 for tag, data in udiff + [("@", None)]: 0333 if tag in (":", "@"): 0334 if subdiff: 0335 cskey = "".join(cseg) 0336 if ( ctag not in ("+", "-") 0337 or segcnt_ad.get(cskey, 0) != 1 0338 or segcnt_rm.get(cskey, 0) != 1 0339 ): 0340 udiff_mod.extend(subdiff) 0341 subdiff = [] 0342 cseg = [] 0343 ctag = "" 0344 if tag == ":": 0345 udiff_mod.append((tag, data)) 0346 else: 0347 subdiff = [(tag, data)] 0348 else: 0349 subdiff.append((tag, data)) 0350 if tag in ("+", "-"): 0351 if ctag and ctag != tag: 0352 ctag = "xxx" 0353 else: 0354 ctag = tag 0355 cseg.append(data) 0356 0357 return udiff_mod 0358 0359 0360 def _parse_syns (line): 0361 0362 if line.strip().startswith(("#", ">")): 0363 return [] 0364 0365 llen = len(line) 0366 pos = 0 0367 syns = [] 0368 csyn = "" 0369 intag = False 0370 while pos < llen: 0371 c = line[pos] 0372 if c == "\\": 0373 pos += 1 0374 if pos < llen: 0375 csyn += line[pos] 0376 elif intag: 0377 if cltag: 0378 if c == cltag: 0379 intag = False 0380 else: 0381 cn = line[pos + 1:pos + 2] 0382 if cn in (",", ":") or cn.isspace(): 0383 intag = False 0384 elif c == "~": 0385 intag = True 0386 cltag = "}" if line[pos + 1:pos + 2] == "{" else "" 0387 elif c in (",", ":"): 0388 csyn = csyn.strip() 0389 if csyn.startswith("|"): 0390 csyn = csyn[1:] 0391 syns.append(csyn) 0392 if c == ":": 0393 break 0394 else: 0395 csyn = "" 0396 spos = pos + 1 0397 else: 0398 
csyn += line[pos] 0399 pos += 1 0400 0401 return syns 0402 0403 0404 def _statistics (tp, onlysrcs, onlykeys): 0405 0406 dkeys = set() 0407 fpaths = {} 0408 for dkey in tp.dkeys(single=True): 0409 srcname = tp.source_name(dkey) 0410 fpath, lno, cno = tp.source_pos(dkey) 0411 0412 if ( (onlysrcs is not None and not _match_text(srcname, onlysrcs)) 0413 or (onlykeys is not None and not _match_text(dkey, onlykeys)) 0414 ): 0415 continue 0416 0417 dkeys.add(dkey) 0418 if fpath not in fpaths: 0419 fpaths[fpath] = [srcname, 0] 0420 fpaths[fpath][1] += 1 0421 0422 report("-" * 40) 0423 if onlysrcs is not None or onlykeys is not None: 0424 report(_("@info statistics; side note stating that not all entries " 0425 "have been taken into account, but only some selected", 0426 "(Selection active.)")) 0427 report(_("@info statistics", 0428 "Total derivations: %(num)d", 0429 num=len(dkeys))) 0430 if len(fpaths) > 0: 0431 report(_("@info statistics", 0432 "Total files: %(num)d", 0433 num=len(fpaths))) 0434 report(_("@info statistics", 0435 "Average derivations per file: %(num).1f", 0436 num=(float(len(dkeys)) / len(fpaths)))) 0437 bydif = sorted([(v[1], v[0]) for k, v in list(fpaths.items())]) 0438 report(_("@info statistics", 0439 "Most derivations in a file: %(num)d (%(file)s)", 0440 num=bydif[-1][0], file=bydif[-1][1])) 0441 0442 0443 def _main (): 0444 0445 locale.setlocale(locale.LC_ALL, "") 0446 0447 usage= _("@info command usage", 0448 "%(cmd)s [OPTIONS] [DKEY|SRCPATH|:SRCNAME]...", 0449 cmd="%prog") 0450 desc = _("@info command description", 0451 "Check validity and expand derivations from internal trapnakron.") 0452 ver = _("@info command version", 0453 "%(cmd)s (Pology) %(version)s\n" 0454 "Copyright © 2009, 2010 " 0455 "Chusslove Illich (Часлав Илић) <%(email)s>", 0456 cmd="%prog", version=version(), email="caslav.ilic@gmx.net") 0457 0458 opars = ColorOptionParser(usage=usage, description=desc, version=ver) 0459 opars.add_option( 0460 "-e", "--expansion-sample", 0461 
action="store_true", dest="demoexp", default=False, 0462 help=_("@info command line option description", 0463 "Show a sample of expanded properties for " 0464 "each valid derivation.")) 0465 opars.add_option( 0466 "-k", "--show-keys", 0467 action="store_true", dest="expwkeys", default=False, 0468 help=_("@info command line option description", 0469 "When expanding, also show all derivation keys by derivation.")) 0470 opars.add_option( 0471 "-m", "--modified", 0472 action="store_true", dest="modified", default=False, 0473 help=_("@info command line option description", 0474 "Validate or expand only modified derivations.")) 0475 opars.add_option( 0476 "-r", "--regex", 0477 action="store_true", dest="regex", default=False, 0478 help=_("@info command line option description", 0479 "Source names and derivation keys given in command line " 0480 "are regular expressions.")) 0481 opars.add_option( 0482 "-s", "--statistics", 0483 action="store_true", dest="statistics", default=False, 0484 help=_("@info command line option description", 0485 "Show statistics.")) 0486 0487 (options, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:])) 0488 0489 try: 0490 import psyco 0491 psyco.full() 0492 except ImportError: 0493 pass 0494 0495 onlysrcs = set() 0496 onlykeys = set() 0497 sksep = ":" 0498 for arg in free_args: 0499 if os.path.isfile(arg): 0500 test = os.path.splitext(arg.split(os.path.sep)[-1])[0] 0501 onlysrcs.add(test) 0502 elif arg.startswith(sksep): 0503 test = arg[len(sksep):] 0504 if options.regex: 0505 test = _Wre(test) 0506 onlysrcs.add(test) 0507 else: 0508 if options.regex: 0509 arg = _Wre(arg) 0510 else: 0511 arg = identify(arg) 0512 onlykeys.add(arg) 0513 0514 onlysrcs = onlysrcs or None 0515 onlykeys = onlykeys or None 0516 0517 # Create and validate the trapnakron. 
0518 tp = trapnakron_ui() 0519 if options.modified: 0520 onlysrcs, onlykeys = _collect_mod_dkeys(tp, onlysrcs, onlykeys) 0521 validate(tp, onlysrcs, onlykeys, options.demoexp, options.expwkeys) 0522 0523 if options.statistics: 0524 _statistics(tp, onlysrcs, onlykeys) 0525 0526 0527 if __name__ == '__main__': 0528 _main() 0529