File indexing completed on 2024-10-13 13:24:39

0001 #!/usr/bin/env python3
0002 # -*- coding: UTF-8 -*-
0003 
0004 try:
0005     import fallback_import_paths
0006 except:
0007     pass
0008 
0009 import sys
0010 import os
0011 import re
0012 import locale
0013 
0014 from pology import PologyError, version, _, n_
0015 from pology.lang.sr.wconv import ctol, hictoall
0016 from pology.lang.sr.trapnakron import rootdir
0017 from pology.lang.sr.trapnakron import trapnakron_ui
0018 from pology.lang.sr.trapnakron import norm_pkey, norm_rtkey
0019 from pology.lang.sr.trapnakron import _disamb_marker
0020 from pology.colors import ColorOptionParser
0021 from pology.fsops import str_to_unicode
0022 from pology.normalize import identify
0023 from pology.report import report, warning, format_item_list
0024 from pology.vcs import VcsSubversion
0025 
0026 
def validate (tp, onlysrcs=None, onlykeys=None, demoexp=False, expwkeys=False):
    """
    Validate derivations of a trapnakron, reporting problems as warnings.

    For every selected derivation the following is checked:
    that all its environments are known, that its normalized nominative
    forms do not coincide with those of a derivation having different
    properties, and that a derivation defining the basic nominative
    also defines a known gender.

    Parameters:
      - tp: the trapnakron object to validate; the methods used here are
        dkeys, source_name, source_pos, envs, get2, props, syns,
        altdkeys and empty_pcache
      - onlysrcs: collection of tests (see _match_text) selecting
        derivations by source name, or None to take all sources
      - onlykeys: collection of tests selecting derivations by
        derivation key, or None to take all keys
      - demoexp: if true, report a sample of expanded properties for
        each derivation in which no problem was found
      - expwkeys: if true (and demoexp is in effect), precede each sample
        with a comment line listing all keys of the derivation

    Returns the total number of problems found.
    """

    needed_pkeys = set()

    # Sets of nominative property keys, ordered by priority.
    nom_pkeys = (
        ["н"],
        ["нм", "нж", "нс", "ну"],
    )
    needed_pkeys.update(sum(nom_pkeys, []))

    gender_pkey = "_род"
    needed_pkeys.add(gender_pkey)

    known_genders = set(("м", "ж", "с", "у"))
    known_genders.update(list(map(ctol, known_genders)))

    # Pairs of (derivation key suffix, environment suffix)
    # for alternative derivations.
    known_alts = [
        ("_s", "сист"),
        ("_a", "алт"),
        ("_a2", "алт2"),
        ("_a3", "алт3"),
    ]
    base_envs = ["", "л", "иј", "ијл"]
    all_envs = set(base_envs)
    for aenv in [x[1] for x in known_alts]:
        all_envs.update(x + aenv for x in base_envs)

    if demoexp:
        demoexp_pkeys = ["н", "г", "д", "а", "в", "и",
                         "нк", "гк", "дк", "ак", "вк",
                         "нм", "нмп"]
        needed_pkeys.update(demoexp_pkeys)

    # Last seen derivation key by normalized nominative.
    dkeys_by_rtkey = {}

    # Sort keys such that derivations are checked by file and position.
    dkeys = tp.dkeys(single=onlykeys is None)
    def sortkey (x):
        path, lno, cno = tp.source_pos(x)
        return path.count(os.path.sep), path, lno, cno
    dkeys = sorted(dkeys, key=sortkey)

    nproblems = 0
    # Tests which have not matched anything yet, for final reporting.
    unmatched_srcs = set(onlysrcs) if onlysrcs is not None else None
    unmatched_keys = set(onlykeys) if onlykeys is not None else None
    reported_fmtexps = set()

    for dkey in dkeys:
        srcname = tp.source_name(dkey)
        path, lno, cno = tp.source_pos(dkey)
        cnproblems = 0

        if (   (    onlysrcs is not None
                and not _match_text(srcname, onlysrcs, unmatched_srcs))
            or (    onlykeys is not None
                and not _match_text(dkey, onlykeys, unmatched_keys))
        ):
            continue

        try:
            # Collect needed properties, once per environment suffix
            # (the empty suffix stands for the basic derivation).
            aprops = []
            seenesuffs = set()
            cenvs = tp.envs(dkey)
            for cenv in cenvs:
                if cenv != "":
                    envmatched = False
                    for ksuff, esuff in known_alts:
                        if cenv in all_envs and cenv.endswith(esuff):
                            envmatched = True
                            break
                else:
                    envmatched = True
                    ksuff, esuff = "", ""
                if envmatched and esuff not in seenesuffs:
                    dkeym = dkey + ksuff
                    props = dict([(x, tp.get2(dkeym, norm_pkey(x)))
                                   for x in needed_pkeys])
                    aprops.append((esuff, props))
                    seenesuffs.add(esuff)
                elif cenv not in all_envs:
                    warning(_("@info",
                              "Derivation at %(file)s:%(line)d:%(col)d "
                              "defines unknown environment '%(env)s'.",
                              file=path, line=lno, col=cno, env=cenv))
                    cnproblems += 1
        except Exception as e:
            warning(str_to_unicode(str(e)))
            cnproblems += 1
            # The derivation cannot be checked further, but problems
            # found so far must still enter the total.
            nproblems += cnproblems
            continue

        for esuff, props in aprops:
            # Assure all nominative forms are unique.
            for pkeys in nom_pkeys: # select first nominative set by priority
                pvals = [props.get(x) for x in pkeys]
                noms = [x for x in pvals if x is not None]
                if noms:
                    break
            if noms:
                rtkeys = list(map(norm_rtkey, noms))
                for rtkey in rtkeys:
                    odkey = dkeys_by_rtkey.get(rtkey)
                    if odkey is not None and tp.props(dkey) != tp.props(odkey):
                        opath, olno, ocno = tp.source_pos(odkey)
                        warning(_("@info",
                                  "Derivation at %(file1)s:%(line1)d:%(col1)d "
                                  "has normalized nominative equal to "
                                  "derivation at %(file2)s:%(line2)d:%(col2)d; "
                                  "consider adding a disambiguation marker "
                                  "(%(dchar)s).",
                                  file1=path, line1=lno, col1=cno,
                                  file2=opath, line2=olno, col2=ocno,
                                  dchar=_disamb_marker))
                        cnproblems += 1
                for rtkey in rtkeys: # must be in new loop
                    dkeys_by_rtkey[rtkey] = dkey

            # Assure presence of gender on noun derivations.
            if props.get(nom_pkeys[0][0]) is not None:
                gender = props.get(gender_pkey)
                if gender is None:
                    warning(_("@info",
                              "Derivation at %(file)s:%(line)d:%(col)d "
                              "does not define gender.",
                              file=path, line=lno, col=cno))
                    cnproblems += 1
                else:
                    for gender in hictoall(gender):
                        if gender not in known_genders:
                            warning(_("@info",
                                      "Derivation at %(file)s:%(line)d:%(col)d "
                                      "defines unknown gender '%(gen)s'.",
                                      file=path, line=lno, col=cno, gen=gender))
                            cnproblems += 1

            # Show selection of expanded properties if requested.
            if demoexp and not cnproblems:
                demoprops = [(x, props.get(x)) for x in demoexp_pkeys]
                demoprops = [x for x in demoprops if x[1] is not None]
                fmtprops = ["%s=%s" % (x[0], _escape_pval(x[1]))
                            for x in demoprops]
                fmtsyns = ["%s" % _escape_syn(x) for x in tp.syns(dkey)]
                fmtexp = ", ".join(fmtsyns) + ": " + ", ".join(fmtprops)
                if expwkeys:
                    fmtdkeys = ", ".join(sorted(tp.altdkeys(dkey)))
                    fmtexp = "# " + fmtdkeys + "\n" + fmtexp
                # Basic expansions are reported only once; expansions
                # in alternative environments are reported as indented
                # "@env" lines, unless the equal basic text was
                # already reported.
                if fmtexp not in reported_fmtexps:
                    if not esuff:
                        report(fmtexp)
                        reported_fmtexps.add(fmtexp)
                    else:
                        afmtexp = "    @" + esuff + ": " + ", ".join(fmtprops)
                        report(afmtexp)

        nproblems += cnproblems
        tp.empty_pcache()

    # Report selection tests which did not match any derivation.
    if unmatched_srcs:
        fmtsrcs = format_item_list(sorted(getattr(x, "pattern", x)
                                          for x in unmatched_srcs))
        warning(_("@info",
                  "Sources requested by name not found: %(srclist)s.",
                  srclist=fmtsrcs))
    if unmatched_keys:
        fmtkeys = format_item_list(sorted(getattr(x, "pattern", x)
                                          for x in unmatched_keys))
        warning(_("@info",
                  "Derivations requested by key not found: %(keylist)s.",
                  keylist=fmtkeys))

    return nproblems
0197 
0198 
class _Wre (object):
    """
    Regular expression test for _match_text.

    Keeps both the original pattern string (attribute 'pattern')
    and its compiled form (attribute 'regex').
    """

    def __init__ (self, pattern):

        self.pattern = pattern
        self.regex = re.compile(pattern, flags=re.UNICODE)
0205 
0206 
def _match_text (text, tests, unmatched_tests=None):
    """
    Check whether the text is matched by at least one of the tests.

    A test can be a string (matches if equal to the text),
    a callable (matches if it returns a true value for the text),
    or a _Wre object (matches if its regex is found in the text).

    If unmatched_tests is given, every test which matches the text
    is removed from it. Tests are evaluated past the first match only
    while some of them are still present in unmatched_tests, so that
    a test is not left there merely because another test matched
    the same text first.

    Returns True if any test matched, False otherwise.
    Raises PologyError if an evaluated test is of unknown type.
    """

    def matches (test):
        if isinstance(test, str):
            return test == text
        if callable(test):
            return bool(test(text))
        if isinstance(test, _Wre):
            return test.regex.search(text) is not None
        raise PologyError(
            _("@info",
              "Unknown matcher type '%(type)s'.",
              type=type(test)))

    match = False
    for test in tests:
        # After the first match, only tests still awaiting a match
        # are worth evaluating (unmatched_tests is non-empty here,
        # or the loop would have ended).
        if match and test not in unmatched_tests:
            continue
        if not matches(test):
            continue
        match = True
        if unmatched_tests is not None and test in unmatched_tests:
            unmatched_tests.remove(test)
        if not unmatched_tests:
            break

    return match
0234 
0235 
def _escape_pval (pval):
    """
    Return the property value with every comma preceded by a backslash.
    """

    # The backslash is doubled in the literal; a bare "\," is
    # an invalid escape sequence.
    return pval.replace(",", "\\,")
0240 
0241 
def _escape_syn (pval):
    """
    Return the syntagma with every comma and colon preceded by a backslash.
    """

    # Backslashes are doubled in the literals; bare "\," and "\:"
    # are invalid escape sequences.
    pval = pval.replace(",", "\\,")
    pval = pval.replace(":", "\\:")
    return pval
0247 
0248 
def _collect_mod_dkeys (tp, onlysrcs=None, onlykeys=None):
    """
    Collect keys of derivations touched by added lines in the version
    control diff of the trapnakron root directory.

    Only files with the .sd extension are considered. If onlysrcs is
    given, a file is considered only if its source name (base name
    without extension) is matched by it; if onlykeys is given,
    a derivation key is collected only if matched by it.

    Returns a tuple of None and the set of collected derivation keys,
    to be used as the new source and key selections, respectively.
    """

    # Collect the unified diff of trapnakron root.
    vcs = VcsSubversion()
    udiff = vcs.diff(rootdir())
    udiff = _elim_moved_blocks(udiff)

    # Collect key syntagmas related to added lines.
    asyns = set()
    skip_file = True
    prev_syns = None
    # Syntagmas of the last added derivation. Must exist before the loop,
    # since an added environment line may be seen before any head line.
    syns = []
    for tag, data in udiff:
        if tag == "@":
            continue

        fpath = data
        if tag == ":":
            if not fpath.endswith(".sd"):
                skip_file = True
            else:
                srcname = os.path.splitext(os.path.basename(fpath))[0]
                if onlysrcs is None:
                    skip_file = False
                else:
                    skip_file = not _match_text(srcname, onlysrcs)
        if skip_file:
            continue

        line = data.strip()
        if line.startswith(("#", ">")) or not line:
            continue
        if tag == " ":
            # Context head line: remember its syntagmas, in case
            # environment lines are added to it.
            if not line.startswith("@"):
                prev_syns = _parse_syns(line)
        elif tag == "+":
            if not line.startswith("@"):
                syns = _parse_syns(line)
            elif prev_syns:
                syns = prev_syns
            # Otherwise an added environment line belongs to
            # the last added derivation, still held in syns.
            asyns.update(syns)
            prev_syns = []

    # Collect derivation keys from syntagmas.
    onlykeys_mod = set()
    dkeys_in_tp = set(tp.dkeys(single=True))
    for syn in asyns:
        dkey = identify(syn)
        if (    dkey and dkey in dkeys_in_tp
            and (onlykeys is None or _match_text(dkey, onlykeys))
        ):
            onlykeys_mod.add(dkey)

    return None, onlykeys_mod
0302 
0303 
0304 # Eliminate difference blocks due to pure moving between and within files.
def _elim_moved_blocks (udiff):
    """
    Drop difference blocks which only reflect moved text.

    The diff is a list of (tag, data) pairs, where the tags seen here
    are ":" (file), "@" (start of a difference block), "+" (added line),
    "-" (removed line); pairs with any other tag are carried along.
    A block made only of additions or only of removals is dropped when
    its joined text makes up exactly one pure-addition block and exactly
    one pure-removal block in the whole diff.

    Returns the new list; the input list is not modified.
    """

    entries = udiff + [("@", None)] # sentry, closes the last block

    # First pass: count how many times each pure-addition and
    # pure-removal block text appears.
    added_count = {}
    removed_count = {}
    kind = ""
    texts = []
    for tag, data in entries:
        if tag == "@":
            if kind == "+" or kind == "-":
                counts = added_count if kind == "+" else removed_count
                key = "".join(texts)
                counts[key] = counts.get(key, 0) + 1
            kind, texts = "", []
        elif tag == "+" or tag == "-":
            if kind in ("", tag):
                kind = tag
                texts.append(data)
            else:
                kind = "xxx" # mixed block, never a pure move

    # Second pass: copy over everything except moved blocks.
    kept = []
    block = []
    kind, texts = "", []
    for tag, data in entries:
        if tag != ":" and tag != "@":
            block.append((tag, data))
            if tag == "+" or tag == "-":
                if kind in ("", tag):
                    kind = tag
                    texts.append(data)
                else:
                    kind = "xxx"
            continue

        # A file or block boundary: decide on the block collected so far.
        if block:
            key = "".join(texts)
            moved = (    kind in ("+", "-")
                     and added_count.get(key, 0) == 1
                     and removed_count.get(key, 0) == 1)
            if not moved:
                kept.extend(block)
        kind, texts = "", []
        if tag == ":":
            block = []
            kept.append((tag, data))
        else:
            block = [(tag, data)]

    return kept
0358 
0359 
def _parse_syns (line):
    """
    Extract key syntagmas from the head of a derivation line.

    Syntagmas are separated by commas and the last one ends with a colon;
    text not terminated by a comma or colon is not returned.
    A backslash makes the following character a literal part of the
    syntagma. A tag, introduced by a tilde, is left out: a tag opened
    with "~{" extends to the closing brace, any other up to the character
    preceding a comma, colon or whitespace. Each syntagma is stripped
    of surrounding whitespace and of one leading "|".

    Returns the list of syntagmas; empty if the stripped line starts
    with "#" or ">".
    """

    if line.strip()[:1] in ("#", ">"):
        return []

    found = []
    pieces = [] # characters of the syntagma being read
    closer = None # None outside a tag, else the tag's closing character
    end = len(line)
    idx = 0
    while idx < end:
        ch = line[idx]
        nxt = line[idx + 1:idx + 2] # empty at the end of line
        if ch == "\\":
            # Take the escaped character literally and jump over it.
            pieces.append(nxt)
            idx += 2
            continue
        if closer is not None:
            if closer:
                if ch == closer:
                    closer = None
            elif nxt in (",", ":") or nxt.isspace():
                closer = None
        elif ch == "~":
            closer = "}" if nxt == "{" else ""
        elif ch in (",", ":"):
            syn = "".join(pieces).strip()
            if syn.startswith("|"):
                syn = syn[1:]
            found.append(syn)
            if ch == ":":
                break
            pieces = []
        else:
            pieces.append(ch)
        idx += 1

    return found
0402 
0403 
def _statistics (tp, onlysrcs, onlykeys):
    """
    Report statistics on derivations in the trapnakron.

    Derivations are counted in total and per source file, taking into
    account only those which pass the selections by source name
    (onlysrcs) and derivation key (onlykeys); a selection which is None
    passes everything.
    """

    counted = set()
    per_file = {} # file path -> [source name, number of derivations]
    for dkey in tp.dkeys(single=True):
        srcname = tp.source_name(dkey)
        fpath, _lno, _cno = tp.source_pos(dkey)

        if onlysrcs is not None and not _match_text(srcname, onlysrcs):
            continue
        if onlykeys is not None and not _match_text(dkey, onlykeys):
            continue

        counted.add(dkey)
        entry = per_file.setdefault(fpath, [srcname, 0])
        entry[1] += 1

    report("-" * 40)
    if not (onlysrcs is None and onlykeys is None):
        report(_("@info statistics; side note stating that not all entries "
                 "have been taken into account, but only some selected",
                 "(Selection active.)"))
    report(_("@info statistics",
             "Total derivations: %(num)d",
             num=len(counted)))
    if not per_file:
        return

    nfiles = len(per_file)
    report(_("@info statistics",
             "Total files: %(num)d",
             num=nfiles))
    report(_("@info statistics",
             "Average derivations per file: %(num).1f",
             num=(float(len(counted)) / nfiles)))
    # Greatest by count, then by source name.
    topcount, topname = max((count, name)
                            for name, count in per_file.values())
    report(_("@info statistics",
             "Most derivations in a file: %(num)d (%(file)s)",
             num=topcount, file=topname))
0441 
0442 
def _main ():
    """
    Run the command line tool: parse options and arguments,
    validate the selected derivations, and show statistics if requested.

    A free argument which is a path of an existing file selects the
    source by that file's base name without extension; an argument
    starting with a colon selects the source by the name after the
    colon; any other argument selects a derivation by key.
    """

    locale.setlocale(locale.LC_ALL, "")

    opars = ColorOptionParser(
        usage=_("@info command usage",
            "%(cmd)s [OPTIONS] [DKEY|SRCPATH|:SRCNAME]...",
            cmd="%prog"),
        description=_("@info command description",
            "Check validity and expand derivations from internal trapnakron."),
        version=_("@info command version",
            "%(cmd)s (Pology) %(version)s\n"
            "Copyright © 2009, 2010 "
            "Chusslove Illich (Часлав Илић) &lt;%(email)s&gt;",
            cmd="%prog", version=version(), email="caslav.ilic@gmx.net"))

    # All options are plain switches, off by default.
    switches = (
        ("-e", "--expansion-sample", "demoexp",
         _("@info command line option description",
           "Show a sample of expanded properties for "
           "each valid derivation.")),
        ("-k", "--show-keys", "expwkeys",
         _("@info command line option description",
           "When expanding, also show all derivation keys by derivation.")),
        ("-m", "--modified", "modified",
         _("@info command line option description",
           "Validate or expand only modified derivations.")),
        ("-r", "--regex", "regex",
         _("@info command line option description",
           "Source names and derivation keys given in command line "
           "are regular expressions.")),
        ("-s", "--statistics", "statistics",
         _("@info command line option description",
           "Show statistics.")),
    )
    for shortopt, longopt, dest, helptext in switches:
        opars.add_option(shortopt, longopt,
                         action="store_true", dest=dest, default=False,
                         help=helptext)

    options, free_args = opars.parse_args(str_to_unicode(sys.argv[1:]))

    # Use psyco when it is available.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    # Sort free arguments into source name tests and derivation key tests.
    src_tests = set()
    key_tests = set()
    src_prefix = ":"
    for arg in free_args:
        if os.path.isfile(arg):
            fname = arg.split(os.path.sep)[-1]
            src_tests.add(os.path.splitext(fname)[0])
            continue
        if arg.startswith(src_prefix):
            name = arg[len(src_prefix):]
            src_tests.add(_Wre(name) if options.regex else name)
            continue
        key_tests.add(_Wre(arg) if options.regex else identify(arg))

    # No tests of a kind means no selection of that kind.
    onlysrcs = src_tests if src_tests else None
    onlykeys = key_tests if key_tests else None

    # Create and validate the trapnakron.
    tp = trapnakron_ui()
    if options.modified:
        onlysrcs, onlykeys = _collect_mod_dkeys(tp, onlysrcs, onlykeys)
    validate(tp, onlysrcs, onlykeys, options.demoexp, options.expwkeys)

    if options.statistics:
        _statistics(tp, onlysrcs, onlykeys)
0524         _statistics(tp, onlysrcs, onlykeys)
0525 
0526 
# Run the command line tool only when executed as a script, not on import.
if __name__ == '__main__':
    _main()
0529