pology/sieve/stats.py

0001 # -*- coding: UTF-8 -*-
0002
0003 """
0004 Catalog statistics: message and word counts, etc.
0005
0006 Documented in C{doc/user/sieving.docbook}.
0007
0008 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0009 @license: GPLv3
0010 """
0011
0012 import codecs
0013 import locale
0014 import os
0015 import sys
0016
0017 from pology import _, n_
0018 from pology.catalog import Catalog
0019 from pology.message import MessageUnsafe
0020 from pology.colors import ColorString, cjoin, cinterp
0021 from pology.comments import parse_summit_branches
0022 from pology.diff import tdiff
0023 from pology.fsops import collect_catalogs
0024 from pology.getfunc import get_hook_ireq
0025 from pology.report import report, warning, format_item_list
0026 from pology.split import proper_words
0027 from pology.tabulate import tabulate
0028 from pology.sieve import SieveError
0029
0030
def setup_sieve (p):
    """
    Declare the description and the parameters of the statistics sieve.

    Every parameter is registered through C{p.add_param} with its name,
    value type, optional default and placeholder, and a description text.

    @param p: sieve parameter collector; only its C{set_desc} and
        C{add_param} methods are used here
    """
    # NOTE(review): the texts passed to _() (including the context strings
    # and their spelling) are presumably translation lookup keys, so they
    # are deliberately left untouched -- confirm before editing any of them.

    p.set_desc(_("@info sieve discription",
    "Compute translation statistics.\n"
    "\n"
    "Provides basic count of number of messages by type (translated, fuzzy, "
    "etc.), along with words and character counts, and some other derived "
    "statistics on request."
    ))

    # Text preprocessing and optional extra output.
    p.add_param("accel", str, multival=True,
                metavar=_("@info sieve parameter value placeholder", "CHAR"),
                desc=_("@info sieve parameter discription",
    "Character which is used as UI accelerator marker in text fields, "
    "to remove it before counting. "
    "If a catalog defines accelerator marker in the header, "
    "this value overrides it."
    ))
    p.add_param("detail", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Compute and display some derived statistical quantities."
    ))
    p.add_param("incomplete", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "List catalogs which are not fully translated, with incompletness counts."
    ))
    p.add_param("incompfile", str,
                metavar=_("@info sieve parameter value placeholder", "FILE"),
                desc=_("@info sieve parameter discription",
    "Write paths of catalogs that are not fully translated into a file, "
    "one per line."
    ))
    p.add_param("templates", str,
                metavar=_("@info sieve parameter value placeholder",
                          "FIND:REPLACE"),
                desc=_("@info sieve parameter discription",
    "Count in templates without a corresponding catalog (i.e. translation on "
    "it has not started yet) into statistics. "
    "Assumes that translated catalogs and templates live in two root "
    "directories with same structure; then for each path of an existing "
    "catalog, its directory is taken and the path to corresponding templates "
    "directory constructed by replacing first occurence of FIND with REPLACE."
    ))

    # Restrictions on which messages are counted.
    p.add_param("branch", str, seplist=True,
                metavar=_("@info sieve parameter value placeholder", "BRANCH"),
                desc=_("@info sieve parameter discription",
    "In summit catalogs, count in only messages belonging to given branch. "
    "Several branches can be given as comma-separated list."
    ))
    p.add_param("maxwords", int,
                metavar=_("@info sieve parameter value placeholder", "NUMBER"),
                desc=_("@info sieve parameter discription",
    "Count in only messages which have at most this many words, "
    "either in original or translation."
    ))
    p.add_param("minwords", int,
                metavar=_("@info sieve parameter value placeholder", "NUMBER"),
                desc=_("@info sieve parameter discription",
    "Count in only messages which have at least this many words, "
    "either in original or translation."
    ))
    p.add_param("lspan", str,
                metavar=_("@info sieve parameter value placeholder", "FROM:TO"),
                desc=_("@info sieve parameter discription",
    "Count in only messages at or after line FROM, and before line TO. "
    "If FROM is empty, 0 is assumed; "
    "if TO is empty, total number of lines is assumed."
    ))
    p.add_param("espan", str,
                metavar=_("@info sieve parameter value placeholder", "FROM:TO"),
                desc=_("@info sieve parameter discription",
    "Count in only messages at or after entry FROM, and before entry TO. "
    "If FROM is empty, 0 is assumed; "
    "if TO is empty, total number of entries is assumed."
    ))

    # Grouping and presentation of the results.
    p.add_param("bydir", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Report statistics per leaf directory in searched paths."
    ))
    p.add_param("byfile", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Report statistics per catalog."
    ))
    p.add_param("wbar", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Show statistics in form of word bars."
    ))
    p.add_param("msgbar", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Show statistics in form of message bars."
    ))
    p.add_param("msgfmt", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Show a minimal summary of the statistics (like msgfmt)."
    ))
    p.add_param("absolute", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Scale lengths of word and message bars to numbers they represent, "
    "rather than relative to percentage of translation state. "
    "Useful with '%(par1)s' and '%(par2)s' parameters, "
    "to compare sizes of different translation units.",
    par1="byfile", par2="bydir"
    ))

    # Modifiers of how counts are computed, and catalog selection.
    p.add_param("ondiff", bool, defval=False,
                desc=_("@info sieve parameter discription",
    "Split word and character counts of fuzzy messages "
    "into translated and untranslated categories (leaving zero in fuzzy), "
    "based on difference ratio between current and previous original text."
    ))
    p.add_param("mincomp", float, defval=None,
                metavar=_("@info sieve parameter value placeholder", "RATIO"),
                desc=_("@info sieve parameter discription",
    "Include into statistics only catalogs with sufficient completeness, "
    "as ratio of translated to other messages (real value between 0 and 1)."
    ))
    p.add_param("filter", str, multival=True,
                metavar=_("@info sieve parameter value placeholder", "HOOK"),
                desc=_("@info sieve parameter discription",
    "F1A hook specification, to filter the translation through. "
    "Several filters can be specified by repeating the parameter."
    ))
0152
0153
0154 class Sieve (object):
0155
0156     def __init__ (self, params):
0157
0158         self.p = params
0159
0160         # Templates correspondence.
0161         # Mapping of catalogs to templates, in form of <search>:<replace>.
0162         # For each catalog file path, the first <search> substring is replaced
0163         # by <replace>, and .po replaced with .pot, to construct its template
0164         # file path. All templates not found under such paths are reported.
0165         # Furthermore, all subdirs of these paths are searched for templates
0166         # without corresponding catalogs, and every such template is counted
0167         # as fully untranslated PO.
0168         if self.p.templates:
0169             if ":" not in self.p.templates:
0170                 self.tspec_srch = self.p.templates
0171                 self.tspec_repl = ""
0172             else:
0173                 self.tspec_srch, self.tspec_repl = self.p.templates.split(":", 1)
0174
0175         # Turn off table display if a bar view has been selected.
0176         self.p.table = True
0177         if self.p.msgbar or self.p.wbar or self.p.msgfmt:
0178             self.p.table = False
0179
0180         # Filenames of catalogs which are not fully translated.
0181         self.incomplete_catalogs = {}
0182
0183         # Counted categories.
0184         self.count_spec = (
0185             ("trn",
0186              _("@title:row translated messages/words/characters",
0187                "translated")),
0188             ("fuz",
0189              _("@title:row fuzzy messages/words/characters",
0190                "fuzzy")),
0191             ("unt",
0192              _("@title:row untranslated messages/words/characters",
0193                "untranslated")),
0194             ("tot",
0195              _("@title:row fuzzy messages/words/characters",
0196                "total")),
0197             ("obs",
0198              _("@title:row fuzzy messages/words/characters",
0199                "obsolete")),
0200         )
0201
0202         # FIXME: After parameter parser can deliver requested sequence type.
0203         if self.p.branch is not None:
0204             self.p.branch = set(self.p.branch)
0205
0206         # Parse line/entry spans.
0207         def parse_span (spanspec):
0208             lst = spanspec is not None and spanspec.split(":") or ("", "")
0209             if len(lst) != 2:
0210                 raise SieveError(
0211                     _("@info",
0212                       "Wrong number of elements in span "
0213                       "specification '%(spec)s'.",
0214                       spec=self.p.lspan))
0215             nlst = []
0216             for el in lst:
0217                 if not el:
0218                     nlst.append(None)
0219                 else:
0220                     try:
0221                         nlst.append(int(el))
0222                     except:
0223                         raise SieveError(
0224                             _("@info",
0225                               "Not an integer number in span "
0226                               "specification '%(spec)s'.",
0227                               spec=self.p.lspan))
0228             return tuple(nlst)
0229         self.lspan = parse_span(self.p.lspan)
0230         self.espan = parse_span(self.p.espan)
0231
0232         # Number of counts per category:
0233         # messages, words in original, words in translation,
0234         # characters in original, characters in translation.
0235         self.counts_per_cat = 5
0236
0237         # Category counts per catalog filename.
0238         self.counts = {}
0239
0240         # Collections of all confirmed templates and tentative template subdirs.
0241         self.matched_templates = {}
0242         self.template_subdirs = []
0243         if self.p.templates:
0244             for rpath in params.root_paths:
0245                 if os.path.isfile(rpath):
0246                     rpath = os.path.dirname(rpath)
0247                 rpath = rpath.replace(self.tspec_srch, self.tspec_repl, 1)
0248                 self.template_subdirs.append(rpath)
0249         # Map of template to translation subdirs.
0250         self.mapped_template_subdirs = {}
0251
0252         # Some indicators of metamessages.
0253         self.xml2po_meta_msgid = dict([(x, True) for x in
0254             ("translator-credits",)])
0255         self.xml2pot_meta_msgid = dict([(x, True) for x in
0256             ("ROLES_OF_TRANSLATORS", "CREDIT_FOR_TRANSLATORS")])
0257         self.kde_meta_msgctxt = dict([(x, True) for x in
0258             ("NAME OF TRANSLATORS", "EMAIL OF TRANSLATORS")])
0259
0260         # Resolve filtering hooks.
0261         self.pfilters = []
0262         for hreq in self.p.filter or []:
0263             self.pfilters.append(get_hook_ireq(hreq, abort=True))
0264
0265         # Indicators to the caller:
0266         self.caller_sync = False # no need to sync catalogs
0267         self.caller_monitored = False # no need for monitored messages
0268
0269
0270     def _count_zero (self):
0271
0272         return dict([(x[0], [0] * self.counts_per_cat)
0273                      for x in self.count_spec])
0274
0275
0276     def _count_sum (self, c1, c2):
0277
0278         cs = self._count_zero()
0279         for cat, catname in self.count_spec:
0280             for i in range(self.counts_per_cat):
0281                 cs[cat][i] = c1[cat][i] + c2[cat][i]
0282
0283         return cs
0284
0285
0286     def process_header (self, hdr, cat):
0287
0288         # Establish counts for this file.
0289         if cat.filename not in self.counts:
0290             self.counts[cat.filename] = self._count_zero()
0291         self.count = self.counts[cat.filename]
0292
0293         # If template correspondence requested, handle template matching.
0294         if (    self.p.templates
0295             and not cat.filename.endswith(".pot")):
0296
0297             # Construct expected template path.
0298             tpath = cat.filename.replace(self.tspec_srch, self.tspec_repl, 1)
0299             pdot = tpath.rfind(".")
0300             if pdot >= 0:
0301                 tpath = tpath[:pdot] + ".pot"
0302             # Inform if the template does not exist.
0303             if not os.path.isfile(tpath):
0304                 warning(_("@info",
0305                           "Expected template catalog '%(file)s' is missing.",
0306                           file=tpath))
0307             # Indicate the template has been matched.
0308             if tpath not in self.matched_templates:
0309                 self.matched_templates[tpath] = True
0310
0311         # Force explicitly given accelerators.
0312         if self.p.accel is not None:
0313             cat.set_accelerator(self.p.accel)
0314
0315
0316     def process (self, msg, cat):
0317
0318         # Summit: if branches were given, skip the message if it does not
0319         # belong to any of the given branches.
0320         if self.p.branch:
0321             msg_branches = parse_summit_branches(msg)
0322             if not set.intersection(self.p.branch, msg_branches):
0323                 return
0324
0325         # If line/entry spans given, skip message if not in range.
0326         if self.lspan[0] is not None and msg.refline < self.lspan[0]:
0327             return
0328         if self.lspan[1] is not None and msg.refline >= self.lspan[1]:
0329             return
0330         if self.espan[0] is not None and msg.refentry < self.espan[0]:
0331             return
0332         if self.espan[1] is not None and msg.refentry >= self.espan[1]:
0333             return
0334
0335         # Decide if a metamessage:
0336         ismeta = False
0337         # - msgid in form "@@<tag>: ..." from xml2po
0338         if msg.msgid.startswith("@@"):
0339             ps = msg.msgid.find(":")
0340             ismeta = (ps >= 0 and msg.msgid[2:ps].isalnum())
0341         # - translator credits from xml2po and xml2pot
0342         if (   msg.msgid in self.xml2po_meta_msgid
0343             or msg.msgid in self.xml2pot_meta_msgid
0344         ):
0345             ismeta = True
0346         # - translator credits in KDE GUI
0347         if msg.msgctxt in self.kde_meta_msgctxt:
0348             ismeta = True
0349
0350         # Prepare filtered message for counting.
0351         if self.pfilters:
0352             msg = MessageUnsafe(msg)
0353             for pfilter in self.pfilters:
0354                 for i in range(len(msg.msgstr)):
0355                     msg.msgstr[i] = pfilter(msg.msgstr[i])
0356
0357         # Count the words and characters in original and translation.
0358         # Remove shortcut markers prior to counting; don't include words
0359         # which do not start with a letter; remove scripted part.
0360         # For plural messages compute averages of msgid and msgstr groups,
0361         # to normalize comparative counts on varying number of plural forms.
0362         nwords = {"orig" : 0, "tran" : 0}
0363         nchars = {"orig" : 0, "tran" : 0}
0364         msgids = [msg.msgid]
0365         if msg.msgid_plural is not None:
0366             msgids.append(msg.msgid_plural)
0367         for src, texts in (("orig", msgids), ("tran", msg.msgstr)):
0368             if ismeta: # consider metamessages as zero counts
0369                 continue
0370             lnwords = [] # this group's word count, for averaging
0371             lnchars = [] # this group's character count, for averaging
0372             for text in texts:
0373                 pf = text.find("|/|")
0374                 if pf >= 0:
0375                     text = text[0:pf]
0376                 words = proper_words(text, True, cat.accelerator(), msg.format)
0377                 # If there are no proper words but there are some characters,
0378                 # set to one empty word in order for a fuzzy or
0379                 # an untranslated message not to be considered translated
0380                 # when only word counts are observed.
0381                 if not words and text:
0382                     words = [""]
0383                 lnwords.append(len(words))
0384                 lnchars.append(len("".join(words)))
0385             nwords[src] += int(round(float(sum(lnwords)) / len(texts)))
0386             nchars[src] += int(round(float(sum(lnchars)) / len(texts)))
0387             #nchars[src] += (nwords[src] - 1) # nominal space per each two words
0388
0389         # If the number of words has been limited, skip the message if it
0390         # does not fall in the range.
0391         if self.p.maxwords is not None:
0392             if not (   nwords["orig"] <= self.p.maxwords
0393                     or nwords["tran"] <= self.p.maxwords):
0394                 return
0395         if self.p.minwords is not None:
0396             if not (   nwords["orig"] >= self.p.minwords
0397                     or nwords["tran"] >= self.p.minwords):
0398                 return
0399
0400         # Split word and character counts in fuzzy original if requested.
0401         nswords = {}
0402         nschars = {}
0403         if self.p.ondiff and msg.fuzzy and msg.msgid_previous is not None:
0404             diff, dr = tdiff(msg.msgid_previous, msg.msgid, diffr=True)
0405             # Reduce difference ratio to a smaller range by some threshold.
0406             # Texts more different than the threshold need full review.
0407             drth = 0.4
0408             #dr2 = dr if dr < drth else 1.0
0409             dr2 = min(dr / drth, 1.0)
0410             # Split counts between primary fuzzy count, and secondary
0411             # translated, so that total remains the same.
0412             nswords.update({"trn": {}, "fuz": {}, "unt": {}})
0413             nschars.update({"trn": {}, "fuz": {}, "unt": {}})
0414             for nitems, nitems2, src in (
0415                 (nwords, nswords, "orig"), (nwords, nswords, "tran"),
0416                 (nchars, nschars, "orig"), (nchars, nschars, "tran"),
0417             ):
0418                 num = nitems[src]
0419                 # Difference ratio of 0 can happen if the new and old texts
0420                 # are the same, normally when only the context has changed.
0421                 # Fuzzy counts should not be totally eliminated then,
0422                 # as it should be seen that message needs updating.
0423                 if dr2 > 0.0:
0424                     rnum = int(round(dr2 * num + 0.5)) # round up
0425                 else:
0426                     rnum = 1
0427                 rnum = min(rnum, num) # in case of rounding overflow
0428                 nitems2["trn"][src] = num - rnum
0429                 nitems2["fuz"][src] = 0
0430                 nitems2["unt"][src] = rnum
0431
0432         # Detect categories and add the counts.
0433         categories = set()
0434
0435         if not msg.obsolete: # do not count obsolete into totals
0436             self.count["tot"][0] += 1
0437             categories.add("tot")
0438             if nswords:
0439                 categories.update(list(nswords.keys()))
0440
0441         if msg.obsolete: # do not split obsolete into fuzzy/translated
0442             self.count["obs"][0] += 1
0443             categories.add("obs")
0444             nswords = {}
0445             nschars = {}
0446         elif msg.translated:
0447             self.count["trn"][0] += 1
0448             categories.add("trn")
0449         elif msg.fuzzy:
0450             self.count["fuz"][0] += 1
0451             categories.add("fuz")
0452             if cat.filename not in self.incomplete_catalogs:
0453                 self.incomplete_catalogs[cat.filename] = True
0454         elif msg.untranslated:
0455             self.count["unt"][0] += 1
0456             categories.add("unt")
0457             if cat.filename not in self.incomplete_catalogs:
0458                 self.incomplete_catalogs[cat.filename] = True
0459
0460         for cat in categories:
0461             nwords1 = nswords.get(cat, nwords)
0462             nchars1 = nschars.get(cat, nchars)
0463             self.count[cat][1] += nwords1["orig"]
0464             self.count[cat][2] += nwords1["tran"]
0465             self.count[cat][3] += nchars1["orig"]
0466             self.count[cat][4] += nchars1["tran"]
0467
0468
0469     # Sort filenames as if templates-only were within language subdirs.
0470     def _sort_equiv_filenames (self, filenames):
0471
0472         def equiv_template_path (x):
0473             cdir = os.path.dirname(x)
0474             if cdir in self.mapped_template_subdirs:
0475                 cdir = self.mapped_template_subdirs[cdir]
0476                 return os.path.join(cdir, os.path.basename(x))
0477             else:
0478                 return x
0479
0480         filenames.sort(key=lambda x: equiv_template_path(x))
0481
0482
0483     def finalize (self):
0484
0485         # If template correspondence requested, handle POTs without POs.
0486         if self.template_subdirs:
0487             # Collect all catalogs in template subdirs.
0488             tpaths = collect_catalogs(self.template_subdirs)
0489             tpaths = list(filter(self.p.is_cat_included, tpaths))
0490             # Filter to have only POTs remain.
0491             tpaths = [x for x in tpaths if x.endswith(".pot")]
0492             # Filter to leave out matched templates.
0493             tpaths = [x for x in tpaths if x not in self.matched_templates]
0494             # Add stats on all unmatched templates.
0495             for tpath in tpaths:
0496                 cat = Catalog(tpath, monitored=False)
0497                 self.process_header(cat.header, cat)
0498                 for msg in cat:
0499                     self.process(msg, cat)
0500             # Map template to translation subdirs.
0501             for tpath in tpaths:
0502                 tsubdir = os.path.dirname(tpath)
0503                 subdir = tsubdir.replace(self.tspec_repl, self.tspec_srch, 1)
0504                 self.mapped_template_subdirs[tsubdir] = subdir
0505
0506         # If completeness limit in effect, eliminate catalogs not passing it.
0507         if self.p.mincomp is not None:
0508             ncounts = {}
0509             ninccats = {}
0510             for filename, count in self.counts.items():
0511                 cr = float(count["trn"][0]) / (count["tot"][0] or 1)
0512                 if cr >= self.p.mincomp:
0513                     ncounts[filename] = count
0514                     inccat = self.incomplete_catalogs.get(filename)
0515                     if inccat is not None:
0516                         ninccats[filename] = inccat
0517             self.counts = ncounts
0518             self.incomplete_catalogs = ninccats
0519
0520         # Assemble sets of total counts by requested divisions.
0521         count_overall = self._count_zero()
0522         counts_bydir = {}
0523         filenames_bydir = {}
0524         for filename, count in self.counts.items():
0525
0526             count_overall = self._count_sum(count_overall, count)
0527
0528             if self.p.bydir:
0529                 cdir = os.path.dirname(filename)
0530                 if cdir in self.mapped_template_subdirs:
0531                     # Pretend templates-only are within language subdir.
0532                     cdir = self.mapped_template_subdirs[cdir]
0533                 if cdir not in counts_bydir:
0534                     counts_bydir[cdir] = self._count_zero()
0535                     filenames_bydir[cdir] = []
0536                 counts_bydir[cdir] = self._count_sum(counts_bydir[cdir], count)
0537                 filenames_bydir[cdir].append(filename)
0538
0539         # Arrange sets into ordered list with titles.
0540         counts = []
0541         if self.p.bydir:
0542             cdirs = list(counts_bydir.keys());
0543             cdirs.sort()
0544             for cdir in cdirs:
0545                 if self.p.byfile:
0546                     self._sort_equiv_filenames(filenames_bydir[cdir])
0547                     for filename in filenames_bydir[cdir]:
0548                         counts.append((filename, self.counts[filename], False))
0549                 counts.append(("%s/" % cdir, counts_bydir[cdir], False))
0550             counts.append((_("@item:intable sum of all other entries",
0551                              "(overall)"), count_overall, True))
0552
0553         elif self.p.byfile:
0554             filenames = list(self.counts.keys())
0555             self._sort_equiv_filenames(filenames)
0556             for filename in filenames:
0557                 counts.append((filename, self.counts[filename], False))
0558             counts.append((_("@item:intable sum of all other entries",
0559                              "(overall)"), count_overall, True))
0560
0561         else:
0562             counts.append((None, count_overall, False))
0563
0564         # Indicate conspicuously up front modifiers to counting.
0565         modstrs = []
0566         if self.p.branch:
0567             fmtbranches = format_item_list(self.p.branch)
0568             modstrs.append(_("@item:intext",
0569                              "branches (%(branchlist)s)",
0570                              branchlist=fmtbranches))
0571         if self.p.maxwords is not None and self.p.minwords is None:
0572             modstrs.append(n_("@item:intext",
0573                               "at most %(num)d word",
0574                               "at most %(num)d words",
0575                               num=self.p.maxwords))
0576         if self.p.minwords is not None and self.p.maxwords is None:
0577             modstrs.append(n_("@item:intext",
0578                               "at least %(num)d word",
0579                               "at least %(num)d words",
0580                               num=self.p.minwords))
0581         if self.p.minwords is not None and self.p.maxwords is not None:
0582             modstrs.append(n_("@item:intext",
0583                               "from %(num1)d to %(num)d word",
0584                               "from %(num1)d to %(num)d words",
0585                               num1=self.p.minwords, num=self.p.maxwords))
0586         if self.p.lspan:
0587             modstrs.append(_("@item:intext",
0588                              "line span %(span)s",
0589                              span=self.p.lspan))
0590         if self.p.espan:
0591             modstrs.append(_("@item:intext",
0592                              "entry span %(span)s",
0593                              span=self.p.espan))
0594         if self.p.ondiff:
0595             modstrs.append(_("@item:intext",
0596                              "scaled fuzzy counts"))
0597
0598         # Should titles be output in-line or on separate lines.
0599         self.inline = False
0600         maxtitlecw = 0
0601         if (not self.p.wbar or not self.p.msgbar or not self.p.msgfmt) and (not self.p.table):
0602             for title, count, summed in counts:
0603                 if title is not None:
0604                     self.inline = True
0605                     titlecw = len(title)
0606                     if maxtitlecw < titlecw:
0607                         maxtitlecw = titlecw
0608
0609         # Output statistics in requested forms.
0610         for title, count, summed in counts:
0611             # Output the title if defined.
0612             if title is not None:
0613                 if self.inline:
0614                     ntitle = (("%%-%ds" % maxtitlecw) % title)
0615                 else:
0616                     ntitle = title
0617                 # Must color after padding, to avoid it seeing the colors.
0618                 ntitle = _("@title",
0619                            "<bold>%(title)s</bold>",
0620                            title=ntitle)
0621                 if self.inline:
0622                     report(ntitle + " ", newline=False)
0623                 else:
0624                     report(ntitle)
0625
0626             if self.p.table:
0627                 self._tabular_stats(counts, title, count)
0628             if self.p.msgbar:
0629                 self._msg_bar_stats(counts, title, count, summed)
0630             if self.p.wbar:
0631                 self._w_bar_stats(counts, title, count, summed)
0632             if self.p.msgfmt:
0633                 self._msg_simple_stats(title, count, summed)
0634
0635         # Output the table of catalogs which are not fully translated,
0636         # if requested.
0637         if self.p.incomplete and self.incomplete_catalogs:
0638             filenames = list(self.incomplete_catalogs.keys())
0639             self._sort_equiv_filenames(filenames)
0640             data = []
0641             # Column of catalog filenames.
0642             data.append(filenames)
0643             data.append([self.counts[x]["fuz"][0] for x in filenames])
0644             data.append([self.counts[x]["unt"][0] for x in filenames])
0645             data.append([x + y for x, y in zip(data[1], data[2])])
0646             data.append([self.counts[x]["fuz"][1] for x in filenames])
0647             data.append([self.counts[x]["unt"][1] for x in filenames])
0648             data.append([x + y for x, y in zip(data[4], data[5])])
0649             # Columns of the two added.
0650             # Column names and formats.
0651             coln = [_("@title:column",
0652                       "catalog"),
0653                     _("@title:column fuzzy messages",
0654                       "msg/f"),
0655                     _("@title:column untranslated messages",
0656                       "msg/u"),
0657                     _("@title:column fuzzy and untranslated messages",
0658                       "msg/f+u"),
0659                     _("@title:column words in fuzzy messages",
0660                       "w/f"),
0661                     _("@title:column words in untranslated messages",
0662                       "w/u"),
0663                     _("@title:column words in fuzzy and untranslated messages",
0664                       "w/f+u")]
0665             maxfl = max([len(x) for x in filenames])
0666             dfmt = ["%%-%ds" % maxfl, "%d", "%d", "%d", "%d", "%d", "%d"]
0667             # Output.
0668             report("-")
0669             report(tabulate(data, coln=coln, dfmt=dfmt, space="   ", none="-",
0670                             colorize=True))
0671
0672         # Write file names of catalogs which are not fully translated
0673         # into a file, if requested.
0674         if self.p.incompfile:
0675             filenames = sorted(self.incomplete_catalogs.keys())
0676             cmdlenc = locale.getpreferredencoding()
0677             ofl = codecs.open(self.p.incompfile, "w", cmdlenc)
0678             ofl.writelines([x + "\n" for x in filenames])
0679             ofl.close()
0680
0681         if modstrs:
0682             report(_("@item:intable",
0683                      "modifiers: %(modlist)s",
0684                      modlist=format_item_list(modstrs)))
0685
0686
0687     def _tabular_stats (self, counts, title, count):
0688
0689         # Order counts in tabular form.
0690         selected_cats = self.count_spec
0691         if False and self.p.incomplete: # skip this for the moment
0692             # Display only fuzzy and untranslated counts.
0693             selected_cats = (self.count_spec[1], self.count_spec[2])
0694             # Skip display if complete.
0695             really_incomplete = True
0696             for tkey, tname in selected_cats:
0697                 for col in range(self.counts_per_cat):
0698                     if count[tkey][col] > 0:
0699                         really_incomplete = False
0700                         break
0701             if really_incomplete:
0702                 return
0703         data = [[count[tkey][y] for tkey, tname in selected_cats]
0704                 for y in range(self.counts_per_cat)]
0705
0706         # Derived data: messages/words completition ratios.
0707         for col, ins in ((0, 1), (1, 3)):
0708             compr = []
0709             for tkey, tname in selected_cats:
0710                 if tkey not in ("tot", "obs") and count["tot"][col] > 0:
0711                     r = float(count[tkey][col]) / count["tot"][col]
0712                     compr.append(r * 100)
0713                 else:
0714                     compr.append(None)
0715             data.insert(ins, compr)
0716
0717         if self.p.detail:
0718             # Derived data: word and character expansion factors.
0719             for o, t, ins, incsp in ((1, 2, 7, None), (3, 4, 8, (1, 2, 0.0))):
0720                 ratio = []
0721                 for tkey, tname in selected_cats:
0722                     if count[tkey][o] > 0 and count[tkey][t] > 0:
0723                         inct, inco = 0.0, 0.0
0724                         if incsp:
0725                             co, ct, fact = incsp
0726                             inco = (count[tkey][co] - 1) * fact
0727                             inct = (count[tkey][ct] - 1) * fact
0728                         r = (count[tkey][t] + inct) / (count[tkey][o] + inco)
0729                         ratio.append((r - 1) * 100)
0730                     else:
0731                         ratio.append(None)
0732                 data.insert(ins, ratio)
0733
0734         if self.p.detail:
0735             # Derived data: character/word ratio, word/message ratio.
0736             for w, c, ins in ((0, 1, 9), (0, 2, 10), (1, 3, 11), (2, 4, 12)):
0737                 chpw = []
0738                 for tkey, tname in selected_cats:
0739                     if count[tkey][w] > 0 and count[tkey][c] > 0:
0740                         r = float(count[tkey][c]) / count[tkey][w]
0741                         chpw.append(r)
0742                     else:
0743                         chpw.append(None)
0744                 data.insert(ins, chpw)
0745
0746         # Row, column names and formats.
0747         rown = [tname for tkey, tname in selected_cats]
0748         coln = [_("@title:column messages",
0749                   "msg"),
0750                 _("@title:column percentage of total messages",
0751                   "msg/tot"),
0752                 _("@title:column words in original",
0753                   "w-or"),
0754                 _("@title:column percentage of words to total in original",
0755                   "w/tot-or"),
0756                 _("@title:column words in translation",
0757                   "w-tr"),
0758                 _("@title:column characters in original",
0759                   "ch-or"),
0760                 _("@title:column characters in translation",
0761                   "ch-tr")]
0762         dfmt = ["%d", "%.1f%%",
0763                 "%d", "%.1f%%", "%d", "%d", "%d"]
0764         if self.p.detail:
0765             coln.extend([_("@title:column word efficiency",
0766                            "w-ef"),
0767                          _("@title:column character efficiency",
0768                            "ch-ef"),
0769                          _("@title:column words per message in original",
0770                            "w/msg-or"),
0771                          _("@title:column words per message in translation",
0772                            "w/msg-tr"),
0773                          _("@title:column characters per message in original",
0774                            "ch/w-or"),
0775                          _("@title:column characters per message in translation",
0776                            "ch/w-tr")])
0777             dfmt.extend(["%+.1f%%", "%+.1f%%",
0778                          "%.1f", "%.1f", "%.1f", "%.1f"])
0779
0780         # Output the table.
0781         report(tabulate(data, rown=rown, coln=coln, dfmt=dfmt,
0782                         space="   ", none="-", colorize=True))
0783
0784
0785     def _msg_bar_stats (self, counts, title, count, summed):
0786
0787         self._bar_stats(counts, title, count, summed,
0788                         _("@item:intable number of messages",
0789                           "msgs"),
0790                         0)
0791
0792
0793     def _w_bar_stats (self, counts, title, count, summed):
0794
0795         self._bar_stats(counts, title, count, summed,
0796                         _("@item:intable number of words in original",
0797                           "w-or"),
0798                         1)
0799
0800
0801     def _bar_stats (self, counts, title, count, summed, dlabel, dcolumn):
0802
0803         # Count categories to display and chars/colors associated to them.
0804         # Note: Use only characters from Latin1.
0805         tspecs = (("trn", "×", "green"),
0806                   ("fuz", "¤", "blue"),
0807                   ("unt", "·", "red"))
0808
0809         # Find out maximum counts overall.
0810         maxcounts = dict(trn=0, fuz=0, unt=0, tot=0)
0811         maxcounts_jumbled = maxcounts.copy()
0812         for otitle, ocount, osummed in counts:
0813             # If absolute bars, compare counts only for non-summed counts.
0814             if self.p.absolute and osummed:
0815                 continue
0816
0817             # Count both messages and words, for the number display padding.
0818             for tkey in maxcounts_jumbled:
0819                 for dcol in (0, 1):
0820                     c = ocount[tkey][dcol]
0821                     if maxcounts_jumbled[tkey] < c:
0822                         maxcounts_jumbled[tkey] = c
0823
0824             for tkey in maxcounts:
0825                 c = ocount[tkey][dcolumn]
0826                 if maxcounts[tkey] < c:
0827                     maxcounts[tkey] = c
0828
0829         # Character widths of maximum count categories.
0830         maxcountscw = {}
0831         for tkey, tval in maxcounts.items():
0832             maxcountscw[tkey] = len(str(tval))
0833         maxcountscw_jumbled = {}
0834         for tkey, tval in maxcounts_jumbled.items():
0835             maxcountscw_jumbled[tkey] = len(str(tval))
0836
0837         # Formatted counts by disjunct categories.
0838         fmt_counts = []
0839         for tkey, tchar, tcol in tspecs:
0840             cstr = str(count[tkey][dcolumn])
0841             if cstr == "0":
0842                 cstr = "-"
0843             cfmt = ("%%%ds" % maxcountscw_jumbled[tkey]) % cstr
0844             if tcol is not None:
0845                 fmt_counts.append((ColorString("<%s>%%s</%s>") % (tcol, tcol))
0846                                   % cfmt)
0847             else:
0848                 fmt_counts.append(cfmt)
0849         fmt_counts = cjoin(fmt_counts, "/")
0850
0851         # Maximum and nominal bar widths in characters.
0852         # TODO: Make parameters.
0853         if self.inline:
0854             nombarcw = 20
0855             maxbarcw = 50
0856         else:
0857             nombarcw = 40
0858             maxbarcw = 80
0859
0860         def roundnear (x):
0861             return int(round(x, 0))
0862
0863         def roundup (x):
0864             ix = int(x)
0865             if x - ix > 1e-16:
0866                 ix += 1
0867             return ix
0868
0869         # Compute number of cells per category.
0870         n_cells = {}
0871         if self.p.absolute:
0872             # Absolute bar.
0873             n_per_cell = 0
0874             for npc in (1, 2, 5,
0875                         10, 20, 50,
0876                         100, 200, 500,
0877                         1000, 2000, 5000,
0878                         10000, 20000, 50000,
0879                         100000, 200000, 500000):
0880                 if npc * maxbarcw > maxcounts["tot"]:
0881                     n_per_cell = npc
0882                     break
0883             if not n_per_cell:
0884                 warning(_("@info",
0885                           "Count too large, cannot display bar graph."))
0886                 return
0887             for tkey, roundf in (("fuz", roundup), ("unt", roundup),
0888                                  ("tot", roundnear)):
0889                 c = count[tkey][dcolumn]
0890                 n_cells[tkey] = roundf(float(c) / n_per_cell)
0891
0892             # Correct the situation when there are no cells.
0893             if n_cells["tot"] < 1:
0894                 n_cells["tot"] = 1
0895
0896             # Correct the situation when the sum of cells fuzzy+untranslated
0897             # goes over the total; give priority to untranslated when reducing.
0898             while n_cells["fuz"] + n_cells["unt"] > n_cells["tot"]:
0899                 if n_cells["fuz"] >= n_cells["unt"]:
0900                     n_cells["fuz"] -= 1
0901                 else:
0902                     n_cells["unt"] -= 1
0903
0904             n_cells["trn"] = n_cells["tot"] - n_cells["fuz"] - n_cells["unt"]
0905
0906         else:
0907             # Relative bar.
0908             if count["tot"][dcolumn] > 0:
0909                 n_per_cell = float(nombarcw) / count["tot"][dcolumn]
0910             else:
0911                 n_per_cell = 0
0912             for tkey in ("fuz", "unt"):
0913                 c = count[tkey][dcolumn]
0914                 n_cells[tkey] = roundup(c * n_per_cell)
0915
0916             # When there are almost none translated, it may have happened that
0917             # the sum of cells fuzzy+untranslated is over nominal; reduce.
0918             while n_cells["fuz"] + n_cells["unt"] > nombarcw:
0919                 if n_cells["fuz"] >= n_cells["unt"]:
0920                     n_cells["fuz"] -= 1
0921                 else:
0922                     n_cells["unt"] -= 1
0923
0924             n_cells["trn"] = nombarcw - n_cells["fuz"] - n_cells["unt"]
0925
0926         # Create the bar.
0927         fmt_bar = []
0928         for tkey, tchar, tcol in tspecs:
0929             bar = tchar * n_cells[tkey]
0930             if tcol is not None:
0931                 bar = (ColorString("<%s>%%s</%s>") % (tcol, tcol)) % bar
0932             fmt_bar.append(bar)
0933         fmt_bar = cjoin(fmt_bar)
0934
0935         # Assemble final output.
0936         if not self.p.absolute or not summed:
0937             if count["tot"][dcolumn] == 0:
0938                 fmt_bar = ""
0939             report(cinterp("%s %s |%s|", fmt_counts, dlabel, fmt_bar))
0940         else:
0941             report(cinterp("%s %s", fmt_counts, dlabel))
0942
0943
0944     def _msg_simple_stats (self, title, count, summed):
0945         """ msgfmt-style report """
0946         fmt_trn = n_("@item:intext",
0947                      "%(num)d translated message",
0948                      "%(num)d translated messages",
0949                      num=count["trn"][0])
0950         fmt_fuz = n_("@item:intext",
0951                      "%(num)d fuzzy translation",
0952                      "%(num)d fuzzy translations",
0953                      num=count["fuz"][0])
0954         fmt_unt = n_("@item:intext",
0955                      "%(num)d untranslated message",
0956                      "%(num)d untranslated messages",
0957                      num=count["unt"][0])
0958         report(_("@info composition of three previous messages",
0959                  "%(trn)s, %(fuz)s, %(unt)s",
0960                  trn=fmt_trn, fuz=fmt_fuz, unt=fmt_unt))
0961