File indexing completed on 2024-10-27 08:25:04
# -*- coding: UTF-8 -*-

"""
Various normalizations for strings and PO elements.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import os
import re
import unicodedata

from pology import _, n_
from pology.message import MessageUnsafe
from pology.monitored import Monlist, Monpair
from pology.report import warning


# A run of ASCII whitespace (space, tab, newline).
_wsseq_rx = re.compile(r"[ \t\n]+", re.U)

def simplify (s):
    """
    Simplify ASCII whitespace in the string.

    All leading and trailing ASCII whitespace are removed,
    all inner ASCII whitespace sequences are replaced with space.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    return _wsseq_rx.sub(" ", s.strip())


# A run of any whitespace as defined by Unicode.
_uwsseq_rx = re.compile(r"\s+", re.U)

def usimplify (s):
    """
    Simplify whitespace in the string.

    Like L{simplify}, but takes into account all whitespace defined by Unicode.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    return _uwsseq_rx.sub(" ", s.strip())


def shrink (s):
    """
    Remove all whitespace from the string.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    return _uwsseq_rx.sub("", s)


def tighten (s):
    """
    Remove all whitespace and lowercase the string.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    return _uwsseq_rx.sub("", s.lower())


# Any character that cannot appear in an ASCII identifier.
_non_ascii_ident_rx = re.compile(r"[^a-z0-9_]", re.U|re.I)

def identify (s):
    """
    Construct an uniform-case ASCII-identifier out of the string.

    ASCII-identifier is constructed in the following order:
      - string is decomposed into Unicode NFKD
      - string is lowercased
      - every character that is neither an ASCII alphanumeric nor
        the underscore is removed
      - if the string starts with a digit, underscore is prepended

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    ns = s

    # Decompose.
    ns = unicodedata.normalize("NFKD", ns)

    # Lowercase.
    ns = ns.lower()

    # Remove non-identifier chars.
    ns = _non_ascii_ident_rx.sub("", ns)

    # Prefix with underscore if first char is digit
    # (the [0:1] slice keeps this safe on empty strings).
    if ns[0:1].isdigit():
        ns = "_" + ns

    return ns


def xentitize (s):
    """
    Replace characters having default XML entities with the entities.

    The replacements are:
      - C{&amp;} for ampersand
      - C{&lt;} and C{&gt;} for less-than and greater-than signs
      - C{&apos;} and C{&quot;} for ASCII single and double quotes

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    ns = s
    # Ampersand must be escaped first, so that the ampersands
    # introduced by the other replacements are not escaped again.
    ns = ns.replace("&", "&amp;") # must come first
    ns = ns.replace("<", "&lt;")
    ns = ns.replace(">", "&gt;")
    ns = ns.replace("'", "&apos;")
    ns = ns.replace('"', "&quot;")

    return ns


# As defined by http://www.unicode.org/faq/unsup_char.html.
_invisible_character_codepoints = ([]
    + [0x200C, 0x200D] # cursive joiners
    + list(range(0x202A, 0x202E + 1)) # bidirectional format controls
    + [0x00AD] # soft hyphen
    + [0x2060, 0xFEFF] # word joiners
    + [0x200B] # the zero width space
    + list(range(0x2061, 0x2064 + 1)) # invisible math operators
    + [0x115F, 0x1160] # Jamo filler characters
    + list(range(0xFE00, 0xFE0F + 1)) # variation selectors
)
_invchstr = "".join(map(chr, _invisible_character_codepoints))
# Character class matching any single invisible character.
_invisible_character_replrx = re.compile("[%s]" % _invchstr, re.U)

def noinvisible (s):
    """
    Remove all invisible characters from the string.

    Invisible characters are those which have zero width,
    i.e. do not have any visual representation in the text
    (when the text is rendered proportionally).
    See U{http://www.unicode.org/faq/unsup_char.html} for the list
    of these characters as defined by Unicode.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    ns = _invisible_character_replrx.sub("", s)
    return ns


def demangle_srcrefs (collsrcs=None, collsrcmap=None, truesrcheads=None,
                      compexts=None):
    """
    Resolve source references in message created by intermediate extraction
    [hook factory].

    Sometimes the messages from a source file in the format not known
    to C{xgettext(1)} are first extracted by a preextraction tool into
    a format known to C{xgettext}, and then by C{xgettext} to PO template.
    This is the intermediate extraction, and the files that C{xgettext}
    gets to operate on are intermediate files.

    When intermediate extraction is performed, the source references in
    the resulting PO template are going to be "mangled", pointing to
    the intermediate files rather than to the true source files.
    This hook factory will produce a function that will resolve
    intermediate into true source reference, "demangle" them, where possible.

    One mode of intermediate extraction is to extract multiple sources
    into a collective intermediate file. This file may have standardized
    name throughout a collection of catalogs, or it may be special
    by catalog. For demangling to be possible in this case,
    the preextraction tool has to provide true source references
    in the extracted comments (C{#.}) of the messages.
    When that is the case, parameter C{collsrcs} is used to specify
    the sequence of names of generally known intermediate files,
    parameter C{collsrcmap} of those specific by catalog
    (as dictionary of catalog name to sequence of intermediate file names),
    and parameter C{truesrcheads} specifies the sequence of initial strings
    in extracted comments which are followed by the true source reference.
    (If C{truesrcheads} is C{None} or empty, this mode of demangling
    is disabled.)

    For example, collective-intermediate extraction::

        #. file: apples.clt:156
        #: resources.cpp:328
        msgid "Granny Smith"
        msgstr ""

        #. file: peaches.clt:49
        #: resources.cpp:2672
        msgid "Redhaven"
        msgstr ""

    is demangled by setting C{collsrcs=["resources.cpp"]}
    and C{truesrcheads=["file:"]}.

    Another mode of intermediate extraction is for each source file
    to be extracted into a single paired intermediate file,
    which is named same as the true source plus an additional extension.
    In this mode, parameter C{compexts} specifies the list of known
    composite extensions (including the leading dot), which
    will be demangled by stripping the final extension from the path.

    For example, paired-intermediate extraction::

        #: apples.clt.h:156
        msgid "Granny Smith"
        msgstr ""

        #: peaches.clt.h:49
        msgid "Redhaven"
        msgstr ""

    is demangled by setting C{compexts=[".clt.h"]}.

    @param collsrcs: general intermediate file names
    @type collsrcs: <string*>
    @param collsrcmap: catalog-specific intermediate file names
    @type collsrcmap: {string: <string*>*}
    @param truesrcheads: prefixes to true file references in comments
    @type truesrcheads: <string*>
    @param compexts: composite intermediate file extensions
    @type compexts: <string*>

    @return: type F4A hook
    @rtype: C{(msg, cat) -> numerr}
    """

    def hook (msg, cat):

        numerr = 0

        truerefs = []

        # Demangle source references in collective-intermediate mode.
        if truesrcheads:
            # Collect true source references from extracted comments.
            cmnts = []
            for cmnt in msg.auto_comment:
                hasrefs = False
                for head in truesrcheads:
                    if cmnt.startswith(head):
                        refs = [x.split(":")
                                for x in cmnt[len(head):].split()]
                        # Each reference must be path:lineno.
                        # FIX: isdigit was previously referenced without
                        # calling it, so the check always passed and a
                        # malformed line number would crash int() below.
                        hasrefs = all((len(x) == 2 and x[1].isdigit())
                                      for x in refs)
                        if not hasrefs:
                            numerr += 1
                        break
                if hasrefs:
                    refs = [(path, int(lno)) for path, lno in refs]
                    truerefs.extend(refs)
                else:
                    # Keep comments that did not carry source references.
                    cmnts.append(cmnt)
            msg.auto_comment[:] = cmnts

        # Exclude intermediates from source references.
        for path, lno in msg.source:
            bname = os.path.basename(path)
            if (not (   (collsrcs and bname in collsrcs)
                     or (    collsrcmap
                         and bname in collsrcmap.get(cat.name, {})))
            ):
                truerefs.append((path, lno))

        # Demangle source references in paired-intermediate mode.
        if compexts:
            for path, lno in msg.source:
                for ext in compexts:
                    if path.endswith(ext):
                        # Strip only the final extension of the composite.
                        p = path.rfind(".")
                        if p > 0:
                            path = path[:p]
                        else:
                            numerr += 1
                        break
                truerefs.append((path, lno))

        if isinstance(msg, MessageUnsafe):
            msg.source = truerefs
        else:
            msg.source = Monlist(list(map(Monpair, truerefs)))

        return numerr

    return hook


def uniq_source (msg, cat):
    """
    Make message source references unique [type F4A hook].

    Sometimes source references of a message can be non-unique
    due to particularities of extraction or later processing.
    This hook makes them unique, while preserving the ordering.
    """

    uniqrefs = []
    for path, line in msg.source:
        # Normalize the path so that equivalent spellings compare equal.
        ref = (os.path.normpath(path), line)
        if ref not in uniqrefs:
            uniqrefs.append(ref)

    if isinstance(msg, MessageUnsafe):
        msg.source = uniqrefs
    else:
        msg.source = Monlist(list(map(Monpair, uniqrefs)))


def uniq_auto_comment (onlyheads=None):
    """
    Remove non-unique automatic comment lines in message [hook factory].

    Sometimes the message extraction tool adds automatic comments
    to provide more context for the message
    (for example, XML tag path to the current message).
    If the message is found more than once in the same context,
    such comment lines get repeated.
    This hook can be used to make auto comment lines unique;
    either fully, or only those with certain prefixes given
    by C{onlyheads} parameter.

    @param onlyheads: prefixes of comment lines which should be made unique
    @type onlyheads: <string*>

    @return: type F4A hook
    @rtype: C{(msg, cat) -> numerr}
    """

    # startswith() accepts a tuple of prefixes, so convert once here.
    if onlyheads is not None and not isinstance(onlyheads, tuple):
        onlyheads = tuple(onlyheads)

    def hook (msg, cat):

        seen_cmnts = set()
        cmnts = []
        for cmnt in msg.auto_comment:
            if onlyheads is None or cmnt.startswith(onlyheads):
                if cmnt not in seen_cmnts:
                    cmnts.append(cmnt)
                    seen_cmnts.add(cmnt)
            else:
                cmnts.append(cmnt)
        msg.auto_comment[:] = cmnts

    return hook


def canonical_header (hdr, cat):
    """
    Check and rearrange content of a PO header into canonical form
    [type F4B hook].

    @return: number of errors
    @rtype: int
    """

    nerr = 0

    nerr += _fix_authors(hdr, cat)

    return nerr


# Single year, four- or two-digit.
_yr1_rx = re.compile(r"^\s*(\d{4}|\d{2})\s*$")
# Year range, endpoints four- or two-digit, separated by hyphen or dash.
_yr2_rx = re.compile(r"^\s*(\d{4}|\d{2})\s*[-—–]\s*(\d{4}|\d{2})\s*$")

def _fix_authors (hdr, cat):
    # Normalize author comments in the header to the canonical
    # "Name <email>, years." form, merging duplicate authors.
    # Returns the number of parsing problems found; on any problem
    # the header is left untouched.

    nerr = 0

    # Parse authors data from the header.
    authors = {}
    problems = False
    pos = 0
    for a in hdr.author:
        pos += 1

        m = re.search(r"(.*?)<(.*?)>(.*)$", a)
        if not m:
            warning(_("@info",
                      "%(file)s: Cannot parse name and email address "
                      "from translator comment '%(cmnt)s'.",
                      file=cat.filename, cmnt=a))
            problems = True
            nerr += 1
            continue
        name, email, rest = m.groups()
        name = simplify(name)
        email = simplify(email)

        m = re.search(r"^\s*,(.+?)\.?\s*$", rest)
        if not m:
            warning(_("@info",
                      "%(file)s: Missing years in "
                      "translator comment '%(cmnt)s'.",
                      file=cat.filename, cmnt=a))
            problems = True
            nerr += 1
            continue
        yearstr = m.group(1)

        years = []
        for yspec in yearstr.split(","):
            m = _yr1_rx.search(yspec) or _yr2_rx.search(yspec)
            if not m:
                warning(_("@info",
                          "%(file)s: Cannot parse years in "
                          "translator comment '%(cmnt)s'.",
                          file=cat.filename, cmnt=a))
                problems = True
                nerr += 1
                break
            if len(m.groups()) == 1:
                ystr = m.group(1)
                if len(ystr) == 2:
                    # Expand a two-digit year: 9x -> 199x, else 20xx.
                    ystr = ("19" if ystr[0] == "9" else "20") + ystr
                years.append(int(ystr))
            else:
                # NOTE(review): two-digit endpoints of a range are used
                # as-is (not expanded), matching the original behavior.
                years.extend(list(range(int(m.group(1)), int(m.group(2)) + 1)))
        if not years:
            continue

        # Merge entries for the same author; the email and position
        # of the latest entry win, years are accumulated.
        if name not in authors:
            authors[name] = {"email": "", "pos": 0, "years": set()}
        authors[name]["email"] = email
        authors[name]["pos"] = pos
        authors[name]["years"].update(years)

    # If there were any problems, do not touch author comments.
    if problems:
        return nerr

    # Post-process authors data.
    authlst = []
    for name, adata in list(authors.items()):
        adata["years"] = list(adata["years"])
        adata["years"].sort()
        adata["years"] = list(map(str, adata["years"]))
        adata["name"] = name
        authlst.append(adata)

    # Order by earliest year, then by original position in the header.
    authlst.sort(key=lambda x: (min(x["years"]), x["pos"]))

    # Construct new author comments.
    authcmnts = Monlist()
    for a in authlst:
        acmnt = "%s <%s>, %s." % (a["name"], a["email"],
                                  ", ".join(a["years"]))
        authcmnts.append(acmnt)

    hdr.author = authcmnts

    return nerr