File indexing completed on 2024-10-27 08:25:04
# -*- coding: UTF-8 -*-

"""
Various normalizations for strings and PO elements.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import os
import re
import unicodedata

from pology import _, n_
from pology.message import MessageUnsafe
from pology.monitored import Monlist, Monpair
from pology.report import warning


# A run of ASCII whitespace (space, tab, newline).
_wsseq_rx = re.compile(r"[ \t\n]+", re.U)

def simplify (s):
    """
    Simplify ASCII whitespace in the string.

    All leading and trailing ASCII whitespace are removed,
    all inner ASCII whitespace sequences are replaced with space.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    return _wsseq_rx.sub(" ", s.strip())


# A run of any whitespace as defined by Unicode.
_uwsseq_rx = re.compile(r"\s+", re.U)

def usimplify (s):
    """
    Simplify whitespace in the string.

    Like L{simplify}, but takes into account all whitespace defined by Unicode.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    return _uwsseq_rx.sub(" ", s.strip())


def shrink (s):
    """
    Remove all whitespace from the string.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    return _uwsseq_rx.sub("", s)


def tighten (s):
    """
    Remove all whitespace and lowercase the string.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    return _uwsseq_rx.sub("", s.lower())


# Any character that cannot appear in an ASCII identifier.
_non_ascii_ident_rx = re.compile(r"[^a-z0-9_]", re.U|re.I)

def identify (s):
    """
    Construct an uniform-case ASCII-identifier out of the string.

    ASCII-identifier is constructed in the following order:
      - string is decomposed into Unicode NFKD
      - string is lowercased
      - every character that is neither an ASCII alphanumeric nor
        the underscore is removed
      - if the string starts with a digit, underscore is prepended

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    ns = s

    # Decompose.
    ns = unicodedata.normalize("NFKD", ns)

    # Lowercase.
    ns = ns.lower()

    # Remove non-identifier chars.
    ns = _non_ascii_ident_rx.sub("", ns)

    # Prefix with underscore if first char is digit
    # (the [0:1] slice keeps this safe on empty strings).
    if ns[0:1].isdigit():
        ns = "_" + ns

    return ns


def xentitize (s):
    """
    Replace characters having default XML entities with the entities.

    The replacements are:
      - C{&amp;} for ampersand
      - C{&lt;} and C{&gt;} for less-than and greater-than signs
      - C{&apos;} and C{&quot;} for ASCII single and double quotes

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    ns = s
    # Ampersand must be escaped first, so that the ampersands
    # introduced by the other replacements are not escaped again.
    ns = ns.replace("&", "&amp;") # must come first
    ns = ns.replace("<", "&lt;")
    ns = ns.replace(">", "&gt;")
    ns = ns.replace("'", "&apos;")
    ns = ns.replace('"', "&quot;")

    return ns


# As defined by http://www.unicode.org/faq/unsup_char.html.
_invisible_character_codepoints = ([]
    + [0x200C, 0x200D] # cursive joiners
    + list(range(0x202A, 0x202E + 1)) # bidirectional format controls
    + [0x00AD] # soft hyphen
    + [0x2060, 0xFEFF] # word joiners
    + [0x200B] # the zero width space
    + list(range(0x2061, 0x2064 + 1)) # invisible math operators
    + [0x115F, 0x1160] # Jamo filler characters
    + list(range(0xFE00, 0xFE0F + 1)) # variation selectors
)
_invchstr = "".join(map(chr, _invisible_character_codepoints))
# Character class matching any single invisible character.
_invisible_character_replrx = re.compile("[%s]" % _invchstr, re.U)

def noinvisible (s):
    """
    Remove all invisible characters from the string.

    Invisible characters are those which have zero width,
    i.e. do not have any visual representation in the text
    (when the text is rendered proportionally).
    See U{http://www.unicode.org/faq/unsup_char.html} for the list
    of these characters as defined by Unicode.

    @param s: string to normalize
    @type s: string

    @returns: normalized string
    @rtype: string
    """

    ns = _invisible_character_replrx.sub("", s)
    return ns


def demangle_srcrefs (collsrcs=None, collsrcmap=None, truesrcheads=None,
                      compexts=None):
    """
    Resolve source references in message created by intermediate extraction
    [hook factory].

    Sometimes the messages from a source file in the format not known
    to C{xgettext(1)} are first extracted by a preextraction tool into
    a format known to C{xgettext}, and then by C{xgettext} to PO template.
    This is the intermediate extraction, and the files that C{xgettext}
    gets to operate on are intermediate files.

    When intermediate extraction is performed, the source references in
    the resulting PO template are going to be "mangled", pointing to
    the intermediate files rather than to the true source files.
    This hook factory will produce a function that will resolve
    intermediate into true source reference, "demangle" them, where possible.

    One mode of intermediate extraction is to extract multiple sources
    into a collective intermediate file. This file may have standardized
    name throughout a collection of catalogs, or it may be special
    by catalog. For demangling to be possible in this case,
    the preextraction tool has to provide true source references
    in the extracted comments (C{#.}) of the messages.
    When that is the case, parameter C{collsrcs} is used to specify
    the sequence of names of generally known intermediate files,
    parameter C{collsrcmap} of those specific by catalog
    (as dictionary of catalog name to sequence of intermediate file names),
    and parameter C{truesrcheads} specifies the sequence of initial strings
    in extracted comments which are followed by the true source reference.
    (If C{truesrcheads} is C{None} or empty, this mode of demangling
    is disabled.)

    For example, collective-intermediate extraction::

        #. file: apples.clt:156
        #: resources.cpp:328
        msgid "Granny Smith"
        msgstr ""

        #. file: peaches.clt:49
        #: resources.cpp:2672
        msgid "Redhaven"
        msgstr ""

    is demangled by setting C{collsrcs=["resources.cpp"]}
    and C{truesrcheads=["file:"]}.

    Another mode of intermediate extraction is for each source file
    to be extracted into a single paired intermediate file,
    which is named same as the true source plus an additional extension.
    In this mode, parameter C{compexts} specifies the list of known
    composite extensions (including the leading dot), which
    will be demangled by stripping the final extension from the path.

    For example, paired-intermediate extraction::

        #: apples.clt.h:156
        msgid "Granny Smith"
        msgstr ""

        #: peaches.clt.h:49
        msgid "Redhaven"
        msgstr ""

    is demangled by setting C{compexts=[".clt.h"]}.

    @param collsrcs: general intermediate file names
    @type collsrcs: <string*>
    @param collsrcmap: catalog-specific intermediate file names
    @type collsrcmap: {string: <string*>*}
    @param truesrcheads: prefixes to true file references in comments
    @type truesrcheads: <string*>
    @param compexts: composite intermediate file extensions
    @type compexts: <string*>

    @return: type F4A hook
    @rtype: C{(msg, cat) -> numerr}
    """

    def hook (msg, cat):

        numerr = 0

        truerefs = []

        # Demangle source references in collective-intermediate mode.
        if truesrcheads:
            # Collect true source references from extracted comments.
            cmnts = []
            for cmnt in msg.auto_comment:
                hasrefs = False
                for head in truesrcheads:
                    if cmnt.startswith(head):
                        refs = [x.split(":")
                                for x in cmnt[len(head):].split()]
                        # Each reference must be path:lineno.
                        # FIX: isdigit was previously referenced without
                        # calling it, so the check always passed and a
                        # malformed line number would crash int() below.
                        hasrefs = all((len(x) == 2 and x[1].isdigit())
                                      for x in refs)
                        if not hasrefs:
                            numerr += 1
                        break
                if hasrefs:
                    refs = [(path, int(lno)) for path, lno in refs]
                    truerefs.extend(refs)
                else:
                    # Keep comments that did not carry source references.
                    cmnts.append(cmnt)
            msg.auto_comment[:] = cmnts

        # Exclude intermediates from source references.
        for path, lno in msg.source:
            bname = os.path.basename(path)
            if (not (   (collsrcs and bname in collsrcs)
                     or (    collsrcmap
                         and bname in collsrcmap.get(cat.name, {})))
            ):
                truerefs.append((path, lno))

        # Demangle source references in paired-intermediate mode.
        if compexts:
            for path, lno in msg.source:
                for ext in compexts:
                    if path.endswith(ext):
                        # Strip only the final extension of the composite.
                        p = path.rfind(".")
                        if p > 0:
                            path = path[:p]
                        else:
                            numerr += 1
                        break
                truerefs.append((path, lno))

        if isinstance(msg, MessageUnsafe):
            msg.source = truerefs
        else:
            msg.source = Monlist(list(map(Monpair, truerefs)))

        return numerr

    return hook


def uniq_source (msg, cat):
    """
    Make message source references unique [type F4A hook].

    Sometimes source references of a message can be non-unique
    due to particularities of extraction or later processing.
    This hook makes them unique, while preserving the ordering.
    """

    uniqrefs = []
    for path, line in msg.source:
        # Normalize the path so that equivalent spellings compare equal.
        ref = (os.path.normpath(path), line)
        if ref not in uniqrefs:
            uniqrefs.append(ref)

    if isinstance(msg, MessageUnsafe):
        msg.source = uniqrefs
    else:
        msg.source = Monlist(list(map(Monpair, uniqrefs)))


def uniq_auto_comment (onlyheads=None):
    """
    Remove non-unique automatic comment lines in message [hook factory].

    Sometimes the message extraction tool adds automatic comments
    to provide more context for the message
    (for example, XML tag path to the current message).
    If the message is found more than once in the same context,
    such comment lines get repeated.
    This hook can be used to make auto comment lines unique;
    either fully, or only those with certain prefixes given
    by C{onlyheads} parameter.

    @param onlyheads: prefixes of comment lines which should be made unique
    @type onlyheads: <string*>

    @return: type F4A hook
    @rtype: C{(msg, cat) -> numerr}
    """

    # startswith() accepts a tuple of prefixes, so convert once here.
    if onlyheads is not None and not isinstance(onlyheads, tuple):
        onlyheads = tuple(onlyheads)

    def hook (msg, cat):

        seen_cmnts = set()
        cmnts = []
        for cmnt in msg.auto_comment:
            if onlyheads is None or cmnt.startswith(onlyheads):
                if cmnt not in seen_cmnts:
                    cmnts.append(cmnt)
                    seen_cmnts.add(cmnt)
            else:
                cmnts.append(cmnt)
        msg.auto_comment[:] = cmnts

    return hook


def canonical_header (hdr, cat):
    """
    Check and rearrange content of a PO header into canonical form
    [type F4B hook].

    @return: number of errors
    @rtype: int
    """

    nerr = 0

    nerr += _fix_authors(hdr, cat)

    return nerr


# Single year, four- or two-digit.
_yr1_rx = re.compile(r"^\s*(\d{4}|\d{2})\s*$")
# Year range, endpoints four- or two-digit, separated by hyphen or dash.
_yr2_rx = re.compile(r"^\s*(\d{4}|\d{2})\s*[-—–]\s*(\d{4}|\d{2})\s*$")

def _fix_authors (hdr, cat):
    # Normalize author comments in the header to the canonical
    # "Name <email>, years." form, merging duplicate authors.
    # Returns the number of parsing problems found; on any problem
    # the header is left untouched.

    nerr = 0

    # Parse authors data from the header.
    authors = {}
    problems = False
    pos = 0
    for a in hdr.author:
        pos += 1

        m = re.search(r"(.*?)<(.*?)>(.*)$", a)
        if not m:
            warning(_("@info",
                      "%(file)s: Cannot parse name and email address "
                      "from translator comment '%(cmnt)s'.",
                      file=cat.filename, cmnt=a))
            problems = True
            nerr += 1
            continue
        name, email, rest = m.groups()
        name = simplify(name)
        email = simplify(email)

        m = re.search(r"^\s*,(.+?)\.?\s*$", rest)
        if not m:
            warning(_("@info",
                      "%(file)s: Missing years in "
                      "translator comment '%(cmnt)s'.",
                      file=cat.filename, cmnt=a))
            problems = True
            nerr += 1
            continue
        yearstr = m.group(1)

        years = []
        for yspec in yearstr.split(","):
            m = _yr1_rx.search(yspec) or _yr2_rx.search(yspec)
            if not m:
                warning(_("@info",
                          "%(file)s: Cannot parse years in "
                          "translator comment '%(cmnt)s'.",
                          file=cat.filename, cmnt=a))
                problems = True
                nerr += 1
                break
            if len(m.groups()) == 1:
                ystr = m.group(1)
                if len(ystr) == 2:
                    # Expand a two-digit year: 9x -> 199x, else 20xx.
                    ystr = ("19" if ystr[0] == "9" else "20") + ystr
                years.append(int(ystr))
            else:
                # NOTE(review): two-digit endpoints of a range are used
                # as-is (not expanded), matching the original behavior.
                years.extend(list(range(int(m.group(1)), int(m.group(2)) + 1)))
        if not years:
            continue

        # Merge entries for the same author; the email and position
        # of the latest entry win, years are accumulated.
        if name not in authors:
            authors[name] = {"email": "", "pos": 0, "years": set()}
        authors[name]["email"] = email
        authors[name]["pos"] = pos
        authors[name]["years"].update(years)

    # If there were any problems, do not touch author comments.
    if problems:
        return nerr

    # Post-process authors data.
    authlst = []
    for name, adata in list(authors.items()):
        adata["years"] = list(adata["years"])
        adata["years"].sort()
        adata["years"] = list(map(str, adata["years"]))
        adata["name"] = name
        authlst.append(adata)

    # Order by earliest year, then by original position in the header.
    authlst.sort(key=lambda x: (min(x["years"]), x["pos"]))

    # Construct new author comments.
    authcmnts = Monlist()
    for a in authlst:
        acmnt = "%s <%s>, %s." % (a["name"], a["email"],
                                  ", ".join(a["years"]))
        authcmnts.append(acmnt)

    hdr.author = authcmnts

    return nerr