# File indexing completed on 2024-11-03 05:12:54
# -*- coding: UTF-8 -*-

"""
Produce special diffs between strings and other interesting objects.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

from difflib import SequenceMatcher
import random
import re

from pology import PologyError, _, n_
from pology.colors import ColorString, cjoin
from pology.message import MessageUnsafe
from pology.report import error
from pology.split import split_text


# Tags used in tagged difference lists.
_new_tag = "+"
_old_tag = "-"
_equ_tag = " "

# Elements from which embedded difference wrappers are built.
_new_vtag = "+"
_new_opnc = "{"
_new_clsc = "}"

_old_vtag = "-"
_old_opnc = "{"
_old_clsc = "}"

# Extension appended to a tag when the other text was None;
# the same character is used to escape wrappers found in the text.
_tagext_none = "~"
_tagext_none_len = len(_tagext_none)

# Complete opening and closing wrappers, e.g. "{+" and "+}".
_new_opn = _new_opnc + _new_vtag
_new_cls = _new_vtag + _new_clsc
_old_opn = _old_opnc + _old_vtag
_old_cls = _old_vtag + _old_clsc
_all_wrappers = set((_new_opn, _new_cls, _old_opn, _old_cls))

# Escaping and unescaping of wrappers assumes single-character elements.
_tmp_wr = (_new_vtag, _new_opnc, _new_clsc, _old_vtag, _old_opnc, _old_clsc)
_tmp_wrlen = list(map(len, _tmp_wr))
if max(_tmp_wrlen) != 1 or min(_tmp_wrlen) != 1:
    error(_("@info \"ediff\" is shorthand for \"embedded difference\"",
            "All ediff wrapper elements must be of unit length."))


class _Sequence_diff_wrapper:
    """
    Pair a sequence element with its diffing representative.

    Hashing, iteration and equality are all delegated to the representative
    (the result of the reduction function applied to the element),
    while the original element stays available as C{obj}.
    """

    def __init__ (self, obj, reductf=None):
        self.obj = obj
        self._robj = (reductf or (lambda x: x))(obj)

    def __hash__ (self):
        return hash(self._robj)

    def __iter__ (self):
        return iter(self._robj)

    def __eq__ (self, other):
        # Let Python try the reflected comparison for foreign operands,
        # instead of failing on the missing attribute.
        if not isinstance(other, _Sequence_diff_wrapper):
            return NotImplemented
        return self._robj == other._robj


def tdiff (seq_old, seq_new, reductf=None, diffr=False):
    """
    Create tagged difference of two sequences.

    Difference is presented as a list of tuples,
    with each tuple composed of a difference tag and a sequence element.
    Difference tag is string C{"+"}, C{"-"}, or C{" "}, for elements which
    belong to the new, the old, or to both sequences, respectively.

    The list is ordered such that collecting all elements not tagged
    as old will reconstruct the new sequence, and collecting all not tagged
    as new will reconstruct the old sequence.

    If requested by the C{diffr} parameter, also reported is the
    I{difference ratio}, a heuristic measure of difference between two texts.
    0.0 means no difference, and 1.0 that sequences are completely different.

    Examples::

        >>> s1 = "A type of foo".split()
        >>> s2 = "A kind of foo".split()
        >>> tdiff(s1, s2)
        [(' ', 'A'), ('-', 'type'), ('+', 'kind'), (' ', 'of'), (' ', 'foo')]
        >>> tdiff(s1, s2, diffr=True)
        ([(' ', 'A'), ('-', 'type'), ('+', 'kind'), (' ', 'of'), (' ', 'foo')],
        0.25)

    To be able to diff them, sequence elements only need to be hashable.
    However, for compound elements it may be better to diff them
    only by some subset of data, e.g. by one of their string attributes.
    Parameter C{reductf} can be used to specify a reduction function, which
    will be called on each element to produce its diffing representative.

    @param seq_old: sequence to diff from
    @type seq_old: sequence with hashable elements
    @param seq_new: sequence to diff to
    @type seq_new: sequence with hashable elements
    @param reductf: function to produce diffing representatives
    @type reductf: (sequence element) -> diffing representative
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @returns: difference list and possibly difference ratio
    @rtype: [(string, element)...] or ([(string, element)...], float)

    @raises PologyError: if the sequence matcher reports an unknown opcode
    """

    if reductf is not None:
        seq_old = [_Sequence_diff_wrapper(x, reductf) for x in seq_old]
        seq_new = [_Sequence_diff_wrapper(x, reductf) for x in seq_new]

    dlist = []
    seqmatch = SequenceMatcher(None, seq_old, seq_new)
    opcodes = seqmatch.get_opcodes()
    if diffr:
        dr = 1.0 - seqmatch.ratio()
    for opcode, i1, i2, j1, j2 in opcodes:
        if opcode == "equal":
            dlist.extend([(_equ_tag, el) for el in seq_old[i1:i2]])
        elif opcode == "replace":
            dlist.extend([(_old_tag, el) for el in seq_old[i1:i2]])
            dlist.extend([(_new_tag, el) for el in seq_new[j1:j2]])
        elif opcode == "delete":
            dlist.extend([(_old_tag, el) for el in seq_old[i1:i2]])
        elif opcode == "insert":
            dlist.extend([(_new_tag, el) for el in seq_new[j1:j2]])
        else:
            raise PologyError(
                _("@info \"opcode\" is shorthand for \"operation code\"",
                  "Unknown opcode '%(code)s' from sequence matcher.",
                  code=opcode))

    if reductf is not None:
        # Unwrap back to the original elements.
        dlist = [(tag, el.obj) for tag, el in dlist]

    return (dlist, dr) if diffr else dlist


def itdiff (seq_old, seq_new, reductf=None, cutoff=0.6, diffr=False):
    """
    Create interleaved tagged difference of two sequences.

    Similar to L{tdiff}, except that blocks of added/removed elements
    are further heuristically interleaved by similarity, such that
    each removed element may be followed by a similar added element,
    if such has been determined.
    This is useful e.g. to be able to afterwards make inner difference
    of each two paired similar elements (e.g. word diff within line diff).

    Example::

        >>> s1 = "Two blue airplanes".split()
        >>> s2 = "Two bluish ships".split()
        >>> tdiff(s1, s2)
        [(' ', 'Two'), ('-', 'blue'), ('-', 'airplanes'), ('+', 'bluish'),
        ('+', 'ships')]
        >>> itdiff(s1, s2)
        [(' ', 'Two'), ('-', 'blue'), ('+', 'bluish'), ('-', 'airplanes'),
        ('+', 'ships')]

    To be able to interleave blocks, each element in turn must be
    a sequence in its own. This means that function supplied by C{reductf},
    otherwise of same semantics as in L{tdiff}, here must also produce
    a sequence as diffing representative (e.g. a string).

    Parameter C{cutoff} states the minimal similarity between
    two elements needed for them to be considered similar at all.

    @param seq_old: sequence to diff from
    @type seq_old: sequence with hashable elements
    @param seq_new: sequence to diff to
    @type seq_new: sequence with hashable elements
    @param reductf: function to produce diffing representatives
    @type reductf: (sequence element) -> representative sequence
    @param cutoff: minimal similarity to consider elements similar
    @type cutoff: float [0, 1]
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @returns: interleaved difference list and possibly difference ratio
    @rtype: [(string, element)...] or ([(string, element)...], float)
    """

    dres = tdiff(seq_old, seq_new, reductf=reductf, diffr=diffr)
    if diffr:
        dlist, dr = dres
    else:
        dlist = dres
    lendl = len(dlist)
    idlist = []
    i = 0
    while True:
        # Equal elements are passed through unchanged.
        while i < lendl and dlist[i][0] == _equ_tag:
            idlist.append(dlist[i])
            i += 1
        if i >= lendl:
            break
        # Collect the following run of old and new elements.
        els_old = []
        els_new = []
        while i < lendl and dlist[i][0] != _equ_tag:
            if dlist[i][0] == _old_tag:
                els_old.append(dlist[i][1])
            else:
                els_new.append(dlist[i][1])
            i += 1
        if els_old and els_new:
            idlist.extend(_dinterleave(els_old, els_new, reductf, cutoff))
        else:
            idlist.extend([(_old_tag, x) for x in els_old])
            idlist.extend([(_new_tag, x) for x in els_new])

    return (idlist, dr) if diffr else idlist


def _dinterleave (els_old, els_new, reductf, cutoff):
    """
    Interleave one block of old and new elements by their similarity.

    Old and new elements are each placed, as a contiguous run, into
    a column of C{len(els_old) + len(els_new)} slots; elements sharing
    a slot are considered paired. The combination of placements with
    the greatest sum of pair similarities is selected, where similarities
    below C{cutoff} count as zero.

    @returns: tagged difference list of the interleaved block
    @rtype: [(string, element)...]
    """

    reductf = reductf or (lambda x: x)

    #plf = _plinds_full # too expensive
    plf = _plinds_cont
    pls_old = plf(len(els_old), len(els_old) + len(els_new), 0)
    pls_new = plf(len(els_new), len(els_old) + len(els_new), 0)
    pls_old.reverse() # so that last old-new pair is top-bottom
    maxsim = 0.0
    # Fallback pairing; the first iteration below always replaces it,
    # since any similarity sum is >= 0.0.
    opt_pairs = list(zip(pls_old[-1], pls_new[-1]))
    for pl_old in pls_old:
        for pl_new in pls_new:
            sim = 0.0
            pairs = list(zip(pl_old, pl_new))
            for i_old, i_new in pairs:
                if i_old is None or i_new is None:
                    continue
                seq_old = reductf(els_old[i_old])
                seq_new = reductf(els_new[i_new])
                r = SequenceMatcher(None, seq_old, seq_new).ratio()
                if r < cutoff:
                    r = 0.0
                sim += r
            if sim >= maxsim: # >= so that last equal wins
                maxsim = sim
                opt_pairs = pairs
    dlist = []
    for i_old, i_new in opt_pairs:
        if i_old is not None:
            dlist.append((_old_tag, els_old[i_old]))
        if i_new is not None:
            dlist.append((_new_tag, els_new[i_new]))

    return dlist
0257 0258 0259 def _plinds_full (ninds, nplaces, baseind): 0260 0261 if nplaces < ninds: 0262 return [] 0263 if ninds <= 0: 0264 return [(None,) * nplaces] 0265 else: 0266 return ( [(baseind,) + x 0267 for x in _plinds_full(ninds - 1, nplaces - 1, baseind + 1)] 0268 + [(None,) + x 0269 for x in _plinds_full(ninds, nplaces - 1, baseind)]) 0270 0271 0272 def _plinds_cont (ninds, nplaces, baseind): 0273 0274 pls = [] 0275 insinds = tuple(range(ninds)) 0276 for i in range(nplaces - ninds + 1): 0277 pls.append((None,) * i + insinds + (None,) * (nplaces - ninds - i)) 0278 return pls 0279 0280 0281 def word_diff (text_old, text_new, markup=False, format=None, diffr=False): 0282 """ 0283 Create word-level difference between old and new text. 0284 0285 The difference is computed by looking at texts as collections of words 0286 and intersegments. Difference is presented as a list of tuples, 0287 with each tuple composed of a difference tag and a text segment. 0288 Difference tag is string C{"+"}, C{"-"}, or C{" "}, for text segments 0289 which are new, old, or present in both texts, respectively. 0290 If one of the texts is C{None}, as opposed to empty string, 0291 a tilde is appended to the base difference tag. 0292 0293 The list is ordered such that joining all text segments not marked 0294 as old will reconstruct the new text, and joining all not marked 0295 as new will reconstruct the old text. 0296 0297 If requested by the C{diffr} parameter, also reported is the 0298 I{difference ratio}, a heuristic measure of difference between two texts. 0299 0.0 means no difference, and 1.0 that the texts are completely different. 0300 0301 Differencing may take into account when the texts are expected to have 0302 XML-like markup, or when they are of certain format defined by Gettext. 0303 0304 Examples:: 0305 0306 >>> s1 = "A new type of foo." 0307 >>> s2 = "A new kind of foo." 
0308 >>> word_diff(s1, s2) 0309 [(' ', 'A new '), ('+', 'kind'), ('-', 'type'), (' ', ' of foo.')] 0310 >>> word_diff(s1, s2, diffr=True) 0311 ([(' ', 'A new '), ('+', 'kind'), ('-', 'type'), (' ', ' of foo.')], 0312 0.36363636363636365) 0313 >>> word_diff(s1, None, diffr=True) 0314 ([('-~', 'A new type of foo.')], 1.0) 0315 >>> word_diff(None, s2, diffr=True) 0316 ([('+~', 'A new kind of foo.')], 1.0) 0317 0318 @param text_old: the old text 0319 @type text_old: string or None 0320 @param text_new: the new text 0321 @type text_new: string or None 0322 @param markup: whether C{<...>} markup can be expected in the texts 0323 @type markup: bool 0324 @param format: Gettext format flag (e.g. C{"c-format"}, etc.) 0325 @type format: string 0326 @param diffr: whether to report difference ratio 0327 @type diffr: bool 0328 0329 @returns: difference list and possibly difference ratio 0330 @rtype: [(string, string)...] or ([(string, string)...], float) 0331 """ 0332 0333 # Special cases, when one or both texts are None, or both are empty. 0334 specdlist = None 0335 if text_old is None and text_new is None: 0336 specdlist = [] 0337 specdr = 0.0 0338 elif text_old is None: 0339 specdlist = [(_new_tag + _tagext_none, text_new)] 0340 specdr = 1.0 0341 elif text_new is None: 0342 specdlist = [(_old_tag + _tagext_none, text_old)] 0343 specdr = 1.0 0344 elif text_new == "" and text_old == "": 0345 specdlist = [(_equ_tag, "")] 0346 specdr = 0.0 0347 if specdlist is not None: 0348 return diffr and (specdlist, specdr) or specdlist 0349 0350 # Split text into segments: words and intersections, combined into 0351 # single lists for old and new text. Use words as is, but split 0352 # intersections further into single characters. 
0353 segments = [] 0354 segment_isintr = [] 0355 0356 def add_segment (intr, word): 0357 segments[-1].extend(list(intr) + [word]) 0358 segment_isintr[-1].extend([True] * len(intr) + [False]) 0359 0360 for text in (text_old, text_new): 0361 lw, li = split_text(text, markup, format) 0362 segments.append([]) 0363 segment_isintr.append([]) 0364 map(add_segment, li, lw + ['']) 0365 0366 # Create the tagged difference. 0367 dlist = tdiff(segments[0], segments[1]) 0368 0369 # Recompute which elements of the difference are intersections. 0370 dlist_isintr = [] 0371 i_old = 0 0372 i_new = 0 0373 for tag, seg in dlist: 0374 if tag == _old_tag: 0375 dlist_isintr.append(segment_isintr[0][i_old]) 0376 else: 0377 dlist_isintr.append(segment_isintr[1][i_new]) 0378 0379 if tag != _new_tag: 0380 i_old += 1 0381 if tag != _old_tag: 0382 i_new += 1 0383 0384 # Reshuffle so that all old-new elements consecutive but for the 0385 # intersections are grouped into all old followed by all new, 0386 # with intersections included in both. 0387 ndlist = [] 0388 i = 0 0389 while i < len(dlist): 0390 while i < len(dlist) and dlist[i][0] == _equ_tag: 0391 ndlist.append(dlist[i]) 0392 i += 1 0393 seq_new = [] 0394 seq_old = [] 0395 i_first_diff = i 0396 i_last_diff = i 0397 while i < len(dlist) and (dlist[i][0] != _equ_tag or dlist_isintr[i]): 0398 if dlist[i][0] != _new_tag: 0399 seq_old.append(dlist[i][1]) 0400 if dlist[i][0] != _old_tag: 0401 seq_new.append(dlist[i][1]) 0402 if dlist[i][0] != _equ_tag: 0403 i_last_diff = i 0404 i += 1 0405 for iex in range(i_last_diff, i - 1): 0406 seq_new.pop() 0407 seq_old.pop() 0408 i = i_last_diff + 1 0409 if seq_old: 0410 ndlist.append((_old_tag, "".join(seq_old))) 0411 if seq_new: 0412 ndlist.append((_new_tag, "".join(seq_new))) 0413 dlist = ndlist 0414 0415 # Join contiguous new/old/both segments, make tagged tuples. 
0416 ndlist = [] 0417 S_EQU, S_NEW, S_OLD = list(range(3)) 0418 state = S_EQU 0419 cseg = [] 0420 len_equ, len_old, len_new = 0, 0, 0 0421 _sen_tag = "." 0422 dlist.append((_sen_tag, "")) # sentry 0423 for tag, seg in dlist: 0424 0425 if state == S_EQU and tag in (_new_tag, _old_tag, _sen_tag): 0426 if cseg: 0427 ndlist.append((_equ_tag, "".join(cseg))) 0428 cseg = [] 0429 if tag == _new_tag: 0430 state = S_NEW 0431 else: 0432 state = S_OLD 0433 0434 elif state == S_OLD and tag in (_equ_tag, _new_tag, _sen_tag): 0435 if cseg: 0436 ndlist.append((_old_tag, "".join(cseg))) 0437 cseg = [] 0438 if tag == _equ_tag: 0439 state = S_EQU 0440 else: 0441 state = S_NEW 0442 0443 elif state == S_NEW and tag in (_equ_tag, _old_tag, _sen_tag): 0444 if cseg: 0445 ndlist.append((_new_tag, "".join(cseg))) 0446 cseg = [] 0447 if tag == _equ_tag: 0448 state = S_EQU 0449 else: 0450 state = S_OLD 0451 0452 if tag == _old_tag: 0453 len_old += len(seg) 0454 elif tag == _new_tag: 0455 len_new += len(seg) 0456 else: 0457 len_equ += len(seg) 0458 if seg: 0459 cseg.append(seg) 0460 0461 dlist = ndlist 0462 0463 len_all = len_new + len_old + len_equ 0464 if len_all > 0: 0465 diff_ratio = 1.0 - float(len_equ) / float(len_all) 0466 else: 0467 diff_ratio = 0.0 0468 0469 return diffr and (dlist, diff_ratio) or dlist 0470 0471 0472 def word_ediff (text_old, text_new, markup=False, format=None, colorize=False, 0473 diffr=False): 0474 """ 0475 Create word-level embedded difference between old and new texts. 0476 0477 Same as L{word_diff}, but the difference is returned as text in 0478 which the new segments are wrapped as C{{+...+}}, and the old 0479 segments as C{{-...-}}. 0480 If a difference wrapper is already contained in the text, it will be 0481 escaped by inserting a tilde, e.g. C{"{+...+}"} -> C{"{~+...+~}"}. 0482 If even an escaped wrapper is contained in the text, another tilde 0483 is inserted, and so on. 
0484 0485 If one of the texts is C{None}, then the whole other text is wrapped 0486 as suitable difference, and a tilde added to its end to indicate that 0487 the other text was C{None}. 0488 If neither of the texts is C{None}, but after differencing the tilde 0489 appears in the end of embedded difference, it is escaped by another 0490 tilde. 0491 If both texts are C{None}, C{None} is returned as the difference. 0492 0493 The C{colorize} parameter can be used to additionally highlight 0494 embedded difference by using color markup provided by 0495 L{ColorString<colors.ColorString>}. 0496 If colorizing is enabled, the return value is a C{ColorString}. 0497 0498 See L{word_diff} for description of other parameters. 0499 0500 @param colorize: whether to colorize differences 0501 @type colorize: bool 0502 0503 @returns: string with embedded differences and possibly difference ratio 0504 @rtype: string/ColorString/None or (string/ColorString/None, float) 0505 0506 @see: L{word_diff} 0507 """ 0508 0509 dlist, dr = word_diff(text_old, text_new, markup, format, diffr=True) 0510 if not dlist: 0511 return diffr and (None, 0.0) or None 0512 dtext = _assemble_ediff(dlist, colorize) 0513 0514 return diffr and (dtext, dr) or dtext 0515 0516 0517 _capt_old_rx = re.compile( "\\" + _old_opnc + "\\" + _old_vtag + "(.*?)" \ 0518 + "\\" + _old_vtag + "\\" + _old_clsc, re.U|re.S) 0519 _capt_new_rx = re.compile( "\\" + _new_opnc + "\\" + _new_vtag + "(.*?)" \ 0520 + "\\" + _new_vtag + "\\" + _new_clsc, re.U|re.S) 0521 0522 0523 def word_ediff_to_old (dtext): 0524 """ 0525 Recover old version (-) from text with embedded differences. 0526 0527 In case there was no old text, C{None} is returned. 
0528 0529 @param dtext: text with embedded differences 0530 @type dtext: string 0531 0532 @returns: old version of the text 0533 @rtype: string or None 0534 0535 @see: L{word_ediff} 0536 """ 0537 0538 return _word_ediff_to_oldnew(dtext, _capt_old_rx, _capt_new_rx) 0539 0540 0541 def word_ediff_to_new (dtext): 0542 """ 0543 Recover new version (+) from text with embedded differences. 0544 0545 In case there was no new text, C{None} is returned. 0546 0547 @param dtext: text with embedded differences 0548 @type dtext: string 0549 0550 @returns: new version of the text 0551 @rtype: string or None 0552 0553 @see: L{word_ediff} 0554 """ 0555 0556 return _word_ediff_to_oldnew(dtext, _capt_new_rx, _capt_old_rx) 0557 0558 0559 def _word_ediff_to_oldnew (dtext, capt_this_rx, capt_other_rx): 0560 0561 if dtext is None: 0562 return None 0563 if isinstance(dtext, ColorString): 0564 dtext = dtext.resolve("none") 0565 text = dtext 0566 text = capt_this_rx.sub(r"\1", text) 0567 text = capt_other_rx.sub(r"", text) 0568 text = _unescape_ewraps(text) 0569 if text.endswith(_tagext_none): 0570 text = text[:-_tagext_none_len] 0571 if not text and capt_other_rx.search(dtext): 0572 text = None 0573 return text 0574 0575 0576 def word_ediff_to_rem (dtext, sep=" "): 0577 """ 0578 Recover removed segments (-) from text with embedded differences. 0579 0580 If separator is not C{None}, the joined string of selected segments 0581 is returned. Otherwise, the list of selected segments is returned. 0582 In either case, if there was no old text, C{None} is returned. 
0583 0584 @param dtext: text with embedded differences 0585 @type dtext: string 0586 @param sep: separator with which to join selected segments 0587 @type sep: string or None 0588 0589 @returns: text with only the removed segments 0590 @rtype: string or list or None 0591 0592 @see: L{word_ediff} 0593 """ 0594 0595 return _word_ediff_to_addrem(dtext, _capt_old_rx, sep) 0596 0597 0598 def word_ediff_to_add (dtext, sep=" "): 0599 """ 0600 Recover added segments (+) from text with embedded differences. 0601 0602 If separator is not C{None}, the joined string of selected segments 0603 is returned. Otherwise, the list of selected segments is returned. 0604 In either case, if there was no new text, C{None} is returned. 0605 0606 @param dtext: text with embedded differences 0607 @type dtext: string 0608 @param sep: separator with which to join selected segments 0609 @type sep: string or None 0610 0611 @returns: text with only the added segments 0612 @rtype: string or list or None 0613 0614 @see: L{word_ediff} 0615 """ 0616 0617 return _word_ediff_to_addrem(dtext, _capt_new_rx, sep) 0618 0619 0620 def _word_ediff_to_addrem (dtext, capt_this_rx, sep): 0621 0622 if dtext is None: 0623 return None 0624 if isinstance(dtext, ColorString): 0625 dtext = dtext.resolve("none") 0626 segs = capt_this_rx.findall(dtext) 0627 if sep is not None: 0628 segs = sep.join(segs) 0629 if ( not segs 0630 and dtext.endswith((_old_clsc + _tagext_none, _new_clsc + _tagext_none)) 0631 ): 0632 segs = None 0633 return segs 0634 0635 0636 def line_diff (lines_old, lines_new, markup=False, format=None, diffr=False): 0637 """ 0638 Create word-level difference between old and new lines of text. 0639 0640 First makes a difference on a line-level, and then for each set of 0641 differing lines a difference on word-level, using L{word_diff}. 0642 Difference is presented as a list of tuples of word diffs and ratios 0643 as constructed by L{word_diff}. 0644 See L{word_diff} for description of keyword parameters. 
0645 The difference ratio is computed as line-length weighted average 0646 of word difference ratios per line. 0647 0648 @param lines_old: old lines of text 0649 @type lines_old: string 0650 0651 @param lines_new: new lines of text 0652 @type lines_new: string 0653 0654 @returns: difference list and possibly difference ratios 0655 @rtype: [[(string, string)...]...] 0656 or ([([(string, string)...], float)...], float) 0657 """ 0658 0659 # Create the difference. 0660 dlist = tdiff(lines_old, lines_new) 0661 0662 # Reshuffle so that all consecutive old-new lines are grouped into 0663 # all old followed by all new. 0664 # For each old-new set, compute word-diffs and weigh diff-ratios. 0665 wdiffs = [] 0666 sumwdrs = 0.0 0667 sumws = 0.0 0668 i = 0 0669 while i < len(dlist): 0670 while i < len(dlist) and dlist[i][0] == _equ_tag: 0671 seg = dlist[i][1] 0672 wdiffs.append(([(_equ_tag, seg)], 0.0)) 0673 w = len(seg) 0674 sumwdrs += 0.0 * w 0675 sumws += w 0676 i += 1 0677 seq_new = [] 0678 seq_old = [] 0679 while i < len(dlist) and dlist[i][0] != _equ_tag: 0680 seg = dlist[i][1] 0681 if dlist[i][0] != _new_tag: 0682 seq_old.append(seg) 0683 if dlist[i][0] != _old_tag: 0684 seq_new.append(seg) 0685 i += 1 0686 if seq_old and seq_new: 0687 # Decide which line to pair with which by minimal local diff ratio. 0688 # FIXME: Now it tries to place best first line, then second, etc. 0689 # For higher precision, test all combinations. 
0690 lold = len(seq_old) 0691 lnew = len(seq_new) 0692 lmax = max(lold, lnew) 0693 lmin = min(lold, lnew) 0694 if lold <= lnew: 0695 s1, s2, tag2, rev = seq_old, seq_new, _new_tag, False 0696 else: 0697 s1, s2, tag2, rev = seq_new, seq_old, _old_tag, True 0698 i1 = 0 0699 i2 = 0 0700 while i1 < lmin: 0701 mindr = 1.1 0702 mwdiff = [] 0703 mj2 = -1 0704 for j2 in range(i2, lmax - lmin + i1 + 1): 0705 if not rev: 0706 t1, t2 = s1[i1], s2[j2] 0707 else: 0708 t1, t2 = s2[j2], s1[i1] 0709 wdiff, dr = word_diff(t1, t2, markup, format, diffr=True) 0710 if mindr > dr: 0711 mindr = dr 0712 mwdiff = wdiff 0713 mj2 = j2 0714 for j2 in range(i2, mj2): 0715 wdiffs.append(([(tag2 + _tagext_none, s2[j2])], 1.0)) 0716 w = len(s2[j2]) 0717 sumwdrs += 1.0 * w 0718 sumws += w 0719 i2 = mj2 0720 wdiffs.append((mwdiff, mindr)) 0721 w = len(s2[i2]) 0722 sumwdrs += mindr * w 0723 sumws += w 0724 i1 += 1 0725 i2 += 1 0726 for j2 in range(i2, lmax): 0727 wdiffs.append(([(tag2 + _tagext_none, s2[j2])], 1.0)) 0728 w = len(s2[j2]) 0729 sumwdrs += 1.0 * w 0730 sumws += w 0731 elif seq_old: 0732 wdiffs.extend([([(_old_tag + _tagext_none, x)], 1.0) 0733 for x in seq_old]) 0734 w = sum(map(len, seq_old)) 0735 sumwdrs += 1.0 * w 0736 sumws += w 0737 elif seq_new: 0738 wdiffs.extend([([(_new_tag + _tagext_none, x)], 1.0) 0739 for x in seq_new]) 0740 w = sum(map(len, seq_new)) 0741 sumwdrs += 1.0 * w 0742 sumws += w 0743 0744 # Weighted-averaged diff-ratio. 0745 dr = sumws > 0.0 and sumwdrs / sumws or 0.0 0746 0747 return diffr and (wdiffs, dr) or [x[0] for x in wdiffs] 0748 0749 0750 def line_ediff (lines_old, lines_new, markup=False, format=None, colorize=False, 0751 diffr=False): 0752 """ 0753 Create word-level embedded difference between old and new lines of text. 0754 0755 Same as L{line_diff}, but the difference is returned as list of tuples 0756 of line of text (in which the new segments are wrapped as C{{+...+}}, 0757 and the old segments as C{{-...-}}) and difference ratio for the line. 
0758 See L{word_diff} and L{word_ediff} for description of keyword parameters. 0759 0760 @returns: lines with embedded differences and possibly difference ratios 0761 @rtype: [string...] or ([(string, float)...], float) 0762 0763 @see: L{line_diff} 0764 """ 0765 0766 dlists, dr = line_diff(lines_old, lines_new, markup, format, diffr=True) 0767 dlines = [(_assemble_ediff(x[0], colorize), x[1]) for x in dlists] 0768 0769 return diffr and (dlines, dr) or [x[0] for x in dlines] 0770 0771 0772 def line_ediff_to_old (dlines): 0773 """ 0774 Recover old version (-) from lines of text with embedded differences. 0775 0776 @param dlines: lines of text with embedded differences 0777 @type dlines: list of strings 0778 0779 @returns: old version of the lines 0780 @rtype: list of strings 0781 0782 @see: L{line_ediff} 0783 """ 0784 0785 return _line_ediff_to_oldnew(dlines, word_ediff_to_old) 0786 0787 0788 def line_ediff_to_new (dlines): 0789 """ 0790 Recover new version (+) from lines of text with embedded differences. 0791 0792 @param dlines: lines of text with embedded differences 0793 @type dlines: list of strings 0794 0795 @returns: new version of the lines 0796 @rtype: list of strings 0797 0798 @see: L{line_ediff} 0799 """ 0800 0801 return _line_ediff_to_oldnew(dlines, word_ediff_to_new) 0802 0803 0804 def _line_ediff_to_oldnew (dlines, word_ediff_to_x): 0805 0806 lines = [] 0807 for dline in dlines: 0808 line = word_ediff_to_x(dline) 0809 if line is not None: 0810 lines.append(line) 0811 return lines 0812 0813 0814 def _assemble_ediff (dlist, colorize): 0815 0816 if not dlist: 0817 return None 0818 0819 dtext = [] 0820 other_none = False 0821 for segtag, segtext in dlist: 0822 wext = "" 0823 if segtag.endswith(_tagext_none): 0824 # Can happen only if there is a single difference segment. 
0825 segtag = segtag[:-_tagext_none_len] 0826 other_none = True 0827 segtext = _escape_ewraps(segtext) 0828 if segtag == _new_tag: 0829 d = _new_opn + segtext + _new_cls + wext 0830 if colorize: 0831 d = ColorString("<blue>%s</blue>") % d 0832 dtext.append(d) 0833 elif segtag == _old_tag: 0834 d = _old_opn + segtext + _old_cls + wext 0835 if colorize: 0836 d = ColorString("<red>%s</red>") % d 0837 dtext.append(d) 0838 else: 0839 dtext.append(segtext) 0840 haseqseg = True 0841 dtext = cjoin(dtext) 0842 0843 if other_none: 0844 # Indicate the other string was none. 0845 dtext += _tagext_none 0846 elif dtext.endswith(_tagext_none): 0847 # Escape any trailing other-none markers. 0848 dtext += _tagext_none 0849 0850 return dtext 0851 0852 0853 def _escape_ewraps (text): 0854 0855 return _escunesc_ewraps(text, False) 0856 0857 0858 def _unescape_ewraps (text): 0859 0860 return _escunesc_ewraps(text, True) 0861 0862 0863 _ediff_esc = _tagext_none 0864 _ediff_esc_len = len(_ediff_esc) 0865 0866 def _escunesc_ewraps (text, unescape): 0867 0868 for wstart, wend in ( 0869 (_old_opnc, _old_vtag), 0870 (_old_vtag, _old_clsc), 0871 (_new_opnc, _new_vtag), 0872 (_new_vtag, _new_clsc), 0873 ): 0874 segs = [] 0875 p = 0 0876 tlen = len(text) 0877 lwstart = len(wstart) 0878 lwend = len(wend) 0879 while True: 0880 pp = p 0881 p = text.find(wstart, p) 0882 if p < 0: 0883 segs.append(text[pp:]) 0884 break 0885 segs.append(text[pp:p]) 0886 pp = p 0887 p += lwstart 0888 nesc = 0 0889 while p < tlen and text[p:p + _ediff_esc_len] == _ediff_esc: 0890 p += _ediff_esc_len 0891 nesc += 1 0892 if p == tlen or text[p:p + lwend] != wend or (unescape and nesc < 1): 0893 segs.append(text[pp:p]) 0894 else: 0895 if not unescape: 0896 segs.append(text[pp:p] + _ediff_esc + wend) 0897 else: 0898 segs.append(text[pp:p - _ediff_esc_len] + wend) 0899 p += lwend 0900 text = "".join(segs) 0901 0902 return text 0903 0904 0905 def adapt_spans (otext, ftext, spans, merge=True): 0906 """ 0907 Adapt matched 
spans in filtered text to original text. 0908 0909 Sometimes text gets filtered before being matched, and when a match 0910 is found in the filtered text, it needs to be reported relative to 0911 the original text. This function will heuristically adapt matched spans 0912 relative to the filtered text back to the original text. 0913 0914 Spans are given as list of index tuples C{[(start1, end1), ...]} where 0915 start and end index have standard Python semantics (may be negative too). 0916 If C{merge} is C{True}, any spans that overlap or abut after adaptation 0917 will be merged into a single span, ordered by increasing start index, 0918 and empty spans removed; otherwise each adapted span will strictly 0919 correspond to the input span at that position. 0920 0921 Span tuples may have more elements past the start and end indices. 0922 They will be ignored, but preserved; if merging is in effect, 0923 extra elements will be preserved for only the frontmost of 0924 the overlapping spans (undefined for which if there are several). 0925 0926 If an input span is invalid in any way, 0927 it is carried over verbatim into result. 0928 0929 @param otext: original text 0930 @type otext: string 0931 @param ftext: filtered text 0932 @type ftext: string 0933 @param spans: matched spans 0934 @type spans: list of index tuples 0935 @param merge: whether to merge overlapping spans 0936 @type merge: bool 0937 0938 @returns: adapted spans 0939 @rtype: list of index tuples 0940 """ 0941 0942 if not spans: 0943 return spans 0944 0945 # Resolve negative spans. 0946 # Select out spans with invalid start or end. 
0947 flen = len(ftext) 0948 fspans = [] 0949 invalid_spans = [] 0950 for span in spans: 0951 start, end = span[:2] 0952 valid = True 0953 0954 if isinstance(start, int): 0955 if start < 0: 0956 start = flen + start 0957 else: 0958 valid = False 0959 0960 if isinstance(end, int): 0961 if end < 0: 0962 end = flen + end 0963 else: 0964 valid = False 0965 0966 if valid and start > end: 0967 valid = False 0968 0969 if valid: 0970 fspans.append((start, end) + span[2:]) 0971 else: 0972 invalid_spans.append(span) 0973 0974 # Create character-level difference from original to filtered text. 0975 dlist = tdiff(otext, ftext) 0976 0977 # For each span, go through the difference and... do some magic. 0978 aspans = [] 0979 for fspan in fspans: 0980 aspan = [] 0981 for filtered_index, first in zip(fspan[:2], (True, False)): 0982 original_index = 0 0983 original_index_atdiff = 0 0984 track_index = 0 0985 adapted_index = None 0986 stop_at_next_eq = False 0987 for dtag, dseg in dlist: 0988 slen = len(dseg) 0989 if dtag == _new_tag: 0990 track_index += slen 0991 elif dtag == _old_tag: 0992 original_index += slen 0993 else: 0994 original_index += slen 0995 track_index += slen 0996 original_index_atdiff = original_index 0997 if stop_at_next_eq: 0998 break 0999 if track_index >= filtered_index: 1000 exlen = track_index - filtered_index # 0 if char-level diff 1001 if dtag == _equ_tag: 1002 adapted_index = original_index - exlen 1003 break 1004 else: # dtag must be _new_tag 1005 if first: 1006 adapted_index = original_index_atdiff 1007 break 1008 else: 1009 stop_at_next_eq = True 1010 if stop_at_next_eq: 1011 adapted_index = original_index 1012 if adapted_index is None: 1013 break 1014 aspan.append(adapted_index) 1015 if adapted_index is not None: 1016 aspan.extend(fspan[2:]) 1017 aspans.append(tuple(aspan)) 1018 1019 # Merge spans if requested. 1020 if merge: 1021 # Sort by start index immediately, for priority of extra elements. 
# Kinds of message parts, deciding how a part is differenced.
_dt_state, _dt_single, _dt_list = list(range(3))

# Message parts taking part in differencing, in processing order.
_msg_diff_parts = (
    ("obsolete", _dt_state),
    ("fuzzy", _dt_state),
    ("manual_comment", _dt_list),
    ("msgctxt_previous", _dt_single),
    ("msgid_previous", _dt_single),
    ("msgid_plural_previous", _dt_single),
    ("msgctxt", _dt_single),
    ("msgid", _dt_single),
    ("msgid_plural", _dt_single),
    ("msgstr", _dt_list),
)
_msg_dpart_types = dict(_msg_diff_parts)

_msg_curr_fields = (
    "msgctxt", "msgid", "msgid_plural",
)
_msg_currprev_fields = [(x, x + "_previous") for x in _msg_curr_fields]


def msg_diff (msg1, msg2, pfilter=None, addrem=None, diffr=False):
    """
    Create word-level difference between extraction-invariant parts of messages.

    For which parts of a message are considered extraction-invariant,
    see description of L{inv<message.Message_base>} instance variable
    of message objects.

    There are two return modes, depending on the value of C{diffr} parameter.

    If C{diffr} is C{False}, the difference is returned as list of 3-tuples of
    differences by message part: (part name, part item, word difference).
    The part name can be used to fetch the part value from the message,
    using L{get()<message.Message_base.get>} method of message objects.
    The part item is C{None} for singular message parts (e.g. C{msgid}),
    and index for list parts (e.g. C{msgstr}).
    See L{word_diff<diff.word_diff>} for the format
    of word-level difference.

    If C{diffr} is C{True}, then each part difference has a fourth element,
    the difference ratio; see L{word_diff} for its semantics. Additionally,
    the total difference ratio is computed, based on partial ones
    (also counting the zero difference of parts which were equal).
    The return value is now a 2-tuple of list of part differences
    (as 4-tuples) and the total difference ratio.

    Either of the messages can be given as C{None}. In case only one of
    the messages is C{None}, the difference of C{msgid} field will show
    that this field does not exist in the nonexistent message (according to
    format of nonexistent counterparts of L{word_diff<diff.word_diff>}).
    If both messages are C{None}, the difference is empty list, as the
    messages are same, even if nonexistent.

    Every C{msgstr} field can be passed through a filter before differencing,
    using the C{pfilter} parameter.

    Instead of constructing the full difference, using the C{addrem} parameter
    only equal, added, or removed segments can be reported.
    The value of this parameter is a string, such that the first character
    selects the type of partial difference: one of ('=', 'e') for equal,
    ('+', 'a') for added, and ('-', 'r') for removed segments, and the
    rest of the string is used as separator to join the selected segments
    (if the separator is empty, space is used instead).

    @param msg1: the message from which to make the difference
    @type msg1: L{Message_base<message.Message_base>} or None
    @param msg2: the message to which to make the difference
    @type msg2: L{Message_base<message.Message_base>} or None
    @param pfilter: filter to be applied to translation prior to differencing
    @type pfilter: callable
    @param addrem: report equal, added or removed segments instead of
        full difference, joined by what follows the selection character
    @type addrem: string
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @return: difference list
    @rtype: [(string, int/None, [(string, string)...])...]
        or ([(string, int/None, [(string, string)...], float)...], float)
    """

    # Create thoroughly empty dummy messages in place of null messages.
    mod_msgs = []
    for msg in (msg1, msg2):
        if msg is None:
            msg = MessageUnsafe()
            msg.msgid = None
            msg.msgstr = []
        mod_msgs.append(msg)
    msg1, msg2 = mod_msgs

    # For partial differencing, decide upon which part of diffs to take.
    ar_dtyp = None
    ar_sep = None
    if addrem:
        mode = addrem[0]
        ar_sep = str(addrem[1:] or " ")
        if mode in ("=", "e"):
            ar_dtyp = _equ_tag
        elif mode in ("+", "a"):
            ar_dtyp = _new_tag
        elif mode in ("-", "r"):
            ar_dtyp = _old_tag
        else:
            raise PologyError(
                _("@info",
                  "Unknown selection mode '%(mode)s' for partial differencing.",
                  mode=mode))

    # Diff two texts under the given diffing options.
    def _twdiff (text1, text2, islines=False, cpfilter=None):

        f_diff = line_diff if islines else word_diff

        if cpfilter:
            if not islines:
                text1 = cpfilter(text1)
                text2 = cpfilter(text2)
            else:
                text1 = [cpfilter(x) for x in text1]
                text2 = [cpfilter(x) for x in text2]

        # Markup format is taken from the newer message when available.
        fmt = (msg2 or msg1).format
        wdiff, dr = f_diff(text1, text2,
                           markup=True, format=fmt, diffr=True)
        if addrem:
            # Reduce the full difference to joined segments of selected type.
            if not islines:
                wdiff_part = None
                ar_segs = [x for t, x in wdiff if t == ar_dtyp]
                if text1 is not None or text2 is not None:
                    wdiff_part = ar_sep.join(ar_segs)
            else:
                wdiff_part = []
                for wdiff1, dr1 in wdiff:
                    ar_segs = [x for t, x in wdiff1 if t == ar_dtyp]
                    dr1 = 1.0 - dr1
                    if text1 or text2:
                        wdiff_part += [(ar_sep.join(ar_segs), dr1)]
            wdiff = wdiff_part
            dr = 1.0 - dr

        return wdiff, dr

    # Create diffs of relevant parts.
    part_diffs = []
    sumdr = 0.0
    sumw = 0.0 # ...unless something cleverer comes up, weigh each part same.
    for part, typ in _msg_diff_parts:
        if typ == _dt_single:
            val1 = msg1.get(part)
            val2 = msg2.get(part)
            wdiff, dr = _twdiff(val1, val2)
            part_diffs.append((part, None, wdiff, dr))
            sumdr += dr * 1.0
            sumw += 1.0
        elif typ == _dt_list:
            lst1 = msg1.get(part)
            lst2 = msg2.get(part)
            # Only translations are subject to the filter.
            cpf = pfilter if part == "msgstr" else None
            wdiffs = _twdiff(lst1, lst2, islines=True, cpfilter=cpf)[0]
            for item, (wdiff, dr) in enumerate(wdiffs):
                part_diffs.append((part, item, wdiff, dr))
                sumdr += dr * 1.0
                sumw += 1.0
        elif typ == _dt_state:
            # A state is differenced as presence of its own name.
            st1 = part if msg1.get(part) else ""
            st2 = part if msg2.get(part) else ""
            wdiff, dr = word_diff(st1, st2, diffr=True)
            part_diffs.append((part, None, wdiff, dr))
            sumdr += dr * 1.0
            sumw += 1.0
        else:
            raise PologyError(
                _("@info",
                  "Unhandled message part '%(part)s' encountered "
                  "while differencing.",
                  part=part))

    if diffr:
        dr = sumdr / sumw if sumw else 0.0
        return part_diffs, dr
    else:
        return [x[:3] for x in part_diffs]


_dcmnt_field = "auto_comment" # to use manual_comment would be bad idea
_dcmnt_head = "ediff:"
_dcmnt_head_esc = "~" # must be single character
_dcmnt_sep = ", "
_dcmnt_asep = " "
_dcmnt_ind_state = "state"
_dcmnt_ind_ctxtpad = "ctxtpad"
_dcmnt_ind_infsep = "infsep"
_dcmnt_all_inds = ( # ordered
    _dcmnt_ind_state, _dcmnt_ind_ctxtpad, _dcmnt_ind_infsep,
)
_ctxtpad_sep = "|"
_ctxtpad_noctxt = "~"
_ctxtpad_alnums = "abcdefghijklmnopqrstuvwxyz0123456789"
_infsep_blk = "~="
_infsep_minlen = 20

def msg_ediff (msg1, msg2, pfilter=None, addrem=None,
               emsg=None, ecat=None, eokpos=None, enoctxt=None,
               emptydc=False, colorize=False, diffr=False):
    """
    Create word-level embedded difference between extraction-invariant
    parts of messages.

    Like L{msg_diff}, but instead of difference list the result is a message
    with embedded differences, of the kind produced by L{word_ediff}.
    See L{msg_diff} for description of C{pfilter} and C{addrem} parameters,
    and L{word_ediff} for the format of embedded differences.
    Additionally, if C{pfilter} is given, C{msgstr} fields will be diffed
    both with and without the filter, and if the two diffs are not equal,
    both embeddings are going to be presented in the field,
    suitably visually separated.

    By default, a new message with embedded difference will be constructed,
    of the type of first non-None of C{msg2} and C{msg1}.
    Alternatively, the difference can be embedded into the message supplied
    by C{emsg} parameter.

    If resulting messages with embedded differences are to be inserted
    into a catalog, that catalog can be given by the C{ecat} parameter.
    Then, if the key of the resulting message would conflict one of
    those already in the catalog, its context will be appropriately padded
    to avoid the conflict.
    This is done by adding a pipe character and an unspecified number
    of alphanumerics (generally junk-looking) to the end of the C{msgctxt}.
    In case the conflict with a particular message in the catalog is
    acceptable (e.g. when resulting message is to be inserted in its place),
    the position of this message can be given by the C{eokpos} parameter.
    In case a certain value of C{msgctxt} should be padded regardless
    of whether there is a conflict or not,
    this value can be given by C{enoctxt} parameter.

    An additional automatic comment starting with C{ediff:}
    may be added to the message, possibly followed by some indicators
    necessary to complete the difference specification. These include:

      - C{state <STATE_DIFF> ...}: changes in message state, like
        C{obsolete} and C{fuzzy}; e.g. C{state {+obsolete+}} means
        that the message has been obsoleted from C{msg1} to C{msg2},
        while C{state {-obsolete-}} means that it has been revived.

      - C{ctxtpad <STRING>}: padding alphanumerics added to the C{msgctxt}
        field to avoid key collision with one of the messages from C{ecat}.

      - C{infsep <BLOCK> <LENGTH>}: if C{pfilter} was used, this indicator
        states the building block and length in blocks of in-field separators.

    By default the difference comment is not added if there are no indicators,
    but it may be forced by setting C{emptydc} parameter to C{True}.

    Embedded differences can be additionally colorized (e.g. for terminal)
    by setting C{colorize} parameter to C{True}.

    If C{diffr} is C{True}, aside from the message with embedded differences,
    the total difference ratio is returned (see L{msg_diff}).
    If C{pfilter} is given, the ratio refers to difference under filter.
    If both messages are C{None}, the result is C{None},
    or C{(None, 0.0)} if C{diffr} is C{True}.

    @param msg1: the message from which to make the difference
    @type msg1: L{Message_base<message.Message_base>} or None
    @param msg2: the message to which to make the difference
    @type msg2: L{Message_base<message.Message_base>} or None
    @param pfilter: filter to be applied to translation prior to differencing
    @type pfilter: callable
    @param addrem: report equal, added or removed segments instead of
        full difference, joined by what follows the selection character
    @type addrem: string
    @param emsg: message to embed the difference to
    @type emsg: L{Message_base<message.Message_base>}
    @param ecat: catalog of messages to avoid key conflict with
    @type ecat: L{Catalog<catalog.Catalog>}
    @param eokpos: position into C{ecat} where key conflict is ignored
    @type eokpos: int
    @param enoctxt: C{msgctxt} string that should be padded unconditionally
    @type enoctxt: string
    @param emptydc: whether to add difference comment even if empty
    @type emptydc: bool
    @param colorize: whether to colorize the difference
    @type colorize: bool
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @return: message with embedded differences (or None)
        and possibly difference ratio
    @rtype: type(emsg or msg2 or msg1 or None) or (type(~), float)
    """

    if msg1 is None and msg2 is None:
        # The ratio accompanies the result only when it was asked for.
        return (None, 0.0) if diffr else None

    # Compute the difference.
    wdiffs, totdr = msg_diff(msg1, msg2, addrem=addrem, diffr=True)
    wdiffs_pf = []
    if pfilter:
        wdiffs_pf, totdr = msg_diff(msg1, msg2, pfilter=pfilter,
                                    addrem=addrem, diffr=True)

    # Construct list of embedded diffs out of original difference list.
    if not addrem:
        mtoe = lambda x: (x[0], x[1], _assemble_ediff(x[2], colorize), x[3])
        ediffs = list(map(mtoe, wdiffs))
        ediffs_pf = list(map(mtoe, wdiffs_pf))
    else:
        ediffs = wdiffs
        ediffs_pf = wdiffs_pf

    # Construct the message to embed differences into.
    if emsg is None:
        tmsg = msg2 or msg1
        emsg = type(tmsg)()
        for part, typ in _msg_diff_parts:
            tval = tmsg.get(part)
            if tval is not None:
                setattr(emsg, part, type(tval)(tval))

    # Indicators for the difference comment.
    indargs = {}

    # Determine field separator for raw/filtered differences.
    # The separator is lengthened until it occurs in none of the diffs.
    if ediffs_pf:
        infseplen = _infsep_minlen
        infsepinc = 5
        all_ediffs = [x[2] for x in ediffs + ediffs_pf]
        while True:
            infsep = _infsep_blk * infseplen
            if not any(ed and infsep in ed for ed in all_ediffs):
                break
            infseplen += infsepinc
        indargs[_dcmnt_ind_infsep] = [_infsep_blk, str(infseplen)]

    # Embed differences.
    for i, (part, item, ediff, dr) in enumerate(ediffs):
        typ = _msg_dpart_types[part]
        if typ == _dt_single:
            setattr(emsg, part, ediff)
        elif typ == _dt_list:
            lst = emsg.get(part)
            lst.extend([""] * (item + 1 - len(lst)))
            if ediffs_pf:
                ediff_pf = ediffs_pf[i][2]
                if ediff_pf and ediff_pf != ediff:
                    ediff += "\n" + infsep + "\n" + ediff_pf
            lst[item] = ediff
        elif typ == _dt_state:
            stag, spart = wdiffs[i][2][0]
            if stag != _equ_tag:
                indargs.setdefault(_dcmnt_ind_state, []).append(ediff)
            sval = bool(stag in (_new_tag, _equ_tag) and spart)
            setattr(emsg, part, sval)
        else:
            raise PologyError(
                _("@info",
                  "Unhandled message part '%(part)s' encountered "
                  "while differencing.",
                  part=part))

    # Pad context to avoid conflicts.
    if (   (ecat is not None and emsg in ecat and ecat.find(emsg) != eokpos)
        or (enoctxt is not None and emsg.msgctxt == enoctxt)
    ):
        noctxtind = _ctxtpad_noctxt if emsg.msgctxt is None else ""
        octxt = emsg.msgctxt or ""
        while True:
            padding = "".join([random.choice(_ctxtpad_alnums)
                               for x in range(5)])
            emsg.msgctxt = octxt + _ctxtpad_sep + padding + noctxtind
            # Catalog may be absent when padding is forced by enoctxt alone.
            if (    (ecat is None or emsg not in ecat)
                and (enoctxt is None or emsg.msgctxt != enoctxt)
            ):
                break
        indargs[_dcmnt_ind_ctxtpad] = [padding]

    # If any of the existing comments looks like diff comment, escape it.
    ecomments = emsg.get(_dcmnt_field)
    for i in range(len(ecomments)):
        scmnt = ecomments[i].strip()
        p = scmnt.find(_dcmnt_head)
        if p >= 0 and scmnt[:p] == _dcmnt_head_esc * p:
            # The comment is stripped, so the escape goes to its very front.
            ecomments[i] = _dcmnt_head_esc + scmnt

    # Add diff comment.
    if indargs or emptydc:
        inds = []
        for ind in _dcmnt_all_inds: # to have deterministic ordering
            alst = indargs.get(ind)
            if alst is not None:
                inds.append(cjoin([ind] + alst, _dcmnt_asep))
        dcmnt = _dcmnt_head
        if inds:
            dcmnt += " " + cjoin(inds, _dcmnt_sep)
        ecomments.insert(0, dcmnt)

    return (emsg, totdr) if diffr else emsg


def msg_ediff_to_new (emsg, rmsg=None):
    """
    Resolve message with embedded difference to the newer message.

    Message cannot be properly resolved if C{addrem} parameter
    to L{msg_ediff} was used on embedding.
    If this function is called on such a message, the result is undefined.

    By default a new message object is created, but using the C{rmsg}
    parameter, an existing message can be given to be filled with all
    the resolved parts (keeping its own, ignored parts). This message can
    be the C{emsg} itself.

    If the resolved message evaluates to no message, the function
    returns C{None}, and C{rmsg} is not touched if it was given.

    Any states indicated as added by the difference comment are ignored
    in favor of the actual states of embedded difference message.
    The two sets should normally be equal, but if they are not,
    the actual state in effect overrides the indicated added state.

    @param emsg: resolvable message with embedded differences
    @type emsg: L{Message_base<message.Message_base>} or None
    @param rmsg: message to fill in the resolved parts
    @type rmsg: L{Message_base<message.Message_base>}

    @return: resolved message (or None)
    @rtype: type of first non-None of rmsg, emsg, or None
    """

    return _msg_ediff_to_x(emsg, rmsg, new=True)


def msg_ediff_to_old (emsg, rmsg=None):
    """
    Resolve message with embedded difference to the older message.

    Like L{msg_ediff_to_new}, only constructing the opposite message
    (except that states indicated as removed by difference comment are
    never ignored, i.e. they always override actual states).
    See L{msg_ediff_to_new} for parameters and return values.
    """

    return _msg_ediff_to_x(emsg, rmsg, new=False)


def _msg_ediff_to_x (emsg, rmsg, new):
    """
    Resolve message with embedded difference to newer or older message.

    Worker for L{msg_ediff_to_new} and L{msg_ediff_to_old}.

    @param emsg: resolvable message with embedded differences
    @type emsg: L{Message_base<message.Message_base>} or None
    @param rmsg: message to fill in the resolved parts (or None for a copy)
    @type rmsg: L{Message_base<message.Message_base>} or None
    @param new: whether to resolve to newer (otherwise older) message
    @type new: bool

    @return: resolved message (or None)
    """

    # No embedded message resolves to no message.
    if emsg is None:
        return None

    if new:
        word_ediff_to_x = word_ediff_to_new
        word_ediff_to_o = word_ediff_to_old
        line_ediff_to_x = line_ediff_to_new
        ignore_state_diff = True
    else:
        word_ediff_to_x = word_ediff_to_old
        word_ediff_to_o = word_ediff_to_new
        line_ediff_to_x = line_ediff_to_old
        ignore_state_diff = False

    # Work on copy if target message not given.
    if rmsg is None:
        rmsg = type(emsg)(emsg)

    # Since rmsg can be emsg itself, collect all attributes to set,
    # and set them in the end.
    atts_vals = []

    # Parse everything out of diff comment,
    # unescape comments which looked like diff comment and were escaped.
    states = {}
    ctxtpad = None
    infsep = None
    cmnts = []
    for cmnt in list(emsg.get(_dcmnt_field)):
        scmnt = cmnt.strip()
        p = scmnt.find(_dcmnt_head)
        if p == 0:
            dcmnt = scmnt[len(_dcmnt_head):]
            # FIXME: Checks for unknown indicators and bad arguments.
            for indargs in dcmnt.split(_dcmnt_sep.strip()):
                lst = indargs.strip().split(_dcmnt_asep)
                ind, args = lst[0], [word_ediff_to_x(x) for x in lst[1:]]
                if ind == _dcmnt_ind_state:
                    # States present on the wanted side are on...
                    for arg in args:
                        if _msg_dpart_types.get(arg) == _dt_state:
                            states[arg] = True
                    # ...and those present only on the opposite side are off.
                    args_o = [word_ediff_to_o(x) for x in lst[1:]]
                    for arg in args_o:
                        if _msg_dpart_types.get(arg) == _dt_state:
                            states[arg] = False
                elif ind == _dcmnt_ind_ctxtpad:
                    ctxtpad = args[0]
                elif ind == _dcmnt_ind_infsep:
                    infsep = args[0] * int(args[1])
        else:
            if p > 0 and scmnt[:p] == _dcmnt_head_esc * p:
                # Drop one escape character after any leading whitespace.
                nwp = 0
                while cmnt[nwp].isspace():
                    nwp += 1
                cmnt = cmnt[:nwp] + cmnt[nwp + 1:]
            cmnts.append(cmnt)

    listtype = type(rmsg.msgstr)

    # Put back cleaned comments.
    atts_vals.append((_dcmnt_field, listtype(cmnts)))

    # Remove context padding.
    if ctxtpad:
        val = emsg.get("msgctxt")
        p = val.rfind(ctxtpad) if val is not None else -1
        if (   p < 0
            or val[p - len(_ctxtpad_sep):p] != _ctxtpad_sep
            or val[p + len(ctxtpad):] not in (_ctxtpad_noctxt, "")
        ):
            raise PologyError(_("@info", "Malformed padded context."))
        if val[p + len(ctxtpad):] != _ctxtpad_noctxt:
            val = val[:p - len(_ctxtpad_sep)]
        else:
            val = None
        msgctxt_nopad = val

    # Resolve parts.
    for part, typ in _msg_diff_parts:
        if ctxtpad and part == "msgctxt":
            val = msgctxt_nopad
        else:
            val = emsg.get(part)
        if typ == _dt_single:
            nval = word_ediff_to_x(val)
            if nval is None and part == "msgid":
                return None
            atts_vals.append((part, nval))
        elif typ == _dt_list:
            lst = []
            for el in val:
                if infsep:
                    p = el.find(infsep)
                    if p >= 0: # strip filtered difference
                        # -1 to remove newline, never wrapping to the end
                        el = el[:max(p - 1, 0)]
                lst.append(el)
            nlst = listtype(line_ediff_to_x(lst))
            if nlst == [] and part == "msgstr":
                return None
            atts_vals.append((part, nlst))
        elif typ == _dt_state:
            # NOTE(review): nesting of this branch was reconstructed from
            # a whitespace-collapsed listing; confirm against upstream.
            if not ignore_state_diff:
                val = states.get(part)
                if val is not None:
                    atts_vals.append((part, val))
        else:
            raise PologyError(
                _("@info",
                  "Unhandled message part '%(part)s' encountered "
                  "while resolving difference.",
                  part=part))

    # Set resolved parts for real.
    for att, val in atts_vals:
        setattr(rmsg, att, val)

    return rmsg


def editprob (oldtext, newtext):
    """
    Compute the probability that a human would rather edit the old text
    to obtain the new text, than write it from scratch.

    Classical algorithms to compute similarity ratio between two texts
    sometimes produce high ratios for texts which a human would unlikely
    consider similar enough to make one text by editing the other,
    and vice versa.
    This function uses some heuristics to derive the probability
    that one text was really edited by a human into the other.

    Not commutative in general.

    If the texts are equal (including both being C{None}),
    the result is 1.0; otherwise, if either of the texts
    is C{None} or empty, the result is 0.0.

    @param oldtext: candidate for initial text
    @type oldtext: string
    @param newtext: current text
    @type newtext: string

    @returns: the probability of editing the old into the new text [0, 1]
    @rtype: float
    """

    if oldtext == newtext:
        return 1.0
    if not oldtext or not newtext:
        return 0.0

    # Consider always the case of editing from longer to shorter text.
    if len(oldtext) < len(newtext):
        shorttext, longtext = oldtext, newtext
    else:
        shorttext, longtext = newtext, oldtext

    # Construct diff.
    sm = SequenceMatcher(None, longtext, shorttext)
    mblocks = sm.get_matching_blocks()
    mblocks = sorted(mblocks, key=lambda x: x[1])
    # Leading sentinel, mirroring the zero-size block which ends the list.
    mblocks.insert(0, (0, 0, 0))

    # Accumulate probability.
    ep = 0.0
    for i in range(1, len(mblocks) - 1):
        lm = mblocks[i][2]
        # Unmatched lengths in shorter text before and after this block.
        ld1 = mblocks[i][1] - (mblocks[i - 1][1] + mblocks[i - 1][2])
        ld2 = mblocks[i + 1][1] - (mblocks[i][1] + mblocks[i][2])
        cf = (float(lm) / (lm + ld1 + ld2))**2
        # ...if cf would be set to 1, probability would be equal
        # to ordinary similarity ratio.
        ep += lm * cf
    ep /= len(shorttext)

    # Correct for different lengths of texts.
    rl = float(len(shorttext)) / len(longtext)
    ep *= 1 - (rl - 1)**4

    return ep


def descprob (descpath, ancpath, cutoff=None, getcsz=False):
    """
    Compute the probability that one PO file is a descendant of another.

    Sometimes PO files are renamed, split into two, joined into one,
    also with possible small changes in messages between old and new set.
    This function uses some heuristics to derive the probability
    that the PO file given by C{ancpath} is an ancestor of the PO file
    given by C{descpath}.
    If the probability cannot be determined because the file with
    less text contains no messages, or the other one contains no text,
    C{None} is reported in place of the probability.
    If a file cannot be read, L{PologyError} is raised.

    By default, only equality versus non-equality of messages is
    taken into consideration. If C{cutoff} is set to a number 0.0-1.0,
    then fuzzy matching is performed, and partial similarities greater
    than the cutoff are counted into the final probability.
    However, this reduces performance by orders of magnitude
    (the more the lower the cutoff; 0.7-0.8 may be a reasonable tradeoff).

    @param descpath: path to possible descendant PO file
    @type descpath: string
    @param ancpath: path to possible ancestor PO file
    @type ancpath: string
    @param cutoff: the cutoff for fuzzy matching
    @type cutoff: float
    @param getcsz: also report the referent character sizes
        of the first and second file
    @type getcsz: bool

    @returns: the probability of ancestry [0, 1] (or C{None}),
        the referent character sizes if requested
    @rtype: C{None} or float or (float/C{None}, int, int)
    """

    # Read representative texts of messages.
    # Ignore non-unique texts (contexts may have been stripped).
    dtexts = set(_read_msg_texts(descpath))
    atexts = set(_read_msg_texts(ancpath))

    # Make the computation commutative, by always taking
    # the file with less text as possible descendant.
    dtotchar = sum(len(t) for t in dtexts)
    atotchar = sum(len(t) for t in atexts)
    dtotchar_orig = dtotchar
    atotchar_orig = atotchar
    if dtotchar > atotchar:
        dtexts, atexts = atexts, dtexts
        dtotchar, atotchar = atotchar, dtotchar

    # Without messages or text there is nothing to base the ratios on
    # (and they would divide by zero).
    if not dtexts or atotchar == 0:
        if getcsz:
            return None, dtotchar_orig, atotchar_orig
        else:
            return None

    # Count how many texts from descendant are in ancestor too.
    # This gives basic probability.
    neq = len(dtexts.intersection(atexts))
    prob = float(neq) / len(dtexts)

    # For each text in descendant not found in ancestor,
    # sum similarity ratios to nearest text in ancestor,
    # and add to the probability.
    if cutoff is not None:
        sumsim = 0.0
        for dt in dtexts.difference(atexts):
            seqm = SequenceMatcher()
            seqm.set_seq2(dt)
            maxsim = 0.0
            for at in atexts:
                seqm.set_seq1(at)
                # Go from cheapest to most expensive ratio,
                # bailing out as soon as one falls under the cutoff.
                sim = seqm.real_quick_ratio()
                if sim > cutoff:
                    sim = seqm.quick_ratio()
                    if sim > cutoff:
                        sim = seqm.ratio()
                        if sim > cutoff:
                            maxsim = max(maxsim, sim)
            sumsim += maxsim
        prob += sumsim / len(dtexts)

    # Correct probability for small files.
    # This is necessary due to enforced commutativity above.
    limtotchar = 100 # e.g. 10 messages with 2 words (10 characters) each
    if dtotchar < limtotchar:
        prob *= (float(dtotchar) / atotchar)**0.5

    if getcsz:
        return prob, dtotchar_orig, atotchar_orig
    else:
        return prob


def _read_msg_texts (path):
    """
    Quickly collect C{msgid} texts from a PO file.

    @param path: path to PO file
    @type path: string

    @returns: texts of C{msgid} fields, in order of appearance
    @rtype: [string...]
    """

    # NOTE: This function needs to be as fast as possible,
    # so instead of using file.Catalog, the file is manually parsed
    # to the necessary minimum.
    # It is more important to be fast than correct,
    # so parsing ignores some valid but highly unusual PO formatting.

    # NOTE: Intentionally ignoring: file encoding, escaping, msgctxt.

    try:
        with open(path) as ifl:
            lines = ifl.readlines()
    except (OSError, UnicodeError):
        raise PologyError(
            _("@info",
              "Cannot read file '%(file)s'.",
              file=path))

    msgids = []
    segs = []
    inmsgid = False
    for line in lines:
        line = line.strip()
        if line.startswith("msgid "):
            segs = []
            line = line[5:].strip()
            inmsgid = True
        elif not line.startswith("\""):
            # Anything but a continuation line ends the current msgid.
            if inmsgid:
                msgids.append("".join(segs))
                inmsgid = False
        if inmsgid:
            segs.append(line[1:-1]) # strip quotes
    # Do not lose the msgid with which the file ends.
    if inmsgid:
        msgids.append("".join(segs))

    return msgids