pology/pology/diff.py

# -*- coding: UTF-8 -*-
0002
0003 """
0004 Produce special diffs between strings and other interesting objects.
0005
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009
0010 from difflib import SequenceMatcher
0011 import random
0012 import re
0013
0014 from pology import PologyError, _, n_
0015 from pology.colors import ColorString, cjoin
0016 from pology.message import MessageUnsafe
0017 from pology.report import error
0018 from pology.split import split_text
0019
0020
# Tags marking elements of tagged difference lists:
# new (added), old (removed), and equal (present in both).
_new_tag = "+"
_old_tag = "-"
_equ_tag = " "

# Single-character building blocks of embedded difference wrappers.
_new_vtag = "+"
_new_opnc = "{"
_new_clsc = "}"

_old_vtag = "-"
_old_opnc = "{"
_old_clsc = "}"

# Extension appended to a difference tag when the other text was None.
_tagext_none = "~"
_tagext_none_len = len(_tagext_none)

# Complete opening and closing wrappers of embedded differences.
_new_opn = _new_opnc + _new_vtag
_new_cls = _new_vtag + _new_clsc
_old_opn = _old_opnc + _old_vtag
_old_cls = _old_vtag + _old_clsc
_all_wrappers = {_new_opn, _new_cls, _old_opn, _old_cls}

# Sanity check: every wrapper element must be exactly one character long.
_tmp_wr = (_new_vtag, _new_opnc, _new_clsc, _old_vtag, _old_opnc, _old_clsc)
_tmp_wrlen = [len(x) for x in _tmp_wr]
if set(_tmp_wrlen) != {1}:
    error(_("@info \"ediff\" is shorthand for \"embedded difference\"",
            "All ediff wrapper elements must be of unit length."))
0049
0050
class _Sequence_diff_wrapper:
    """
    Wrapper which makes an object hash, iterate and compare
    by its diffing representative instead of by itself.

    The original object is kept in the C{obj} attribute, and
    the representative (result of calling C{reductf} on the object,
    or the object itself if no function is given) is used for
    hashing, iteration and equality.
    """

    def __init__ (self, obj, reductf=None):
        self.obj = obj
        self._robj = reductf(obj) if reductf else obj

    def __hash__ (self):
        return hash(self._robj)

    def __iter__ (self):
        return iter(self._robj)

    def __eq__ (self, other):
        # Comparison with a foreign object must not blow up
        # on the missing representative; let Python decide instead.
        if not isinstance(other, _Sequence_diff_wrapper):
            return NotImplemented
        return self._robj == other._robj
0065
0066
def tdiff (seq_old, seq_new, reductf=None, diffr=False):
    """
    Create tagged difference of two sequences.

    Difference is presented as a list of tuples,
    with each tuple composed of a difference tag and a sequence element.
    Difference tag is string C{"+"}, C{"-"}, or C{" "}, for elements which
    belong to the new, the old, or to both sequences, respectively.

    The list is ordered such that collecting all elements not tagged
    as old will reconstruct the new sequence, and collecting all not tagged
    as new will reconstruct the old sequence.

    If requested by the C{diffr} parameter, also reported is the
    I{difference ratio}, a heuristic measure of difference between two texts.
    0.0 means no difference, and 1.0 that sequences are completely different.

    Examples::

        >>> s1 = "A type of foo".split()
        >>> s2 = "A kind of foo".split()
        >>> tdiff(s1, s2)
        [(' ', 'A'), ('-', 'type'), ('+', 'kind'), (' ', 'of'), (' ', 'foo')]
        >>> tdiff(s1, s2, diffr=True)
        ([(' ', 'A'), ('-', 'type'), ('+', 'kind'), (' ', 'of'), (' ', 'foo')],
        0.25)

    To be able to diff them, sequence elements only need to be hashable.
    However, for compound elements it may be better to diff them
    only by some subset of data, e.g. by one of their string attributes.
    Parameter C{reductf} can be used to specify a reduction function, which
    will be called on each element to produce its diffing representative.

    @param seq_old: sequence to diff from
    @type seq_old: sequence with hashable elements
    @param seq_new: sequence to diff to
    @type seq_new: sequence with hashable elements
    @param reductf: function to produce diffing representatives
    @type reductf: (sequence element) -> diffing representative
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @returns: difference list and possibly difference ratio
    @rtype: [(string, element)...] or ([(string, element)...], float)
    """

    # With a reduction function, diff the wrapped elements, which
    # hash and compare by their representatives.
    if reductf is not None:
        seq_old = [_Sequence_diff_wrapper(x, reductf) for x in seq_old]
        seq_new = [_Sequence_diff_wrapper(x, reductf) for x in seq_new]

    seqmatch = SequenceMatcher(None, seq_old, seq_new)
    opcodes = seqmatch.get_opcodes()
    if diffr:
        dr = 1.0 - seqmatch.ratio()

    dlist = []
    for opcode, i1, i2, j1, j2 in opcodes:
        if opcode == "equal":
            dlist.extend([(_equ_tag, el) for el in seq_old[i1:i2]])
        elif opcode == "replace":
            # Old elements always precede the new ones replacing them.
            dlist.extend([(_old_tag, el) for el in seq_old[i1:i2]])
            dlist.extend([(_new_tag, el) for el in seq_new[j1:j2]])
        elif opcode == "delete":
            dlist.extend([(_old_tag, el) for el in seq_old[i1:i2]])
        elif opcode == "insert":
            dlist.extend([(_new_tag, el) for el in seq_new[j1:j2]])
        else:
            raise PologyError(
                _("@info \"opcode\" is shorthand for \"operation code\"",
                  "Unknown opcode '%(code)s' from sequence matcher.",
                  code=opcode))

    # Unwrap back to original elements.
    if reductf is not None:
        dlist = [(tag, el.obj) for tag, el in dlist]

    return (dlist, dr) if diffr else dlist
0142
0143
def itdiff (seq_old, seq_new, reductf=None, cutoff=0.6, diffr=False):
    """
    Create interleaved tagged difference of two sequences.

    Works like L{tdiff}, but every contiguous block of removed and added
    elements is additionally interleaved heuristically by similarity,
    so that a removed element may be directly followed by the added
    element found to be similar to it.
    This makes it possible to later compute an inner difference
    of each such pair (e.g. word diff within line diff).

    Example::

        >>> s1 = "Two blue airplanes".split()
        >>> s2 = "Two bluish ships".split()
        >>> tdiff(s1, s2)
        [(' ', 'Two'), ('-', 'blue'), ('-', 'airplanes'), ('+', 'bluish'),
         ('+', 'ships')]
        >>> itdiff(s1, s2)
        [(' ', 'Two'), ('-', 'blue'), ('+', 'bluish'), ('-', 'airplanes'),
         ('+', 'ships')]

    Interleaving requires each element to be a sequence itself.
    Therefore the function given by C{reductf}, which otherwise has
    the same meaning as in L{tdiff}, must here produce a sequence
    as the diffing representative (e.g. a string).

    Parameter C{cutoff} is the minimal similarity at which two elements
    are considered similar at all.

    @param seq_old: sequence to diff from
    @type seq_old: sequence with hashable elements
    @param seq_new: sequence to diff to
    @type seq_new: sequence with hashable elements
    @param reductf: function to produce diffing representatives
    @type reductf: (sequence element) -> representative sequence
    @param cutoff: minimal similarity to consider elements similar
    @type cutoff: float [0, 1]
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @returns: interleaved difference list and possibly difference ratio
    @rtype: [(string, element)...] or ([(string, element)...], float)
    """

    if diffr:
        dlist, dr = tdiff(seq_old, seq_new, reductf=reductf, diffr=True)
    else:
        dlist = tdiff(seq_old, seq_new, reductf=reductf, diffr=False)

    def arrange (els_old, els_new):
        # Interleave only when there is something on both sides.
        if els_old and els_new:
            return _dinterleave(els_old, els_new, reductf, cutoff)
        return (  [(_old_tag, x) for x in els_old]
                + [(_new_tag, x) for x in els_new])

    idlist = []
    pend_old = []
    pend_new = []
    for item in dlist:
        tag, el = item
        if tag == _equ_tag:
            # An equal element closes the pending block of differences.
            if pend_old or pend_new:
                idlist.extend(arrange(pend_old, pend_new))
                pend_old = []
                pend_new = []
            idlist.append(item)
        elif tag == _old_tag:
            pend_old.append(el)
        else:
            pend_new.append(el)
    if pend_old or pend_new:
        idlist.extend(arrange(pend_old, pend_new))

    if diffr:
        return idlist, dr
    return idlist
0218
0219
def _dinterleave (els_old, els_new, reductf, cutoff):
    """
    Interleave blocks of old and new elements by their similarity.

    All contiguous placements of old and of new elements over
    C{len(els_old) + len(els_new)} places are tried against each other,
    and the combination with the highest sum of pair similarities
    is selected (the last one tried, among equally good ones).
    Similarity of a pair is the C{SequenceMatcher} ratio of their
    diffing representatives, taken as zero if below C{cutoff}.

    @param els_old: old elements
    @type els_old: list
    @param els_new: new elements
    @type els_new: list
    @param reductf: function to produce diffing representatives, or None
    @type reductf: (sequence element) -> representative sequence
    @param cutoff: minimal similarity to consider elements similar
    @type cutoff: float

    @returns: tagged difference list of interleaved elements
    @rtype: [(string, element)...]
    """

    reductf = reductf or (lambda x: x)

    # Full placements (_plinds_full) would be too expensive.
    nplaces = len(els_old) + len(els_new)
    pls_old = _plinds_cont(len(els_old), nplaces, 0)
    pls_new = _plinds_cont(len(els_new), nplaces, 0)
    pls_old.reverse() # so that last old-new pair is top-bottom

    # The same pair of elements lines up in many placement combinations,
    # so compute the similarity of each pair only once.
    simcache = {}

    def pair_sim (i_old, i_new):
        key = (i_old, i_new)
        if key not in simcache:
            seq_old = reductf(els_old[i_old])
            seq_new = reductf(els_new[i_new])
            r = SequenceMatcher(None, seq_old, seq_new).ratio()
            simcache[key] = r if r >= cutoff else 0.0
        return simcache[key]

    maxsim = 0.0
    opt_pairs = list(zip(pls_old[-1], pls_new[-1]))
    for pl_old in pls_old:
        for pl_new in pls_new:
            pairs = list(zip(pl_old, pl_new))
            sim = 0.0
            for i_old, i_new in pairs:
                if i_old is None or i_new is None:
                    continue
                sim += pair_sim(i_old, i_new)
            if sim >= maxsim: # >= so that last equal wins
                maxsim = sim
                opt_pairs = pairs

    dlist = []
    for i_old, i_new in opt_pairs:
        if i_old is not None:
            dlist.append((_old_tag, els_old[i_old]))
        if i_new is not None:
            dlist.append((_new_tag, els_new[i_new]))

    return dlist
0257
0258
def _plinds_full (ninds, nplaces, baseind):
    """
    List all order-preserving placements of indices over places.

    Each placement is a tuple of length C{nplaces}, holding the indices
    C{baseind}, C{baseind + 1}, ... (C{ninds} of them) in increasing
    order, with C{None} in the places left empty.
    An empty list is returned if there are more indices than places.
    """

    if nplaces < ninds:
        return []
    if ninds <= 0:
        return [(None,) * nplaces]

    placements = []
    # First place taken by the base index...
    for rest in _plinds_full(ninds - 1, nplaces - 1, baseind + 1):
        placements.append((baseind,) + rest)
    # ...or first place left empty.
    for rest in _plinds_full(ninds, nplaces - 1, baseind):
        placements.append((None,) + rest)
    return placements
0270
0271
def _plinds_cont (ninds, nplaces, baseind):
    """
    List all contiguous placements of indices over places.

    Each placement is a tuple of length C{nplaces}, holding the indices
    C{baseind}, C{baseind + 1}, ... (C{ninds} of them) as one unbroken
    run, with C{None} in the places before and after the run.
    Placements are ordered by increasing position of the run.
    An empty list is returned if there are more indices than places.
    """

    # Honor the base index, same as _plinds_full does.
    insinds = tuple(range(baseind, baseind + ninds))
    nfree = nplaces - ninds
    return [(None,) * i + insinds + (None,) * (nfree - i)
            for i in range(nfree + 1)]
0279
0280
def word_diff (text_old, text_new, markup=False, format=None, diffr=False):
    """
    Create word-level difference between old and new text.

    The difference is computed by looking at texts as collections of words
    and intersegments. Difference is presented as a list of tuples,
    with each tuple composed of a difference tag and a text segment.
    Difference tag is string C{"+"}, C{"-"}, or C{" "}, for text segments
    which are new, old, or present in both texts, respectively.
    If one of the texts is C{None}, as opposed to empty string,
    a tilde is appended to the base difference tag.

    The list is ordered such that joining all text segments not marked
    as old will reconstruct the new text, and joining all not marked
    as new will reconstruct the old text.
    Within a group of differing segments, the old segment
    comes before the new one.

    If requested by the C{diffr} parameter, also reported is the
    I{difference ratio}, a heuristic measure of difference between two texts.
    0.0 means no difference, and 1.0 that the texts are completely different.

    Differencing may take into account when the texts are expected to have
    XML-like markup, or when they are of certain format defined by Gettext.

    Examples::

        >>> s1 = "A new type of foo."
        >>> s2 = "A new kind of foo."
        >>> word_diff(s1, s2)
        [(' ', 'A new '), ('-', 'type'), ('+', 'kind'), (' ', ' of foo.')]
        >>> word_diff(s1, s2, diffr=True)
        ([(' ', 'A new '), ('-', 'type'), ('+', 'kind'), (' ', ' of foo.')],
        0.36363636363636365)
        >>> word_diff(s1, None, diffr=True)
        ([('-~', 'A new type of foo.')], 1.0)
        >>> word_diff(None, s2, diffr=True)
        ([('+~', 'A new kind of foo.')], 1.0)

    @param text_old: the old text
    @type text_old: string or None
    @param text_new: the new text
    @type text_new: string or None
    @param markup: whether C{<...>} markup can be expected in the texts
    @type markup: bool
    @param format: Gettext format flag (e.g. C{"c-format"}, etc.)
    @type format: string
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @returns: difference list and possibly difference ratio
    @rtype: [(string, string)...] or ([(string, string)...], float)
    """

    # Special cases, when one or both texts are None, or both are empty.
    specdlist = None
    if text_old is None and text_new is None:
        specdlist = []
        specdr = 0.0
    elif text_old is None:
        specdlist = [(_new_tag + _tagext_none, text_new)]
        specdr = 1.0
    elif text_new is None:
        specdlist = [(_old_tag + _tagext_none, text_old)]
        specdr = 1.0
    elif text_new == "" and text_old == "":
        specdlist = [(_equ_tag, "")]
        specdr = 0.0
    if specdlist is not None:
        return (specdlist, specdr) if diffr else specdlist

    # Split text into segments: words and intersections, combined into
    # single lists for old and new text. Use words as is, but split
    # intersections further into single characters.
    segments = []
    segment_isintr = []
    for text in (text_old, text_new):
        lw, li = split_text(text, markup, format)
        segs = []
        isintr = []
        # An explicit loop is required here: a bare map() call is lazy
        # in Python 3 and would never fill the segment lists.
        # NOTE(review): assumes split_text returns one more intersection
        # than words, so the last word slot is padded with an empty
        # string -- confirm against split_text.
        for intr, word in zip(li, lw + [""]):
            segs.extend(list(intr) + [word])
            isintr.extend([True] * len(intr) + [False])
        segments.append(segs)
        segment_isintr.append(isintr)

    # Create the tagged difference.
    dlist = tdiff(segments[0], segments[1])

    # Recompute which elements of the difference are intersections.
    dlist_isintr = []
    i_old = 0
    i_new = 0
    for tag, seg in dlist:
        if tag == _old_tag:
            dlist_isintr.append(segment_isintr[0][i_old])
        else:
            dlist_isintr.append(segment_isintr[1][i_new])

        if tag != _new_tag:
            i_old += 1
        if tag != _old_tag:
            i_new += 1

    # Reshuffle so that all old-new elements consecutive but for the
    # intersections are grouped into all old followed by all new,
    # with intersections included in both.
    ndlist = []
    i = 0
    while i < len(dlist):
        while i < len(dlist) and dlist[i][0] == _equ_tag:
            ndlist.append(dlist[i])
            i += 1
        seq_new = []
        seq_old = []
        i_last_diff = i
        while i < len(dlist) and (dlist[i][0] != _equ_tag or dlist_isintr[i]):
            if dlist[i][0] != _new_tag:
                seq_old.append(dlist[i][1])
            if dlist[i][0] != _old_tag:
                seq_new.append(dlist[i][1])
            if dlist[i][0] != _equ_tag:
                i_last_diff = i
            i += 1
        # Drop equal intersections trailing the last differing element;
        # they will be picked up as equal in the next pass.
        for iex in range(i_last_diff, i - 1):
            seq_new.pop()
            seq_old.pop()
        i = i_last_diff + 1
        if seq_old:
            ndlist.append((_old_tag, "".join(seq_old)))
        if seq_new:
            ndlist.append((_new_tag, "".join(seq_new)))
    dlist = ndlist

    # Join contiguous new/old/both segments, make tagged tuples.
    ndlist = []
    S_EQU, S_NEW, S_OLD = list(range(3))
    state = S_EQU
    cseg = []
    len_equ, len_old, len_new = 0, 0, 0
    _sen_tag = "."
    dlist.append((_sen_tag, "")) # sentry
    for tag, seg in dlist:

        if state == S_EQU and tag in (_new_tag, _old_tag, _sen_tag):
            if cseg:
                ndlist.append((_equ_tag, "".join(cseg)))
            cseg = []
            if tag == _new_tag:
                state = S_NEW
            else:
                state = S_OLD

        elif state == S_OLD and tag in (_equ_tag, _new_tag, _sen_tag):
            if cseg:
                ndlist.append((_old_tag, "".join(cseg)))
            cseg = []
            if tag == _equ_tag:
                state = S_EQU
            else:
                state = S_NEW

        elif state == S_NEW and tag in (_equ_tag, _old_tag, _sen_tag):
            if cseg:
                ndlist.append((_new_tag, "".join(cseg)))
            cseg = []
            if tag == _equ_tag:
                state = S_EQU
            else:
                state = S_OLD

        if tag == _old_tag:
            len_old += len(seg)
        elif tag == _new_tag:
            len_new += len(seg)
        else:
            len_equ += len(seg)
        if seg:
            cseg.append(seg)

    dlist = ndlist

    # Difference ratio is the share of non-equal characters.
    len_all = len_new + len_old + len_equ
    if len_all > 0:
        diff_ratio = 1.0 - float(len_equ) / float(len_all)
    else:
        diff_ratio = 0.0

    return (dlist, diff_ratio) if diffr else dlist
0470
0471
def word_ediff (text_old, text_new, markup=False, format=None, colorize=False,
                diffr=False):
    """
    Create word-level embedded difference between old and new texts.

    The difference is computed as by L{word_diff}, but it is returned
    as a single text, in which new segments are wrapped as C{{+...+}}
    and old segments as C{{-...-}}.
    A difference wrapper already present in the text is escaped
    by inserting a tilde, e.g. C{"{+...+}"} -> C{"{~+...+~}"};
    an already escaped wrapper gets another tilde, and so on.

    If one of the texts is C{None}, the whole other text is wrapped
    as the corresponding difference, and a tilde is added to the end
    to indicate that the other text was C{None}.
    If neither text is C{None}, but the embedded difference happens to
    end with a tilde, that tilde is escaped by another tilde.
    If both texts are C{None}, the difference is C{None} too.

    With C{colorize} enabled, the embedded difference is additionally
    highlighted using color markup of L{ColorString<colors.ColorString>},
    and the returned text is a C{ColorString}.

    See L{word_diff} for description of other parameters.

    @param colorize: whether to colorize differences
    @type colorize: bool

    @returns: string with embedded differences and possibly difference ratio
    @rtype: string/ColorString/None or (string/ColorString/None, float)

    @see: L{word_diff}
    """

    dlist, dr = word_diff(text_old, text_new, markup, format, diffr=True)
    if dlist:
        dtext = _assemble_ediff(dlist, colorize)
    else:
        # Empty difference list means that both texts were None.
        dtext, dr = None, 0.0

    if diffr:
        return dtext, dr
    return dtext
0515
0516
# Patterns capturing the content of old and of new embedded differences;
# every wrapper character is backslash-escaped.
_capt_old_rx = re.compile(
    "\\%s\\%s(.*?)\\%s\\%s" % (_old_opnc, _old_vtag, _old_vtag, _old_clsc),
    re.U|re.S)
_capt_new_rx = re.compile(
    "\\%s\\%s(.*?)\\%s\\%s" % (_new_opnc, _new_vtag, _new_vtag, _new_clsc),
    re.U|re.S)
0521
0522
def word_ediff_to_old (dtext):
    """
    Extract the old version (-) of the text from its embedded difference.

    If there was no old text, the result is C{None}.

    @param dtext: text with embedded differences
    @type dtext: string

    @returns: old version of the text
    @rtype: string or None

    @see: L{word_ediff}
    """

    # Keep content of old segments, drop new segments.
    return _word_ediff_to_oldnew(dtext, capt_this_rx=_capt_old_rx,
                                 capt_other_rx=_capt_new_rx)
0539
0540
def word_ediff_to_new (dtext):
    """
    Extract the new version (+) of the text from its embedded difference.

    If there was no new text, the result is C{None}.

    @param dtext: text with embedded differences
    @type dtext: string

    @returns: new version of the text
    @rtype: string or None

    @see: L{word_ediff}
    """

    # Keep content of new segments, drop old segments.
    return _word_ediff_to_oldnew(dtext, capt_this_rx=_capt_new_rx,
                                 capt_other_rx=_capt_old_rx)
0557
0558
def _word_ediff_to_oldnew (dtext, capt_this_rx, capt_other_rx):
    """
    Recover one version of the text from its embedded difference.

    Segments matched by C{capt_this_rx} are replaced by their content,
    and segments matched by C{capt_other_rx} are removed entirely.
    C{None} is returned if the embedded difference is C{None}, or if
    it indicates that the requested version of the text was C{None}.
    """

    if dtext is None:
        return None
    if isinstance(dtext, ColorString):
        dtext = dtext.resolve("none")

    unwrapped = capt_this_rx.sub(r"\1", dtext)
    text = _unescape_ewraps(capt_other_rx.sub(r"", unwrapped))
    if not text.endswith(_tagext_none):
        return text

    # Strip the trailing marker; if nothing remains and the difference
    # had segments of the other kind, this version did not exist at all.
    text = text[:-_tagext_none_len]
    if not text and capt_other_rx.search(dtext):
        return None
    return text
0574
0575
def word_ediff_to_rem (dtext, sep=" "):
    """
    Extract removed segments (-) from text with embedded differences.

    With a separator other than C{None}, selected segments are joined
    by it into a single string; with C{None} as separator, the list
    of selected segments is returned.
    In both cases the result is C{None} if there was no old text.

    @param dtext: text with embedded differences
    @type dtext: string
    @param sep: separator with which to join selected segments
    @type sep: string or None

    @returns: text with only the removed segments
    @rtype: string or list or None

    @see: L{word_ediff}
    """

    return _word_ediff_to_addrem(dtext, capt_this_rx=_capt_old_rx, sep=sep)
0596
0597
def word_ediff_to_add (dtext, sep=" "):
    """
    Extract added segments (+) from text with embedded differences.

    With a separator other than C{None}, selected segments are joined
    by it into a single string; with C{None} as separator, the list
    of selected segments is returned.
    In both cases the result is C{None} if there was no new text.

    @param dtext: text with embedded differences
    @type dtext: string
    @param sep: separator with which to join selected segments
    @type sep: string or None

    @returns: text with only the added segments
    @rtype: string or list or None

    @see: L{word_ediff}
    """

    return _word_ediff_to_addrem(dtext, capt_this_rx=_capt_new_rx, sep=sep)
0618
0619
def _word_ediff_to_addrem (dtext, capt_this_rx, sep):
    """
    Collect contents of all segments matched by C{capt_this_rx}.

    Segments are joined by C{sep}, or returned as a list if C{sep}
    is C{None}. C{None} is returned if the embedded difference is
    C{None}, or if nothing was collected and the difference ends
    with a closing wrapper character followed by a tilde.
    """

    if dtext is None:
        return None
    if isinstance(dtext, ColorString):
        dtext = dtext.resolve("none")

    found = capt_this_rx.findall(dtext)
    segs = found if sep is None else sep.join(found)
    if segs:
        return segs

    # Nothing selected: tell apart "no segments" from "other text was None".
    none_ends = (_old_clsc + _tagext_none, _new_clsc + _tagext_none)
    return None if dtext.endswith(none_ends) else segs
0634
0635
def line_diff (lines_old, lines_new, markup=False, format=None, diffr=False):
    """
    Create word-level difference between old and new lines of text.

    Lines are first differenced as whole units, and then each block
    of differing lines is differenced on word-level by L{word_diff}.
    The result is a list with one item per resulting line,
    each item being a word diff as constructed by L{word_diff}
    (paired with its difference ratio, if C{diffr} is requested).
    See L{word_diff} for description of keyword parameters.
    The overall difference ratio is the line-length weighted average
    of word difference ratios per line.

    @param lines_old: old lines of text
    @type lines_old: list of strings

    @param lines_new: new lines of text
    @type lines_new: list of strings

    @returns: difference list and possibly difference ratios
    @rtype: [[(string, string)...]...]
        or ([([(string, string)...], float)...], float)
    """

    # Line-level difference.
    dlist = tdiff(lines_old, lines_new)
    ndl = len(dlist)

    # Walk over the difference, passing equal lines through and
    # grouping each run of differing lines into old and new lines.
    # For every resulting line, add up weighted difference ratios.
    wdiffs = []
    sumwdrs = 0.0
    sumws = 0.0
    pos = 0
    while pos < ndl:
        while pos < ndl and dlist[pos][0] == _equ_tag:
            line = dlist[pos][1]
            wdiffs.append(([(_equ_tag, line)], 0.0))
            w = len(line)
            sumwdrs += 0.0 * w
            sumws += w
            pos += 1

        blk_old = []
        blk_new = []
        while pos < ndl and dlist[pos][0] != _equ_tag:
            tag, line = dlist[pos][0], dlist[pos][1]
            if tag != _new_tag:
                blk_old.append(line)
            if tag != _old_tag:
                blk_new.append(line)
            pos += 1

        if blk_old and blk_new:
            # Pair each line of the shorter block with a line of the longer
            # block, choosing by minimal local difference ratio.
            # FIXME: Lines are placed greedily, best first line, then second,
            # etc. For higher precision, all combinations should be tested.
            swapped = len(blk_old) > len(blk_new)
            if swapped:
                lshort, llong, tag_long = blk_new, blk_old, _old_tag
            else:
                lshort, llong, tag_long = blk_old, blk_new, _new_tag
            nshort = len(lshort)
            nlong = len(llong)

            k = 0
            for j in range(nshort):
                best_dr = 1.1
                best_wdiff = []
                best_k = -1
                # Leave enough long lines for the remaining short lines.
                for cand in range(k, nlong - nshort + j + 1):
                    if swapped:
                        t_old, t_new = llong[cand], lshort[j]
                    else:
                        t_old, t_new = lshort[j], llong[cand]
                    wdiff, wdr = word_diff(t_old, t_new, markup, format,
                                           diffr=True)
                    if best_dr > wdr:
                        best_dr = wdr
                        best_wdiff = wdiff
                        best_k = cand
                # Long lines skipped before the best match stay unpaired.
                for skip in range(k, best_k):
                    wdiffs.append(([(tag_long + _tagext_none, llong[skip])],
                                   1.0))
                    w = len(llong[skip])
                    sumwdrs += 1.0 * w
                    sumws += w
                k = best_k
                wdiffs.append((best_wdiff, best_dr))
                w = len(llong[k])
                sumwdrs += best_dr * w
                sumws += w
                k += 1
            # Long lines left after all short ones were placed.
            for rest in range(k, nlong):
                wdiffs.append(([(tag_long + _tagext_none, llong[rest])], 1.0))
                w = len(llong[rest])
                sumwdrs += 1.0 * w
                sumws += w

        elif blk_old:
            for line in blk_old:
                wdiffs.append(([(_old_tag + _tagext_none, line)], 1.0))
            w = sum(map(len, blk_old))
            sumwdrs += 1.0 * w
            sumws += w

        elif blk_new:
            for line in blk_new:
                wdiffs.append(([(_new_tag + _tagext_none, line)], 1.0))
            w = sum(map(len, blk_new))
            sumwdrs += 1.0 * w
            sumws += w

    # Weighted-averaged diff-ratio.
    if sumws > 0.0:
        dr = sumwdrs / sumws
    else:
        dr = 0.0

    if diffr:
        return wdiffs, dr
    return [wdiff for wdiff, wdr in wdiffs]
0748
0749
def line_ediff (lines_old, lines_new, markup=False, format=None, colorize=False,
                diffr=False):
    """
    Create word-level embedded difference between old and new lines of text.

    The difference is computed as by L{line_diff}, but each resulting line
    is returned as text (in which the new segments are wrapped
    as C{{+...+}}, and the old segments as C{{-...-}}), paired with
    the difference ratio for that line if C{diffr} is requested.
    See L{word_diff} and L{word_ediff} for description of keyword parameters.

    @returns: lines with embedded differences and possibly difference ratios
    @rtype: [string...] or ([(string, float)...], float)

    @see: L{line_diff}
    """

    dlists, dr = line_diff(lines_old, lines_new, markup, format, diffr=True)

    dlines = []
    for wdiff, wdr in dlists:
        dlines.append((_assemble_ediff(wdiff, colorize), wdr))

    if diffr:
        return dlines, dr
    return [dline for dline, wdr in dlines]
0770
0771
def line_ediff_to_old (dlines):
    """
    Extract the old version (-) of lines from their embedded differences.

    @param dlines: lines of text with embedded differences
    @type dlines: list of strings

    @returns: old version of the lines
    @rtype: list of strings

    @see: L{line_ediff}
    """

    return _line_ediff_to_oldnew(dlines, word_ediff_to_x=word_ediff_to_old)
0786
0787
def line_ediff_to_new (dlines):
    """
    Extract the new version (+) of lines from their embedded differences.

    @param dlines: lines of text with embedded differences
    @type dlines: list of strings

    @returns: new version of the lines
    @rtype: list of strings

    @see: L{line_ediff}
    """

    return _line_ediff_to_oldnew(dlines, word_ediff_to_x=word_ediff_to_new)
0802
0803
def _line_ediff_to_oldnew (dlines, word_ediff_to_x):
    """
    Apply the recovery function to each line with embedded differences,
    leaving out lines for which it reports C{None}.
    """

    recovered = (word_ediff_to_x(dline) for dline in dlines)
    return [line for line in recovered if line is not None]
0812
0813
def _assemble_ediff (dlist, colorize):
    """
    Assemble text with embedded differences from a tagged difference list.

    New segments are wrapped as C{{+...+}} and old segments as C{{-...-}},
    after escaping any wrappers already present in segment text.
    If C{colorize} is set, new segments are put into blue and
    old segments into red color markup.
    A tilde is appended to the assembled text if any segment tag carries
    the other-text-was-C{None} extension, or else if the assembled text
    itself ends with a tilde (to escape it).

    @param dlist: tagged difference list, as constructed by L{word_diff}
    @type dlist: [(string, string)...]
    @param colorize: whether to colorize differences
    @type colorize: bool

    @returns: text with embedded differences, C{None} for empty list
    @rtype: string/ColorString/None
    """

    if not dlist:
        return None

    dtext = []
    other_none = False
    for segtag, segtext in dlist:
        if segtag.endswith(_tagext_none):
            # Can happen only if there is a single difference segment.
            segtag = segtag[:-_tagext_none_len]
            other_none = True
        segtext = _escape_ewraps(segtext)
        if segtag == _new_tag:
            d = _new_opn + segtext + _new_cls
            if colorize:
                d = ColorString("<blue>%s</blue>") % d
            dtext.append(d)
        elif segtag == _old_tag:
            d = _old_opn + segtext + _old_cls
            if colorize:
                d = ColorString("<red>%s</red>") % d
            dtext.append(d)
        else:
            dtext.append(segtext)
    dtext = cjoin(dtext)

    if other_none:
        # Indicate the other string was none.
        dtext += _tagext_none
    elif dtext.endswith(_tagext_none):
        # Escape any trailing other-none markers.
        dtext += _tagext_none

    return dtext
0851
0852
def _escape_ewraps (text):
    """
    Escape sequences in the text which look like ediff wrappers.
    """

    return _escunesc_ewraps(text, unescape=False)
0856
0857
def _unescape_ewraps (text):
    """
    Unescape escaped wrapper-like sequences in the text.
    """

    return _escunesc_ewraps(text, unescape=True)
0861
0862
# String inserted between the two elements of a wrapper to escape it.
_ediff_esc = _tagext_none
_ediff_esc_len = len(_ediff_esc)

def _escunesc_ewraps (text, unescape):
    """
    Escape or unescape wrapper-like sequences in the text.

    A wrapper-like sequence is the first element of an ediff wrapper,
    followed by any number of escape strings, followed by the second
    element of that wrapper. Escaping inserts one more escape string
    in front of the second element; unescaping removes one, and leaves
    alone the sequences which contain no escape string at all.

    @param text: text to process
    @param unescape: C{True} to unescape, C{False} to escape

    @returns: processed text
    """

    wrapper_pairs = (
        (_old_opnc, _old_vtag),
        (_old_vtag, _old_clsc),
        (_new_opnc, _new_vtag),
        (_new_vtag, _new_clsc),
    )

    # One full pass over the text per wrapper,
    # each pass working on the outcome of the previous one.
    for head, tail in wrapper_pairs:
        head_len = len(head)
        tail_len = len(tail)
        total = len(text)
        pieces = []
        pos = 0
        while True:
            hit = text.find(head, pos)
            if hit < 0:
                pieces.append(text[pos:])
                break
            pieces.append(text[pos:hit])

            # Step over the head and the run of escapes after it.
            pos = hit + head_len
            escapes = 0
            while pos < total and text.startswith(_ediff_esc, pos):
                pos += _ediff_esc_len
                escapes += 1

            completes = pos < total and text.startswith(tail, pos)
            if completes and (escapes > 0 or not unescape):
                if unescape:
                    pieces.append(text[hit:pos - _ediff_esc_len] + tail)
                else:
                    pieces.append(text[hit:pos] + _ediff_esc + tail)
                pos += tail_len
            else:
                # Not a wrapper-like sequence (or nothing to unescape),
                # take what was scanned verbatim.
                pieces.append(text[hit:pos])
        text = "".join(pieces)

    return text
0903
0904
def adapt_spans (otext, ftext, spans, merge=True):
    """
    Map spans matched in filtered text back onto the original text.

    When text is filtered prior to matching, the spans of matches refer
    to the filtered text, while they usually have to be reported against
    the original text. This function heuristically translates such spans,
    based on character-level difference from original to filtered text.

    Spans are given as list of index tuples C{[(start1, end1), ...]},
    where start and end index have standard Python semantics
    (negative indices are counted from the end of the filtered text).
    If C{merge} is C{True}, adapted spans which overlap or abut are merged
    into a single span, empty spans are removed, and the result is ordered
    by increasing start index.

    Span tuples may have more elements after the start and end indices.
    These are not interpreted, only carried over; on merging, the extra
    elements of the frontmost of the merged spans are kept.

    Spans with non-integer start or end index, or with start index
    greater than end index, are appended verbatim to the end of the result.

    @param otext: original text
    @type otext: string
    @param ftext: filtered text
    @type ftext: string
    @param spans: matched spans
    @type spans: list of index tuples
    @param merge: whether to merge overlapping spans
    @type merge: bool

    @returns: adapted spans
    @rtype: list of index tuples
    """

    if not spans:
        return spans

    # Resolve negative indices against the filtered text,
    # and set aside spans with unusable start or end.
    flen = len(ftext)
    usable_spans = []
    rejected_spans = []
    for span in spans:
        start, end = span[:2]
        usable = isinstance(start, int) and isinstance(end, int)
        if usable:
            if start < 0:
                start += flen
            if end < 0:
                end += flen
            usable = start <= end
        if usable:
            usable_spans.append((start, end) + span[2:])
        else:
            rejected_spans.append(span)

    # Character-level difference from original to filtered text.
    dlist = tdiff(otext, ftext)

    # Walk the difference once per span index, tracking positions
    # in both the original and the filtered text.
    aspans = []
    for fspan in usable_spans:
        mapped = []
        for findex, is_start in zip(fspan[:2], (True, False)):
            opos = 0 # position in original text
            opos_ateq = 0 # position in original text after last equal segment
            fpos = 0 # position in filtered text
            aindex = None
            await_eq = False
            for dtag, dseg in dlist:
                seglen = len(dseg)
                if dtag == _new_tag:
                    fpos += seglen
                elif dtag == _old_tag:
                    opos += seglen
                else:
                    opos += seglen
                    fpos += seglen
                    opos_ateq = opos
                    if await_eq:
                        break
                if fpos < findex:
                    continue
                if dtag == _equ_tag:
                    # Back off by the overshoot (0 if char-level diff).
                    aindex = opos - (fpos - findex)
                    break
                if is_start:
                    aindex = opos_ateq
                    break
                # End index inside a non-equal run: extend to next equal.
                await_eq = True
            if await_eq:
                aindex = opos
            if aindex is None:
                break
            mapped.append(aindex)
        if aindex is not None:
            aspans.append(tuple(mapped) + tuple(fspan[2:]))

    # Merge spans if requested.
    if merge:
        # Sort by start index immediately, for priority of extra elements.
        aspans.sort(key=lambda x: x[0])
        merged = []
        while aspans:
            front = aspans[0]
            cstart, cend = front[:2]
            if cstart >= cend:
                # Empty span, drop it.
                aspans = aspans[1:]
                continue
            extras = front[2:]
            remaining = []
            for cand in aspans:
                start, end = cand[:2]
                if start <= cend and cstart <= end:
                    # Overlaps or abuts the current span, absorb it.
                    cstart = min(cstart, start)
                    cend = max(cend, end)
                else:
                    remaining.append(cand)
            aspans = remaining
            merged.append((cstart, cend) + extras)
        merged.sort(key=lambda x: x[0])
        aspans = merged

    # Put invalid spans back.
    aspans.extend(rejected_spans)

    return aspans
1048
1049
# Kinds of message parts with respect to differencing:
# boolean state, single text value, list of text values.
_dt_state, _dt_single, _dt_list = range(3)

# Message parts which get differenced, in order of processing,
# each with its kind.
_msg_diff_parts = (
    ("obsolete", _dt_state),
    ("fuzzy", _dt_state),
    ("manual_comment", _dt_list),
    ("msgctxt_previous", _dt_single),
    ("msgid_previous", _dt_single),
    ("msgid_plural_previous", _dt_single),
    ("msgctxt", _dt_single),
    ("msgid", _dt_single),
    ("msgid_plural", _dt_single),
    ("msgstr", _dt_list),
)
# Kind of each differenced part by part name.
_msg_dpart_types = {part: typ for part, typ in _msg_diff_parts}

# Current fields, and their pairing with the previous counterparts.
_msg_curr_fields = ("msgctxt", "msgid", "msgid_plural")
_msg_currprev_fields = [
    (field, field + "_previous") for field in _msg_curr_fields
]
1070
1071
def msg_diff (msg1, msg2, pfilter=None, addrem=None, diffr=False):
    """
    Difference extraction-invariant parts of two messages at word level.

    For which parts of a message are considered extraction-invariant,
    see description of L{inv<message.Message_base>} instance variable
    of message objects.

    The shape of the return value depends on the C{diffr} parameter.

    If C{diffr} is C{False}, a list of 3-tuples is returned, one tuple
    per differenced message part: (part name, part item, word difference).
    The part name can be used to fetch the part value from the message,
    using L{get()<message.Message_base.get>} method of message objects.
    The part item is C{None} for singular message parts (e.g. C{msgid}),
    and the index for list parts (e.g. C{msgstr}).
    See L{word_diff<diff.word_diff>} for the format
    of word-level difference.

    If C{diffr} is C{True}, each part difference gets a fourth element,
    the difference ratio (see L{word_diff} for its semantics),
    and the total difference ratio is computed as the plain average of
    the partial ones (parts which were equal count with zero difference).
    The return value is then a 2-tuple of the list of part differences
    (as 4-tuples) and the total difference ratio.

    Either of the messages can be given as C{None}. If only one of
    the messages is C{None}, the difference of C{msgid} field will show
    that this field does not exist in the non-existent message (according
    to format of non-existent counterparts of L{word_diff<diff.word_diff>}).

    Every C{msgstr} field can be passed through a filter before
    differencing, using the C{pfilter} parameter.

    Instead of the full difference, only equal, added, or removed segments
    can be reported, using the C{addrem} parameter.
    The first character of its value selects the type of partial
    difference: one of ('=', 'e') for equal, ('+', 'a') for added,
    and ('-', 'r') for removed segments; the rest of the value is
    the separator used to join the selected segments
    (if the separator is empty, space is used instead).

    @param msg1: the message from which to make the difference
    @type msg1: L{Message_base<message.Message_base>} or None
    @param msg2: the message to which to make the difference
    @type msg2: L{Message_base<message.Message_base>} or None
    @param pfilter: filter to be applied to translation prior to differencing
    @type pfilter: callable
    @param addrem: report equal, added or removed segments instead of
        full difference, joined by what follows the selection character
    @type addrem: string
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @return: difference list
    @rtype: [(string, int/None, [(string, string)...])...]
        or ([(string, int/None, [(string, string)...], float)...], float)
    """

    # Thoroughly empty dummy message stands in for a missing message.
    def _or_dummy (msg):

        if msg is not None:
            return msg
        dummy = MessageUnsafe()
        dummy.msgid = None
        dummy.msgstr = []
        return dummy

    msg1 = _or_dummy(msg1)
    msg2 = _or_dummy(msg2)

    # For partial differencing, decide upon which part of diffs to take.
    ar_dtyp = None
    if addrem:
        mode = addrem[0]
        ar_sep = str(addrem[1:] or " ")
        selectors = (
            (("=", "e"), _equ_tag),
            (("+", "a"), _new_tag),
            (("-", "r"), _old_tag),
        )
        for modes, tag in selectors:
            if mode in modes:
                ar_dtyp = tag
                break
        else:
            raise PologyError(
                _("@info",
                  "Unknown selection mode '%(mode)s' for partial differencing.",
                  mode=mode))

    # Diff two texts under the given diffing options.
    def _twdiff (text1, text2, islines=False, cpfilter=None):

        f_diff = line_diff if islines else word_diff

        if cpfilter:
            if islines:
                text1 = [cpfilter(x) for x in text1]
                text2 = [cpfilter(x) for x in text2]
            else:
                text1 = cpfilter(text1)
                text2 = cpfilter(text2)

        mformat = (msg2 or msg1).format
        wdiff, dr = f_diff(text1, text2,
                           markup=True, format=mformat, diffr=True)
        if not addrem:
            return wdiff, dr

        # Reduce to selected segments only; the ratios are inverted too.
        if islines:
            wdiff_part = []
            for wdiff1, dr1 in wdiff:
                picked = [x for t, x in wdiff1 if t == ar_dtyp]
                if text1 or text2:
                    wdiff_part.append((ar_sep.join(picked), 1.0 - dr1))
        else:
            wdiff_part = None
            picked = [x for t, x in wdiff if t == ar_dtyp]
            if text1 is not None or text2 is not None:
                wdiff_part = ar_sep.join(picked)

        return wdiff_part, 1.0 - dr

    # Create diffs of relevant parts.
    part_diffs = []
    sumdr = 0.0
    sumw = 0.0 # ...unless something cleverer comes up, weigh each part same.
    for part, typ in _msg_diff_parts:
        if typ == _dt_single:
            wdiff, dr = _twdiff(msg1.get(part), msg2.get(part))
            found = [(None, wdiff, dr)]
        elif typ == _dt_list:
            lst1 = msg1.get(part)
            lst2 = msg2.get(part)
            cpf = pfilter if part == "msgstr" else None
            wdiffs = _twdiff(lst1, lst2, islines=True, cpfilter=cpf)[0]
            found = [(item, wdiff, dr)
                     for item, (wdiff, dr) in enumerate(wdiffs)]
        elif typ == _dt_state:
            st1 = part if msg1.get(part) else ""
            st2 = part if msg2.get(part) else ""
            wdiff, dr = word_diff(st1, st2, diffr=True)
            found = [(None, wdiff, dr)]
        else:
            raise PologyError(
                _("@info",
                  "Unhandled message part '%(part)s' encountered "
                  "while differencing.",
                  part=part))
        for item, wdiff, dr in found:
            part_diffs.append((part, item, wdiff, dr))
            sumdr += dr * 1.0
            sumw += 1.0

    if not diffr:
        return [x[:3] for x in part_diffs]

    totdr = 0.0
    if sumw:
        totdr = sumdr / sumw or 0.0
    return part_diffs, totdr
1236
1237
# Message field which holds the difference comment.
_dcmnt_field = "auto_comment" # to use manual_comment would be bad idea
# Text with which the difference comment starts.
_dcmnt_head = "ediff:"
# Prepended to ordinary comments which look like the difference comment.
_dcmnt_head_esc = "~" # must be single character
# Separator between indicators in the difference comment.
_dcmnt_sep = ", "
# Separator between an indicator and its arguments.
_dcmnt_asep = " "
# Names of indicators which may appear in the difference comment.
_dcmnt_ind_state = "state"
_dcmnt_ind_ctxtpad = "ctxtpad"
_dcmnt_ind_infsep = "infsep"
_dcmnt_all_inds = ( # ordered
    _dcmnt_ind_state, _dcmnt_ind_ctxtpad, _dcmnt_ind_infsep,
)
# Separator between the original context and the padding added to it.
_ctxtpad_sep = "|"
# Appended after the padding when the message originally had no context.
_ctxtpad_noctxt = "~"
# Characters from which the random padding is drawn.
_ctxtpad_alnums = "abcdefghijklmnopqrstuvwxyz0123456789"
# Building block of the in-field separator between raw and filtered
# differences, and the starting length of the separator in blocks.
_infsep_blk = "~="
_infsep_minlen = 20
1254
def msg_ediff (msg1, msg2, pfilter=None, addrem=None,
               emsg=None, ecat=None, eokpos=None, enoctxt=None,
               emptydc=False, colorize=False, diffr=False):
    """
    Create word-level embedded difference between extraction-invariant
    parts of messages.

    Like L{msg_diff}, but instead of difference list the result is a message
    with embedded differences, of the kind produced by L{word_ediff}.
    See L{msg_diff} for description of C{pfilter} and C{addrem} parameters,
    and L{word_ediff} for the format of embedded differences.
    Additionally, if C{pfilter} is given, C{msgstr} fields will be diffed
    both with and without the filter, and if the two diffs are not equal,
    both embeddings are going to be presented in the field,
    suitably visually separated.

    By default, a new message with embedded difference will be constructed,
    of the type of first non-None of C{msg2} and C{msg1}.
    Alternatively, the difference can be embedded into the message supplied
    by C{emsg} parameter.

    If resulting messages with embedded differences are to be inserted
    into a catalog, that catalog can be given by the C{ecat} parameter.
    Then, if the key of the resulting message would conflict one of
    those already in the catalog, its context will be appropriately padded
    to avoid the conflict.
    This is done by adding a pipe character and an unspecified number
    of alphanumerics (generally junk-looking) to the end of the C{msgctxt}.
    In case the conflict with a particular message in the catalog is
    acceptable (e.g. when resulting message is to be inserted in its place),
    the position of this message can be given by the C{eokpos} parameter.
    In case a certain value of C{msgctxt} should be padded regardless
    of whether there is a conflict or not,
    this value can be given by C{enoctxt} parameter
    (this also works when no catalog is given).

    An additional automatic comment starting with C{ediff:}
    may be added to the message, possibly followed by some indicators
    necessary to complete the difference specification. These include:

      - C{state <STATE_DIFF> ...}: changes in message state, like
        C{obsolete} and C{fuzzy}; e.g. C{state {+obsolete+}} means
        that the message has been obsoleted from C{msg1} to C{msg2},
        while C{state {-obsolete-}} means that it has been revived.

      - C{ctxtpad <STRING>}: padding alphanumerics added to the C{msgctxt}
        field to avoid key collision with one of the messages from C{ecat}.

      - C{infsep <BLOCK> <LENGTH>}: if C{pfilter} was used, this indicator
        states the building block and length in blocks of in-field separators.

    By default the difference comment is not added if there are no indicators,
    but it may be forced by setting C{emptydc} parameter to C{True}.
    Existing comments which look like the difference comment are escaped,
    with their surrounding whitespace kept as it is.

    Embedded differences can be additionally colorized (e.g. for terminal)
    by setting C{colorize} parameter to C{True}.

    If C{diffr} is C{True}, aside from the message with embedded differences,
    the total difference ratio is returned (see L{msg_diff}).
    If C{pfilter} is given, the ratio refers to difference under filter.
    If both messages are C{None}, the result is C{None},
    or C{(None, 0.0)} if C{diffr} is C{True}.

    @param msg1: the message from which to make the difference
    @type msg1: L{Message_base<message.Message_base>} or None
    @param msg2: the message to which to make the difference
    @type msg2: L{Message_base<message.Message_base>} or None
    @param pfilter: filter to be applied to translation prior to differencing
    @type pfilter: callable
    @param addrem: report equal, added or removed segments instead of
        full difference, joined by what follows the selection character
    @type addrem: string
    @param emsg: message to embed the difference to
    @type emsg: L{Message_base<message.Message_base>}
    @param ecat: catalog of messages to avoid key conflict with
    @type ecat: L{Catalog<catalog.Catalog>}
    @param eokpos: position into C{ecat} where key conflict is ignored
    @type eokpos: int
    @param enoctxt: C{msgctxt} string that should be padded unconditionally
    @type enoctxt: string
    @param emptydc: whether to add difference comment even if empty
    @type emptydc: bool
    @param colorize: whether to colorize the difference
    @type colorize: bool
    @param diffr: whether to report difference ratio
    @type diffr: bool

    @return: message with embedded differences (or None)
        and possibly difference ratio
    @rtype: type(emsg or msg2 or msg1 or None) or (type(~), float)
    """

    if msg1 is None and msg2 is None:
        # Nothing to embed; the return shape must still follow diffr.
        return (None, 0.0) if diffr else None

    # Compute the difference.
    wdiffs, totdr = msg_diff(msg1, msg2, addrem=addrem, diffr=True)
    wdiffs_pf = []
    if pfilter:
        wdiffs_pf, totdr = msg_diff(msg1, msg2, pfilter=pfilter,
                                    addrem=addrem, diffr=True)

    # Construct list of embedded diffs out of original difference list.
    if not addrem:
        def mtoe (x):
            return (x[0], x[1], _assemble_ediff(x[2], colorize), x[3])
        ediffs = [mtoe(x) for x in wdiffs]
        ediffs_pf = [mtoe(x) for x in wdiffs_pf]
    else:
        # Partial differences are plain texts already.
        ediffs = wdiffs
        ediffs_pf = wdiffs_pf

    # Construct the message to embed differences into.
    if emsg is None:
        tmsg = msg2 or msg1
        emsg = type(tmsg)()
        for part, typ in _msg_diff_parts:
            tval = tmsg.get(part)
            if tval is not None:
                setattr(emsg, part, type(tval)(tval))

    # Indicators for the difference comment.
    indargs = {}

    # Determine field separator for raw/filtered differences:
    # lengthen it until it appears in none of the embedded differences.
    if ediffs_pf:
        infseplen = _infsep_minlen
        infsepinc = 5
        while True:
            infsep = _infsep_blk * infseplen
            if not any(x[2] and infsep in x[2] for x in ediffs + ediffs_pf):
                break
            infseplen += infsepinc
        indargs[_dcmnt_ind_infsep] = [_infsep_blk, str(infseplen)]

    # Embed differences.
    for i, (part, item, ediff, dr) in enumerate(ediffs):
        typ = _msg_dpart_types[part]
        if typ == _dt_single:
            setattr(emsg, part, ediff)
        elif typ == _dt_list:
            lst = emsg.get(part)
            lst.extend([""] * (item + 1 - len(lst)))
            if ediffs_pf:
                # Filtered differences are indexed same as the raw ones.
                ediff_pf = ediffs_pf[i][2]
                if ediff_pf and ediff_pf != ediff:
                    ediff += "\n" + infsep + "\n" + ediff_pf
            lst[item] = ediff
        elif typ == _dt_state:
            stag, spart = wdiffs[i][2][0]
            if stag != _equ_tag:
                if _dcmnt_ind_state not in indargs:
                    indargs[_dcmnt_ind_state] = []
                indargs[_dcmnt_ind_state].append(ediff)
            sval = bool(stag in (_new_tag, _equ_tag) and spart)
            setattr(emsg, part, sval)
        else:
            raise PologyError(
                _("@info",
                  "Unhandled message part '%(part)s' encountered "
                  "while differencing.",
                  part=part))

    # Pad context to avoid conflicts.
    if (   (ecat is not None and emsg in ecat and ecat.find(emsg) != eokpos)
        or (enoctxt is not None and emsg.msgctxt == enoctxt)
    ):
        noctxtind = _ctxtpad_noctxt if emsg.msgctxt is None else ""
        octxt = emsg.msgctxt or ""
        while True:
            padding = "".join([random.choice(_ctxtpad_alnums)
                               for x in range(5)])
            emsg.msgctxt = octxt + _ctxtpad_sep + padding + noctxtind
            # Padding may have been requested through enoctxt alone,
            # in which case there is no catalog to check against.
            if (    (ecat is None or emsg not in ecat)
                and (enoctxt is None or emsg.msgctxt != enoctxt)
            ):
                break
        indargs[_dcmnt_ind_ctxtpad] = [padding]

    # If any of the existing comments looks like diff comment, escape it.
    ecomments = emsg.get(_dcmnt_field)
    for i in range(len(ecomments)):
        cmnt = ecomments[i]
        scmnt = cmnt.strip()
        p = scmnt.find(_dcmnt_head)
        if p >= 0 and scmnt[:p] == _dcmnt_head_esc * p:
            # Insert the escape at the first non-whitespace character of
            # the comment as it is, so that removing that one character
            # on resolving gives back exactly the original comment.
            nwp = len(cmnt) - len(cmnt.lstrip())
            ecomments[i] = cmnt[:nwp] + _dcmnt_head_esc + cmnt[nwp:]

    # Add diff comment.
    if indargs or emptydc:
        inds = []
        for ind in _dcmnt_all_inds: # to have deterministic ordering
            alst = indargs.get(ind)
            if alst is not None:
                inds.append(cjoin([ind] + alst, _dcmnt_asep))
        dcmnt = _dcmnt_head
        if inds:
            dcmnt += " " + cjoin(inds, _dcmnt_sep)
        ecomments.insert(0, dcmnt)

    return (emsg, totdr) if diffr else emsg
1458
1459
def msg_ediff_to_new (emsg, rmsg=None):
    """
    Resolve message with embedded difference to the newer message.

    A message cannot be properly resolved if the C{addrem} parameter
    to L{msg_ediff} was used when embedding; the result of calling
    this function on such a message is undefined.

    By default a new message object is created. Alternatively,
    an existing message can be given by the C{rmsg} parameter,
    to be filled with all the resolved parts (while keeping its own,
    ignored parts). This message can be the C{emsg} itself.

    If the resolved message evaluates to no message, the function
    returns C{None}, and C{rmsg} is not touched if it was given.

    Any states indicated as added by the difference comment are ignored
    in favor of the actual states of embedded difference message.
    The two sets should normally be equal, but if they are not,
    the actual state in effect overrides the indicated added state.

    @param emsg: resolvable message with embedded differences
    @type emsg: L{Message_base<message.Message_base>} or None
    @param rmsg: message to fill in the resolved parts
    @type rmsg: L{Message_base<message.Message_base>}

    @return: resolved message (or None)
    @rtype: type of first non-None of rmsg, emsg, or None
    """

    resolved = _msg_ediff_to_x(emsg, rmsg, new=True)
    return resolved
1491
1492
def msg_ediff_to_old (emsg, rmsg=None):
    """
    Resolve message with embedded difference to the older message.

    The counterpart of L{msg_ediff_to_new}, constructing the opposite
    message. The one difference is that states indicated as removed by
    the difference comment are never ignored, i.e. they always override
    the actual states.
    See L{msg_ediff_to_new} for parameters and return values.
    """

    resolved = _msg_ediff_to_x(emsg, rmsg, new=False)
    return resolved
1504
1505
def _msg_ediff_to_x (emsg, rmsg, new):
    """
    Resolve message with embedded differences to its new or old version.

    The difference comment is parsed for indicators and removed,
    escaped ordinary comments are unescaped, context padding is removed,
    and every differenced part is resolved to the requested version.
    Resolved parts are set on the target message only after everything
    has been resolved, since the target can be the C{emsg} itself.

    @param emsg: message with embedded differences
    @type emsg: L{Message_base<message.Message_base>} or None
    @param rmsg: message to fill in the resolved parts,
        or C{None} to work on a copy of C{emsg}
    @type rmsg: L{Message_base<message.Message_base>} or None
    @param new: whether to resolve to the new (C{True})
        or to the old (C{False}) version
    @type new: bool

    @return: resolved message, or C{None} if C{emsg} is C{None} or
        the requested version has no C{msgid} or no C{msgstr}
    @raise PologyError: if the context padding stated by
        the difference comment cannot be found in C{msgctxt}
    """

    # No message with differences resolves to no message.
    if emsg is None:
        return None

    if new:
        word_ediff_to_x = word_ediff_to_new
        word_ediff_to_o = word_ediff_to_old
        line_ediff_to_x = line_ediff_to_new
        ignore_state_diff = True
    else:
        word_ediff_to_x = word_ediff_to_old
        word_ediff_to_o = word_ediff_to_new
        line_ediff_to_x = line_ediff_to_old
        ignore_state_diff = False

    # Work on copy if target message not given.
    if rmsg is None:
        rmsg = type(emsg)(emsg)

    # Since rmsg can be emsg itself, collect all attributes to set,
    # and set them in the end.
    atts_vals = []

    # Parse everything out of diff comment,
    # unescape comments which looked like diff comment and were escaped.
    states = {}
    ctxtpad = None
    infsep = None
    cmnts = []
    for cmnt in list(emsg.get(_dcmnt_field)):
        scmnt = cmnt.strip()
        p = scmnt.find(_dcmnt_head)
        if p == 0:
            dcmnt = scmnt[len(_dcmnt_head):]
            # FIXME: Checks for unknown indicators and bad arguments.
            for indargs in dcmnt.split(_dcmnt_sep.strip()):
                lst = indargs.strip().split(_dcmnt_asep)
                ind, args = lst[0], [word_ediff_to_x(x) for x in lst[1:]]
                if ind == _dcmnt_ind_state:
                    # States present in the requested version...
                    for arg in args:
                        if _msg_dpart_types.get(arg) == _dt_state:
                            states[arg] = True
                    # ...and those present only in the opposite version.
                    args_o = [word_ediff_to_o(x) for x in lst[1:]]
                    for arg in args_o:
                        if _msg_dpart_types.get(arg) == _dt_state:
                            states[arg] = False
                elif ind == _dcmnt_ind_ctxtpad:
                    ctxtpad = args[0]
                elif ind == _dcmnt_ind_infsep:
                    infsep = args[0] * int(args[1])
        else:
            if p > 0 and scmnt[:p] == _dcmnt_head_esc * p:
                # Remove the one escape character after leading whitespace.
                nwp = 0
                while cmnt[nwp].isspace():
                    nwp += 1
                cmnt = cmnt[:nwp] + cmnt[nwp + 1:]
            cmnts.append(cmnt)

    listtype = type(rmsg.msgstr)

    # Put back cleaned comments.
    atts_vals.append((_dcmnt_field, listtype(cmnts)))

    # Remove context padding.
    if ctxtpad:
        val = emsg.get("msgctxt")
        # Padding cannot be there if there is no context at all.
        p = val.rfind(ctxtpad) if val is not None else -1
        if (   p < 0
            or val[p - len(_ctxtpad_sep):p] != _ctxtpad_sep
            or val[p + len(ctxtpad):] not in (_ctxtpad_noctxt, "")
        ):
            raise PologyError(_("@info", "Malformed padded context."))
        if val[p + len(ctxtpad):] != _ctxtpad_noctxt:
            val = val[:p - len(_ctxtpad_sep)]
        else:
            val = None
        msgctxt_nopad = val

    # Resolve parts.
    for part, typ in _msg_diff_parts:
        if ctxtpad and part == "msgctxt":
            val = msgctxt_nopad
        else:
            val = emsg.get(part)
        if typ == _dt_single:
            nval = word_ediff_to_x(val)
            if nval is None and part == "msgid":
                return None
            atts_vals.append((part, nval))
        elif typ == _dt_list:
            lst = []
            for el in val:
                if infsep:
                    p = el.find(infsep)
                    if p >= 0: # strip filtered difference
                        # -1 to remove newline in front of the separator;
                        # clamped so that a separator at the very start
                        # does not turn into slicing from the end.
                        el = el[:max(p - 1, 0)]
                lst.append(el)
            nlst = listtype(line_ediff_to_x(lst))
            if nlst == [] and part == "msgstr":
                return None
            atts_vals.append((part, nlst))
        elif typ == _dt_state:
            if not ignore_state_diff:
                val = states.get(part)
                if val is not None:
                    atts_vals.append((part, val))
        else:
            raise PologyError(
                _("@info",
                  "Unhandled message part '%(part)s' encountered "
                  "while resolving difference.",
                  part=part))

    # Set resolved parts for real.
    for att, val in atts_vals:
        setattr(rmsg, att, val)

    return rmsg
1623
1624
def editprob (oldtext, newtext):
    """
    Estimate how likely it is that a human produced the new text
    by editing the old one, rather than by writing it from scratch.

    Plain similarity ratios can come out high for texts which a human
    would not bother to edit one into another, and low for texts which
    a human would. Here each matched segment is additionally weighted
    by how large it is relative to the unmatched gaps around it,
    so that many small scattered matches count for little.

    The shorter text is always diffed against the longer one, so when
    the texts differ in length the order of arguments does not matter;
    when the lengths are equal the new text takes the role of
    the shorter one, and the result may depend on the argument order.

    Equal texts (including both being C{None}) give 1.0;
    if only one of the texts is empty or C{None}, the result is 0.0.

    @param oldtext: candidate for initial text
    @type oldtext: string
    @param newtext: current text
    @type newtext: string

    @returns: the probability of editing the old into the new text [0, 1]
    @rtype: float
    """

    if oldtext == newtext:
        return 1.0
    if not (oldtext and newtext):
        return 0.0

    # Order the texts by length; on a tie the new text counts as shorter.
    if len(oldtext) < len(newtext):
        shorter, longer = oldtext, newtext
    else:
        shorter, longer = newtext, oldtext

    # Matching blocks ordered by position in the shorter text,
    # preceded by an empty block standing for the start of the text.
    # (The matcher itself supplies the empty terminating block.)
    matcher = SequenceMatcher(None, longer, shorter)
    blocks = [(0, 0, 0)]
    blocks.extend(sorted(matcher.get_matching_blocks(),
                         key=lambda blk: blk[1]))

    # Walk over each real block together with its two neighbours.
    weighted = 0.0
    for prev, cur, nxt in zip(blocks, blocks[1:], blocks[2:]):
        matched = cur[2]
        gap_before = cur[1] - (prev[1] + prev[2])
        gap_after = nxt[1] - (cur[1] + cur[2])
        # With the weight fixed to 1 the sum would reduce
        # to the ordinary similarity ratio.
        weight = (float(matched) / (matched + gap_before + gap_after))**2
        weighted += matched * weight
    prob = weighted / len(shorter)

    # Penalize the difference in lengths of the two texts.
    lenratio = float(len(shorter)) / len(longer)
    prob *= 1 - (lenratio - 1)**4

    return prob
1685
1686
def descprob (descpath, ancpath, cutoff=None, getcsz=False):
    """
    Compute the probability that one PO file is a descendant of another.

    Sometimes PO files are renamed, split into two, joined into one,
    also with possible small changes in messages between old and new set.
    This function uses some heuristics to derive the probability
    that the PO file given by C{ancpath} is an ancestor of the PO file
    given by C{descpath}.

    The computation is made commutative by always treating the file
    with less text as the possible descendant, so swapping the two paths
    does not change the probability.

    If the probability cannot be determined because there is nothing
    to compare (the file with less text yields no messages at all,
    or neither file contains any message text), C{None} is returned,
    regardless of C{getcsz}.
    A file which cannot be read is reported by the exception
    raised from the text reader.

    By default, only equality versus non-equality of messages is
    taken into consideration. If C{cutoff} is set to a number 0.0-1.0,
    then fuzzy matching is performed, and partial similarities greater
    than the cutoff are counted into the final probability.
    However, this reduces performance by orders of magnitude
    (the more the lower the cutoff; 0.7-0.8 may be a reasonable tradeoff).

    @param descpath: path to possible descendant PO file
    @type descpath: string
    @param ancpath: path to possible ancestor PO file
    @type ancpath: string
    @param cutoff: the cutoff for fuzzy matching
    @type cutoff: float
    @param getcsz: also report the referent character sizes
        of the first and second file
    @type getcsz: bool

    @returns: the probability of ancestry [0, 1],
        the referent character sizes if requested
    @rtype: C{None} or float or (float, int, int)
    """

    # Read representative texts of messages.
    # Ignore non-unique texts (contexts may have been stripped).
    dtexts = set(_read_msg_texts(descpath))
    atexts = set(_read_msg_texts(ancpath))

    # Make the computation commutative, by always taking
    # the file with less text as possible descendant.
    # The sizes reported on request are those before the swap,
    # i.e. in the order in which the paths were given.
    dtotchar = sum(len(t) for t in dtexts)
    atotchar = sum(len(t) for t in atexts)
    dtotchar_orig, atotchar_orig = dtotchar, atotchar
    if dtotchar > atotchar:
        dtexts, atexts = atexts, dtexts
        dtotchar, atotchar = atotchar, dtotchar

    # Without any descendant texts, or without any characters at all
    # (atotchar is the larger of the two sizes at this point),
    # the ratios below would divide by zero: nothing can be determined.
    if not dtexts or atotchar == 0:
        return None

    # Count how many texts from descendant are in ancestor too.
    # This gives basic probability.
    neq = len(dtexts.intersection(atexts))
    prob = float(neq) / len(dtexts)

    # For each text in descendant not found in ancestor,
    # sum similarity ratios to nearest text in ancestor,
    # and add to the probability.
    if cutoff is not None:
        sumsim = 0.0
        for dt in dtexts.difference(atexts):
            seqm = SequenceMatcher()
            seqm.set_seq2(dt)
            maxsim = 0.0
            for at in atexts:
                seqm.set_seq1(at)
                # Both quick ratios are upper bounds on the real ratio,
                # so a candidate which by them cannot beat the cutoff,
                # or the best similarity found so far, can be skipped
                # without influencing the result.
                bound = max(cutoff, maxsim)
                if seqm.real_quick_ratio() <= bound:
                    continue
                if seqm.quick_ratio() <= bound:
                    continue
                sim = seqm.ratio()
                if sim > cutoff:
                    maxsim = max(maxsim, sim)
            sumsim += maxsim
        prob += sumsim / len(dtexts)

    # Correct probability for small files.
    # This is necessary due to enforced commutativity above.
    limtotchar = 100 # e.g. 10 messages with 2 words (10 characters) each
    if dtotchar < limtotchar:
        prob *= (float(dtotchar) / atotchar)**0.5

    if getcsz:
        return prob, dtotchar_orig, atotchar_orig
    else:
        return prob
1773
1774
def _read_msg_texts (path):
    """
    Quickly collect the raw C{msgid} texts from a PO file.

    Each C{msgid} field, together with its quoted continuation lines,
    is joined into a single string, with only the enclosing quotes
    of each line removed (escape sequences are left as they are).
    Only lines which start with C{msgid} followed by a space open
    a new text, so plural, previous and obsolete fields are not collected.

    @param path: path to the PO file
    @type path: string

    @returns: raw msgid texts, in order of appearance in the file
    @rtype: [string*]

    @raises PologyError: if the file cannot be read
    """

    # NOTE: This function needs to be as fast as possible,
    # so instead of using file.Catalog, the file is manually parsed
    # to the necessary minimum.
    # It is more important to be fast than correct,
    # so parsing ignores some valid but highly unusual PO formatting.

    # NOTE: Intentionally ignoring: file encoding, escaping, msgctxt.
    # NOTE(review): the file is opened in text mode without an explicit
    # encoding, so decoding follows the interpreter default -- confirm
    # that this is acceptable for the PO files being compared.

    # Close the file deterministically, and let interpreter-level
    # signals (keyboard interrupt, system exit) pass through
    # instead of reporting them as an unreadable file.
    try:
        with open(path) as pofile:
            lines = pofile.readlines()
    except Exception:
        raise PologyError(
            _("@info",
              "Cannot read file '%(file)s'.",
              file=path))

    msgids = []
    segs = []
    inmsgid = False
    for line in lines:
        line = line.strip()
        if line.startswith("msgid "):
            # Start of a new text; the rest of the line is its first segment.
            segs = []
            line = line[5:].strip()
            inmsgid = True
        elif inmsgid and not line.startswith("\""):
            # Anything but a quoted continuation line ends the text.
            msgids.append("".join(segs))
            inmsgid = False
        if inmsgid:
            segs.append(line[1:-1]) # strip quotes

    # A text running up to the end of file has no line following it
    # to close it in the loop, so close it here.
    if inmsgid:
        msgids.append("".join(segs))

    return msgids
1810