lang/sr/wconv.py

0001 # -*- coding: UTF-8 -*
0002
0003 """
0004 Conversions between scripts and dialects in Serbian.
0005
0006 Serbian standard literary language can be written in two dialects,
0007 Ekavian and Ijekavian, and two scripts, Cyrillic and Latin.
0008 Dialects and scripts can be freely combined, resulting in four
0009 official writing standards: Ekavian Cyrillic, Ekavian Latin,
0010 Ijekavian Cyrillic, and Ijekavian Latin.
0011 Some automatic and semi-automatic conversions between them are possible.
0012
0013
0014 Script Transliteration
0015 ======================
0016
0017 For plain text containing only Serbian words (including well adapted loans),
0018 it is trivial to transliterate from Cyrillic to Latin script.
0019 It is only necessary to take care when converting Cyrillic Љ, Њ, Џ into
0020 Latin digraphs Lj, Nj, Dž, because sometimes they should be full upper-case
0021 (e.g. Љубљана→Ljubljana, ЉУБЉАНА→LJUBLJANA).
0022 But this is easily algorithmically resolvable, by checking if
0023 the previous or the next letter are upper-case too.
0024
0025 To transliterate from Latin to Cyrillic is somewhat harder, because
0026 in rare cases digraphs nj, lj, dž do not stand for single letters, but for
0027 two separate letters; i.e. they do not map to Cyrillic љ, њ, џ, but to лј, нј, дж
0028 (dablju→даблју, konjunkcija→конјункција, nadživeti→надживети).
0029 The only way to handle this is by having a dictionary of special cases.
0030
0031 Furthermore, in today's practice texts are rarely clean as assumed above.
0032 They are frequently riddled with foreign Latin phrases (such as proper names),
0033 quasiphrases (such as electronic addresses), and constructive elements
0034 (such as markup tags). On the other hand, foreign Cyrillic phrases are
0035 quite infrequent (may be found e.g. in texts on linguistic topics).
0036 This means that in practice transliteration from Cyrillic to Latin
0037 remains straightforward, but from Latin to Cyrillic decidedly not so.
0038
0039
0040 Script Hybridization
0041 ====================
0042
0043 Sometimes the result of direct transliteration from Cyrillic to Latin
0044 is against the established Latin practice in a certain field,
0045 even if valid according to official orthography.
0046 Then it becomes necessary to specially handle some parts of the text
0047 (e.g. transliterations or lack thereof of foreign proper names).
0048
0049 Alternatives directives are a way to compose "hybrid" Cyrillic-Latin text,
0050 out of which both ordinary Cyrillic and non-directly transliterated Latin
0051 texts can be automatically derived.
0052 For example, this hybrid text::
0053
0054     Различите ~@/линукс/Linux/ дистрибуције...
0055
0056 can be automatically resolved into::
0057
0058     Различите линукс дистрибуције...
0059     Različite Linux distribucije...
0060
0061 String C{~@} is the head of alternatives directive.
0062 It is followed by a single character, which is then used to delimit
0063 Cyrillic and Latin parts, in that order, out of surrounding text.
0064 (For all details on format of alternatives directives, see
0065 L{resolve_alternatives()< pology.resolve.resolve_alternatives>}).
0066 Transliteration from Cyrillic to Latin is performed only on text
0067 outside of alternatives directives.
0068
0069
0070 Dialect Hybridization
0071 =====================
0072
0073 Both Ekavian and Ijekavian dialects may be represented within a single text.
0074 Such hybrid text is basically Ijekavian, but jat-reflexes are marked
0075 by inserting one of the jat-reflex ticks C{›}, C{‹}, C{▹}, C{◃}::
0076
0077     Д‹ио б‹иљежака о В›јештичјој р›ијеци.
0078
0079 Clean Ijekavian text is then obtained by just removing jat-reflex ticks
0080 preceding valid jat-reflexes, and Ekavian by applying the jat-reflex map::
0081
0082     Дио биљежака о Вјештичјој ријеци.
0083     Део бележака о Вештичјој реци.
0084
0085 The jat-reflex mapping rules are as follows, grouped by tick:
0086   - ›ије→е, ›је→е
0087   - ‹иј→еј, ‹иљ→ел, ‹ио→ео, ‹ље→ле, ‹ње→не
0088   - ▹ије→и, ▹је→и
0089   - ◃ијел→ео, ◃ијен→ењ, ◃ит→ет, ◃ил→ел, ◃јел→ео, ◃тн→тњ, ◃шње→сне
0090
0091 For very rare special cases, it is possible to directly provide
0092 different forms for Ekavian and Ijekavian, in that order,
0093 by using alternatives directive::
0094
0095     Гд›је с' ~#/то/ба/ пошо̑?
0096
0097 Compared to alternatives directives for scripts, the only difference is
0098 that here the directive head is C{~#}.
0099 Alternatives directives for script and dialect can thus be mixed
0100 without conflicts, in single text and even interwoven
0101 (when interweaving, different delimiters must be used).
0102
0103
0104 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0105 @license: GPLv3
0106 """
0107
0108 from pology import PologyError, _, n_
0109 from pology.diff import word_diff, tdiff
0110 from pology.report import warning, format_item_list
0111 from pology.resolve import resolve_alternatives_simple
0112 from pology.resolve import resolve_alternatives
0113
0114
# Transliteration table Serbian Cyrillic->Latin.
# Upper-case letters that map to digraphs are given in title case (Lj, Nj, Dž);
# _ctol_w() turns them into full upper-case when a neighboring letter
# is upper-case too.
_dict_c2l = {
    'а':'a', 'б':'b', 'в':'v', 'г':'g', 'д':'d', 'ђ':'đ',
    'е':'e', 'ж':'ž', 'з':'z', 'и':'i', 'ј':'j', 'к':'k',
    'л':'l', 'љ':'lj','м':'m', 'н':'n', 'њ':'nj','о':'o',
    'п':'p', 'р':'r', 'с':'s', 'т':'t', 'ћ':'ć', 'у':'u',
    'ф':'f', 'х':'h', 'ц':'c', 'ч':'č', 'џ':'dž','ш':'š',
    'А':'A', 'Б':'B', 'В':'V', 'Г':'G', 'Д':'D', 'Ђ':'Đ',
    'Е':'E', 'Ж':'Ž', 'З':'Z', 'И':'I', 'Ј':'J', 'К':'K',
    'Л':'L', 'Љ':'Lj','М':'M', 'Н':'N', 'Њ':'Nj','О':'O',
    'П':'P', 'Р':'R', 'С':'S', 'Т':'T', 'Ћ':'Ć', 'У':'U',
    'Ф':'F', 'Х':'H', 'Ц':'C', 'Ч':'Č', 'Џ':'Dž','Ш':'Š',
    # accented NFC:
    'ѐ':'è', 'ѝ':'ì', 'ӣ':'ī', 'ӯ':'ū',
    'Ѐ':'È', 'Ѝ':'Ì', 'Ӣ':'Ī', 'Ӯ':'Ū',
    # frequent accented from NFD to NFC (keys now 2-char):
    # each key is a base letter followed by a combining accent.
    'а̂':'â', 'о̂':'ô', 'а̑':'ȃ', 'о̑':'ȏ',
}

# Transliteration table Serbian Cyrillic->ASCII, basic stripped.
# Starts as a copy of the Cyrillic->Latin table; only the letters whose
# Latin counterparts are outside of ASCII are overridden.
_dict_c2a_stripped = _dict_c2l.copy()
_dict_c2a_stripped.update({
    'ђ':'dj', 'ж':'z', 'ћ':'c', 'ч':'c', 'џ':'dz', 'ш':'s',
    'Ђ':'Dj', 'Ж':'Z', 'Ћ':'C', 'Ч':'C', 'Џ':'Dz', 'Ш':'S',
})

# Transliteration table Serbian Latin->ASCII, basic stripped.
_dict_l2a_stripped = {
    'đ':'dj', 'ž':'z', 'ć':'c', 'č':'c', 'š':'s',
    'Đ':'Dj', 'Ž':'Z', 'Ć':'C', 'Č':'C', 'Š':'S',
}

# Transliteration table Serbian any->ASCII, basic stripped.
# Union of the Cyrillic->ASCII and Latin->ASCII tables.
_dict_cl2a_stripped = {}
_dict_cl2a_stripped.update(_dict_c2a_stripped)
_dict_cl2a_stripped.update(_dict_l2a_stripped)

# Transliteration table English in Serbian Cyrillic->Latin, by keyboard layout.
# Starts as a copy of the Cyrillic->Latin table; the overridden letters
# map to q, w, y, x.
_dict_c2a_englay = _dict_c2l.copy()
_dict_c2a_englay.update({
    'љ':'q', 'њ':'w', 'ж':'y', 'џ':'x',
    'Љ':'Q', 'Њ':'W', 'Ж':'Y', 'Џ':'X',
})
0158
0159
def ctol(text):
    """
    Convert Cyrillic text into proper Latin script [type F1A hook].

    @param text: text to transliterate
    @type text: string

    @returns: transliterated text
    @rtype: string
    """

    table = _dict_c2l
    return _ctol_w(text, table)
0166
0167
def cltoa(text):
    """
    Convert Cyrillic or Latin text into stripped ASCII [type F1A hook].

    @param text: text to transliterate
    @type text: string

    @returns: transliterated text
    @rtype: string
    """

    table = _dict_cl2a_stripped
    return _ctol_w(text, table)
0175
0176
def ectol(text):
    """
    Convert English text typed in Cyrillic by keyboard layout
    into proper English [type F1A hook].

    @param text: text to transliterate
    @type text: string

    @returns: transliterated text
    @rtype: string
    """

    table = _dict_c2a_englay
    return _ctol_w(text, table)
0184
0185
def _ctol_w(text, trdict):
    """
    Transliterate text through the given transliteration table.

    Characters not found in the table are passed through unchanged.

    @param text: text to transliterate
    @type text: string
    @param trdict: transliteration table, with keys of one or two characters
    @type trdict: {string: string}

    @returns: transliterated text
    @rtype: string
    """

    return "".join([piece for piece, nsrc in _ctol_pieces(text, trdict)])


def _ctol_pieces(text, trdict):
    """
    Transliterate text piecewise, keeping track of consumed source characters.

    A two-character key (base letter followed by combining accent)
    takes precedence over the single-character key of its base letter,
    and consumes both characters, so that the accent is not output
    once more on its own after the accented replacement.

    A replacement longer than one character (a digraph) for an upper-case
    letter is fully upper-cased if the following or the preceding
    character in the text is upper-case too.

    @param text: text to transliterate
    @type text: string
    @param trdict: transliteration table, with keys of one or two characters
    @type trdict: {string: string}

    @returns: transliterated pieces, each with the number of characters
        of the original text it was produced from
    @rtype: [(string, int)...]
    """

    tlen = len(text)
    pieces = []
    i = 0
    while i < tlen:
        c = text[i]
        c2 = text[i:i + 2]
        nsrc = 1
        r = trdict.get(c)
        if len(c2) == 2 and trdict.get(c2):
            r = trdict[c2]
            nsrc = 2
        if r is None:
            # Not a known character, pass it through.
            r = c
        elif len(r) > 1 and c.isupper():
            inext = i + nsrc
            if (   (inext < tlen and text[inext].isupper())
                or (i > 0 and text[i - 1].isupper())
            ):
                r = r.upper()
        pieces.append((r, nsrc))
        i += nsrc

    return pieces


# Head of alternatives directives for script.
_shyb_althead = "~@"


def hctoc(text):
    """
    Resolve hybrid Cyrillic text with script alternatives into
    plain Cyrillic text [type F1A hook].

    The first of the two alternatives in each directive is selected.
    """

    return resolve_alternatives_simple(text, 1, 2, althead=_shyb_althead)


def hctol(text):
    """
    Resolve hybrid Cyrillic text with script alternatives into
    plain Latin text [type F1A hook].

    The second of the two alternatives in each directive is selected,
    and L{ctol} is passed as the output filter.
    """

    return resolve_alternatives_simple(text, 2, 2, althead=_shyb_althead,
                                       outfilter=ctol)


def hctocl(htext):
    """
    Resolve hybrid Cyrillic-Latin text into clean Cyrillic and clean Latin.

    @param htext: hybrid text
    @type htext: string

    @returns: Cyrillic and Latin texts
    @rtype: (string, string)
    """

    return hctoc(htext), hctol(htext)


def cltoh(textc, textl, delims="/|¦", full=False):
    """
    Construct hybrid Cyrillic text out of clean Cyrillic and Latin texts.

    Hybridization is performed by inserting alternatives directives
    for parts which cannot be resolved by direct transliteration.
    If C{full} is set to C{True}, complete texts are unconditionally
    wrapped into single alternatives directive.

    @param textc: Cyrillic text
    @type textc: string
    @param textl: Latin text
    @type textl: string
    @param delims: possible delimiter characters
    @type delims: string
    @param full: whether to wrap full texts as single alternatives directive
    @type full: bool

    @returns: hybrid Cyrillic text
    @rtype: string
    """

    if full:
        return _shyb_althead + _delimit([textc, textl], delims)

    # Transliterate piecewise, to be able to map any offset in the
    # transliterated text back to the offset in the Cyrillic text
    # (the two differ in length where a letter becomes a digraph,
    # or where a letter with combining accent becomes a single character).
    pieces = _ctol_pieces(textc, _dict_c2l)
    textcl = "".join([piece for piece, nsrc in pieces]) # same as ctol(textc)
    cyrat = [0] # cyrat[p]: offset in textc for offset p in textcl
    cpos = 0
    for piece, nsrc in pieces:
        cpos += nsrc
        if piece:
            # Offsets strictly inside a piece map to its start.
            cyrat.extend([cpos - nsrc] * (len(piece) - 1))
            cyrat.append(cpos)
        else:
            cyrat[-1] = cpos
    nlat = len(textcl)

    # NOTE(review): it is assumed that equal and removed segments reported
    # by word_diff() concatenate back to its first argument -- confirm.
    wdiff = word_diff(textcl, textl)
    nseg = len(wdiff)
    segs = []
    lpos = 0 # current offset in textcl
    i = 0
    while i < nseg:
        tag, seg = wdiff[i]
        i += 1
        if tag == " ":
            removed = seg
            added = None
        else:
            removed = seg if tag == "-" else ""
            added = seg if tag == "+" else ""
            # Take the next segment as the counterpart only if
            # it has the opposite difference tag.
            if i < nseg and wdiff[i][0] not in (" ", tag):
                if tag == "-":
                    added = wdiff[i][1]
                else:
                    removed = wdiff[i][1]
                i += 1
        lend = min(lpos + len(removed), nlat)
        segc = textc[cyrat[lpos]:cyrat[lend]]
        lpos = lend
        if added is None:
            segs.append(segc)
        else:
            segs.append(_shyb_althead + _delimit([segc, added], delims))

    return "".join(segs)
0298
0299
_padc_chr = "\u0004"
_padc_alphas = ("љ", "њ", "џ", "Љ", "Њ", "Џ")

def _padc(text):
    """
    Insert the padding character before every Cyrillic letter
    listed in C{_padc_alphas}, so that each such letter takes up
    two characters in the text.

    @param text: text to pad
    @type text: string

    @returns: padded text
    @rtype: string
    """

    padded = text
    for letter in _padc_alphas:
        padded = (_padc_chr + letter).join(padded.split(letter))
    return padded
0308
def _unpadc(text):
    """
    Remove the padding character in front of every Cyrillic letter
    listed in C{_padc_alphas}.

    @param text: padded text
    @type text: string

    @returns: text without padding
    @rtype: string
    """

    stripped = text
    for letter in _padc_alphas:
        stripped = letter.join(stripped.split(_padc_chr + letter))
    return stripped
0314
0315
# Ijekavian to Ekavian map (Latin script and letter cases derived afterwards).
# Each element is a pair of jat-reflex tick and the map of Ijekavian
# to Ekavian forms which that tick can precede.
# All Ijekavian-Ekavian form pairs have to be unique across all groups.
# Within a group, one Ijekavian form must not be in the prefix of another.
_reflex_spec = (
    ("›", {
        "ије": "е", # ријеци → реци
        "је": "е", # вјештичјој → вештичјој
    }),
    ("‹", {
        "иј": "еј", # гријати → грејати
        "иљ": "ел", # биљешка → белешка
        "ио": "ео", # дио → део
        "ље": "ле", # љето → лето
        "ње": "не", # гњев → гнев
    }),
    ("▹", {
        "ије": "и", # налијевати → наливати
        "је": "и", # утјецај → утицај
    }),
    ("◃", {
        "ијел": "ео", # бијел → бео
        "ијен": "ењ", # лијен → лењ
        "ил": "ел", # вриједила → вредела
        "ит": "ет", # вриједити → вредети
        "јел": "ео", # одјел → одео
        "тн": "тњ", # љетни → летњи
        "шње": "сне", # побјешњели → побеснели
    }),
)
0345
def _derive_reflex_specs(reflex_spec):
    """
    Derive data for dehybridization and hybridization from
    the basic jat-reflex specification.

    Maps of Ijekavian to Ekavian forms in the specification are
    updated in place with derived Latin, capitalized and upper-case forms.

    @param reflex_spec: pairs of jat-reflex tick and map of
        Ijekavian to Ekavian forms
    @type reflex_spec: ((string, {string: string})...)

    @returns: dehybridization data as list of
        C{(tick, refmap, ijklen_min, ijklen_max)}, and
        hybridization data as sorted list of
        C{(btrk, ekvlen, ijklen, {ijkfrm: [(ekvfrm, tick)...]})}
    @rtype: ([(string, dict, int, int)...], [(int, int, int, dict)...])
    """

    reflex_spec_dehyb = []
    hyb_by_key = {}
    for tick, refmap in reflex_spec:
        # Derive data for dehybridization.
        # Forms are derived cumulatively: first Latin forms (must be done
        # before other cases), then forms with first letter in uppercase,
        # then forms with all letters in uppercase.
        for derive in (ctol, str.capitalize, str.upper):
            refmap.update([(derive(ijkfrm), derive(ekvfrm))
                           for ijkfrm, ekvfrm in refmap.items()])
        # Compute minimum and maximum reflex lengths.
        ijklens = [len(ijkfrm) for ijkfrm in refmap]
        reflex_spec_dehyb.append((tick, refmap, min(ijklens), max(ijklens)))

        # Derive data for hybridization:
        # {(btrk, ekvlen, ijklen): {ijkfrm: [(ekvfrm, tick)...]}}
        for ijkfrm, ekvfrm in refmap.items():
            # Compute backtracking from position of jat-reflex difference,
            # i.e. the length of the common prefix of the two forms.
            btrk = 0
            while (    btrk < len(ijkfrm) and btrk < len(ekvfrm)
                   and ijkfrm[btrk] == ekvfrm[btrk]
            ):
                btrk += 1
            pkey = (btrk, len(ekvfrm), len(ijkfrm))
            frmmap = hyb_by_key.setdefault(pkey, {})
            frmmap.setdefault(ijkfrm, []).append((ekvfrm, tick))

    # Convert hybridization data into list of tuples.
    # Sort such that on hybridization reflexes are tried by
    # increasing backtrack,
    # decreasing smaller length of the two reflexes,
    # decreasing greater length of the two reflexes.
    pkeys = sorted(hyb_by_key,
                   key=lambda x: (x[0], -min(x[1], x[2]), -max(x[1], x[2])))
    reflex_spec_hyb = [k + (hyb_by_key[k],) for k in pkeys]

    return reflex_spec_dehyb, reflex_spec_hyb
0390
# Jat-reflex data derived from the basic specification:
# _reflex_spec_dehyb is used when resolving hybrid text (see _hito_w),
# _reflex_spec_hyb when constructing hybrid text (see tohi).
_reflex_spec_dehyb, _reflex_spec_hyb = _derive_reflex_specs(_reflex_spec)

# Head of alternatives directives for dialect.
_dhyb_althead = "~#"
0395
0396
def hitoe(text):
    """
    Resolve hybrid Ijekavian text with jat-reflex ticks and
    dialect alternatives into plain Ekavian text [type F1A hook].
    """

    return _hito_w(text, toijek=False, silent=False)
0404
0405
def hitoeq(text):
    """
    Like L{hitoe}, but does not output warnings on problems [type F1A hook].
    """

    return _hito_w(text, toijek=False, silent=True)
0412
0413
def hitoi(text):
    """
    Resolve hybrid Ijekavian text with jat-reflex ticks and
    dialect alternatives into plain Ijekavian text [type F1A hook].
    """

    return _hito_w(text, toijek=True, silent=False)
0421
0422
def hitoiq(text):
    """
    Like L{hitoi}, but does not output warnings on problems [type F1A hook].
    """

    return _hito_w(text, silent=True, toijek=True)
0429
0430
def _hito_w(text, toijek=False, silent=False, validate=False):
    """
    Resolve or validate dialect-hybrid text.

    Jat-reflex ticks are resolved first, one tick type at a time,
    and dialect alternatives directives after that.

    @param text: hybrid text
    @type text: string
    @param toijek: whether to resolve into Ijekavian instead of Ekavian
    @type toijek: bool
    @param silent: whether to suppress warnings on problems
    @type silent: bool
    @param validate: whether to return the list of error spans
        instead of the resolved text
    @type validate: bool

    @returns: resolved text, or error spans if validating
    @rtype: string or [(int, int, string)...]
    """

    errspans = [] if validate else None

    for tick, refmap, lenmin, lenmax in _reflex_spec_dehyb:
        text = _hito_w_simple(text, tick, refmap, lenmin, lenmax,
                              toijek, silent, errspans)

    # Source name is given only when problems are to be reported directly.
    if silent or validate:
        srcname = None
    else:
        srcname = "<text>"
    # Ekavian form comes first in the directive, Ijekavian second.
    selalt = 2 if toijek else 1
    resolved = resolve_alternatives(text, selalt, 2,
                                    althead=_dhyb_althead,
                                    srcname=srcname)
    text, ngood, allgood = resolved

    if not validate:
        return text

    if not allgood:
        errmsg = n_("@info \"alternatives directive\" is a term",
                    "Malformed Ekavian-Ijekavian alternatives directive "
                    "encountered after %(num)d good directive.",
                    "Malformed Ekavian-Ijekavian alternatives directive "
                    "encountered after %(num)d good directives.",
                    num=ngood)
        errspans.append((0, 0, errmsg))
    return errspans
0456
0457
def _hito_w_simple(text, tick, refmap, ijklen_min, ijklen_max,
                   toijek, silent, errspans):
    """
    Resolve jat-reflexes marked by a single type of tick.

    A tick which is not followed by a letter is left in the text as it is.
    A tick followed by a letter, but not by a known Ijekavian form,
    is left in the text too, and reported as a problem.

    @param text: hybrid text
    @type text: string
    @param tick: jat-reflex tick to resolve
    @type tick: string
    @param refmap: map of Ijekavian to Ekavian forms for the tick
    @type refmap: {string: string}
    @param ijklen_min: length of the shortest Ijekavian form in the map
    @type ijklen_min: int
    @param ijklen_max: length of the longest Ijekavian form in the map
    @type ijklen_max: int
    @param toijek: whether to resolve into Ijekavian instead of Ekavian
    @type toijek: bool
    @param silent: whether to suppress warnings on problems
    @type silent: bool
    @param errspans: list to add error spans to, or C{None}
    @type errspans: [(int, int, string)...] or None

    @returns: text with known jat-reflexes of the tick resolved
    @rtype: string
    """

    ticklen = len(tick)
    textlen = len(text)
    out = []
    pos = 0
    while True:
        tpos = text.find(tick, pos)
        if tpos < 0:
            out.append(text[pos:])
            break
        out.append(text[pos:tpos])
        pos = tpos + ticklen
        if pos >= textlen or not text[pos:pos + 1].isalpha():
            out.append(tick)
            continue

        # Try Ijekavian forms after the tick from shortest to longest.
        ijkfrm = None
        ekvfrm = None
        for ijklen in range(ijklen_min, ijklen_max + 1):
            ijkfrm = text[pos:pos + ijklen]
            ekvfrm = refmap.get(ijkfrm)
            if ekvfrm is not None:
                break

        if ekvfrm is None:
            out.append(tick)
            errmsg = _("@info \"jat\" is the name of an old Serbian letter",
                       "Unknown jat-reflex starting from '%(snippet)s'.",
                       snippet=text[tpos:tpos + 20])
            if not silent:
                warning(errmsg)
            if errspans is not None:
                errspans.append((tpos, tpos + ijklen_max, errmsg))
            continue

        out.append(ijkfrm if toijek else ekvfrm)
        pos += len(ijkfrm)

    return "".join(out)
0497
0498
def validate_dhyb(text):
    """
    Check whether dialect-hybrid text is valid [type V1A hook].
    """

    errspans = _hito_w(text, validate=True, silent=True)
    return errspans
0505
0506
def hitoei(htext):
    """
    Resolve hybrid Ijekavian-Ekavian text into clean Ekavian and Ijekavian.

    @param htext: hybrid text
    @type htext: string

    @returns: Ekavian and Ijekavian text
    @rtype: (string, string)
    """

    ekavian = hitoe(htext)
    ijekavian = hitoi(htext)
    return ekavian, ijekavian
0519
0520
def tohi (text1, text2, ekord=None, delims="/|¦", parthyb=False):
    """
    Construct hybrid Ijekavian text out of Ekavian and Ijekavian texts.

    Hybridization is performed by merging Ekavian and Ijekavian forms
    into Ijekavian forms with inserted jat-reflex ticks.
    Input texts can be both in Cyrillic and Latin, and piecewise so.
    Texts also do not have to be clean Ekavian and Ijekavian,
    as hybridization is performed only at difference segments.
    Order of text arguments is not important as long as all difference
    segments can be merged (i.e. the function is commutative in that case).

    If a difference segment cannot be merged by jat-reflex ticks,
    then the resolution depends on C{ekord} parameter.
    If it is C{None}, then the segment of C{text2} is taken into result.
    If it is C{1} or C{2}, then the segments of C{text1} and C{text2}
    are combined in a dialect alternatives directive (C{~#/.../.../});
    the number determines which segment is put first in the directive
    (i.e. considered Ekavian), that of C{text1} or of C{text2}.
    Any other value of C{ekord} leads to undefined behavior.

    It is possible that input texts are already partially hybridized,
    and only some parts of them need to be additionally hybridized.
    Setting C{parthyb} to C{True} will tell the function to detect
    and skip already hybridized segments, and hybridize only the rest.

    @param text1: first text
    @type text1: string
    @param text2: second text
    @type text2: string
    @param ekord: enumerates the text to be considered Ekavian
        when adding alternatives directives
    @type ekord: None, 1, 2
    @param delims: possible delimiter characters for alternatives directives
    @type delims: string
    @param parthyb: whether input texts are already partially hybridized
    @type parthyb: bool

    @returns: hybrid Ijekavian text
    @rtype: string
    """

    # i1, i2 are current positions in the two texts;
    # i1p, i2p are positions up to which the texts have been
    # taken into the result.
    len1 = len(text1); len2 = len(text2)
    i1 = 0; i1p = 0; i2 = 0; i2p = 0
    segs = []
    while True:
        # Advance through both texts while they are equal.
        while i1 < len1 and i2 < len2 and text1[i1] == text2[i2]:
            if not parthyb:
                i1 += 1
                i2 += 1
            else:
                # Each text steps over its own hybridized segment,
                # so the two steps may differ in length.
                i1 += _step_over_hyb(text1, i1)
                i2 += _step_over_hyb(text2, i2)
        if i1 == len1 and i2 == len2:
            segs.append(text1[i1p:]) # same as text2[i2p:]
            break
        # Try to hybridize difference by jat-reflex ticks.
        # Both assignments of Ekavian/Ijekavian roles to the texts are tried.
        tick = None
        for texte, texti, ie, ii, order12 in (
            (text1, text2, i1, i2, True),
            (text2, text1, i2, i1, False),
        ):
            # NOTE(review): frms is not used anywhere below in this function.
            frms = []
            for btrk, lene, leni, refmap in _reflex_spec_hyb:
                # Go back to the start of the reflex forms,
                # which may share some letters before the difference.
                ieb = ie - btrk
                iib = ii - btrk
                if ieb < 0 or iib < 0:
                    continue
                frme = texte[ieb:ieb + lene]
                frmi = texti[iib:iib + leni]
                for cfrme, ctick in refmap.get(frmi, []):
                    if cfrme == frme:
                        tick = ctick
                        break
                if tick: break
            if tick: break
        if tick:
            # Hybridization by difference marks possible.
            # Loop variables btrk, lene, leni, frmi and order12 still hold
            # the values from the iteration in which the tick was found.
            segs.append(text1[i1p:i1 - btrk]) # same as text2[i2p:i2 - btrk]
            segs.append(tick + frmi)
            i1p = i1 - btrk + (lene if order12 else leni)
            i2p = i2 - btrk + (leni if order12 else lene)
        else:
            # Hybridization by difference marks not possible.
            # Use alternatives directive, or pure Ijekavian.
            i1b = i1; i2b = i2
            # NOTE(review): this loop goes back while exactly one of the two
            # preceding characters is a letter. When parthyb is False the
            # preceding characters were just compared equal, so the sum is
            # 0 or 2 and the loop body does not run -- confirm whether
            # going back to the word start (both letters) was intended.
            while (    i1b > i1p and i2b > i2p
                   and (text1[i1b - 1].isalpha() + text2[i2b - 1].isalpha() == 1)
            ):
                i1b -= 1; i2b -= 1
            segs.append(text1[i1p:i1b])
            # Collect leading difference segments, up to the first
            # equal segment, for each of the texts.
            wdiff = word_diff(text1[i1b:], text2[i2b:])
            frm1s = []
            frm2s = []
            while wdiff and wdiff[0][0] != " ":
                tag, seg = wdiff.pop(0)
                if tag != "+":
                    frm1s.append(seg)
                if tag != "-":
                    frm2s.append(seg)
            frm1 = "".join(frm1s)
            frm2 = "".join(frm2s)
            i1p = i1b + len(frm1)
            i2p = i2b + len(frm2)
            if ekord == 1:
                segs.append(_dhyb_althead + _delimit([frm1, frm2], delims))
            elif ekord == 2:
                segs.append(_dhyb_althead + _delimit([frm2, frm1], delims))
            else:
                segs.append(frm2)
        # Continue comparison after the segments just taken into result.
        i1 = i1p
        i2 = i2p

    htext = "".join(segs)

    return htext
0637
0638
# Dehybridization data by tick: {tick: (refmap, ijklen_min, ijklen_max)}.
_reflex_spec_dehyb_by_tick = dict((x[0], x[1:]) for x in _reflex_spec_dehyb)
0640
def _step_over_hyb(text, pos):
    """
    Compute the step from given position over a hybridized segment.

    If there is a jat-reflex tick followed by a known Ijekavian form
    at the position, the step covers the tick and the form.
    If a dialect alternatives directive starts at the position,
    the step leads to its closing delimiter.
    In all other cases, including malformed reflexes and directives,
    the step is one character.

    @param text: text to step through
    @type text: string
    @param pos: position to step from, must be within the text
    @type pos: int

    @returns: length of the step, at least 1
    @rtype: int
    """

    refspec = _reflex_spec_dehyb_by_tick.get(text[pos])
    if refspec is not None: # there is a reflex
        refmap, ijklen_min, ijklen_max = refspec
        start = pos + 1
        # Try Ijekavian forms from shortest to longest.
        for ijklen in range(ijklen_min, ijklen_max + 1):
            ijkfrm = text[start:start + ijklen]
            if refmap.get(ijkfrm) is not None:
                return 1 + len(ijkfrm)
        return 1 # malformed reflex

    # The directive head must be looked for at the current position,
    # not at the start of the text.
    if text.startswith(_dhyb_althead, pos): # there is a directive
        seppos = pos + len(_dhyb_althead)
        if seppos < len(text):
            sep = text[seppos]
            # str.find() returns -1 when the delimiter is missing.
            pos2 = text.find(sep, seppos + 1)
            if pos2 >= 0:
                pos3 = text.find(sep, pos2 + 1)
                if pos3 >= 0:
                    return pos3 - pos
        return 1 # malformed directive

    return 1 # there is plain text
0674
0675
def hictoec(text):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    clean Ekavian Cyrillic text [type F1A hook].
    """

    ekavian = hitoe(text)
    return hctoc(ekavian)
0683
0684
def hictoecq(text):
    """
    Like L{hictoec}, but does not output warnings on problems [type F1A hook].
    """

    ekavian = hitoeq(text)
    return hctoc(ekavian)
0691
0692
def hictoel(text):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    clean Ekavian Latin text [type F1A hook].
    """

    ekavian = hitoe(text)
    return hctol(ekavian)
0700
0701
def hictoic(text):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    clean Ijekavian Cyrillic text [type F1A hook].
    """

    ijekavian = hitoi(text)
    return hctoc(ijekavian)
0709
0710
def hictoicq(text):
    """
    Like L{hictoic}, but does not output warnings on problems [type F1A hook].
    """

    ijekavian = hitoiq(text)
    return hctoc(ijekavian)
0717
0718
def hictoil(text):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    clean Ijekavian Latin text [type F1A hook].
    """

    ijekavian = hitoi(text)
    return hctol(ijekavian)
0726
0727
def hictoall(htext):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    all four clean variants.

    Scripts are resolved first, and dialects on each of the results.

    @param htext: hybrid text
    @type htext: string

    @returns: Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic,
        and Ijekavian Latin text
    @rtype: (string, string, string, string)
    """

    cyrillic = hctoc(htext)
    latin = hctol(htext)

    ekvc = hitoe(cyrillic)
    ekvl = hitoe(latin)
    ijkc = hitoi(cyrillic)
    ijkl = hitoi(latin)

    return ekvc, ekvl, ijkc, ijkl
0745
0746
def _delimit(alts, delims):
    """
    Join alternatives into the body of an alternatives directive,
    using the first delimiter which none of the alternatives contains.

    @param alts: alternatives to join
    @type alts: [string...]
    @param delims: possible delimiter characters, in order of preference
    @type delims: string

    @returns: alternatives preceded, separated and followed by the delimiter
    @rtype: string

    @raises PologyError: if every delimiter is contained in some alternative
    """

    for delim in delims:
        if not any(delim in alt for alt in alts):
            return delim + delim.join(alts) + delim

    fmtalts = format_item_list(["{%s}" % x for x in alts])
    raise PologyError(
        _("@info",
          "No delimiter from '%(delimstr)s' can be used for "
          "alternatives directive containing: %(snippetlist)s.",
          delimstr=delims, snippetlist=fmtalts))
0768