File indexing completed on 2024-05-12 05:47:03

0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Process letter accents in Serbian Cyrillic text.
0005 
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009 
# All accented letters in Serbian Cyrillic, for a given non-accented letter.
# Keys are the bare letters (а, е, и, о, у, р) in lower and upper case;
# each value is a tuple of every accented spelling of that letter which
# should be recognized. Most spellings are the base letter followed by
# a combining diacritic (grave, acute, double grave, inverted breve,
# macron, circumflex).
# NOTE(review): some spellings within one tuple render identically;
# presumably they differ only in Unicode normalization form (combining
# sequence vs. precomposed character) -- confirm before deduplicating.
_accents = {
    "а": ("а̀", "а́", "а̏", "а̑", "а̄", "а̂", "â", "ȃ"),
    "А": ("А̀", "А́", "А̏", "А̑", "А̄", "А̂", "Â", "Ȃ"),
    # ...the two trailing entries for а/А are precomposed (NFC) Latin
    # letters, sometimes used as a makeshift for the long-falling and
    # genitive accents.
    "е": ("ѐ", "е́", "е̏", "е̑", "е̄", "е̂", "ѐ"),
    "Е": ("Ѐ", "Е́", "Е̏", "Е̑", "Е̄", "Е̂", "Ѐ"),
    "и": ("ѝ", "и́", "и̏", "и̑", "ӣ", "и̂", "ѝ", "ӣ"),
    "И": ("Ѝ", "И́", "И̏", "И̑", "Ӣ", "И̂", "Ѝ", "Ӣ"),
    "о": ("о̀", "о́", "о̏", "о̑", "о̄", "о̂", "ȏ", "ô"),
    "О": ("О̀", "О́", "О̏", "О̑", "О̄", "О̂", "Ȏ", "Ô"),
    # ...the two trailing entries for о/О are precomposed (NFC) Latin
    # letters, sometimes used as a makeshift for the long-falling and
    # genitive accents.
    "у": ("у̀", "у́", "у̏", "у̑", "ӯ", "у̂", "ӯ"),
    "У": ("У̀", "У́", "У̏", "У̑", "Ӯ", "У̂", "Ӯ"),
    "р": ("р̀", "р́", "р̏", "р̑", "р̄", "р̂"),
    "Р": ("Р̀", "Р́", "Р̏", "Р̑", "Р̄", "Р̂"),
}
0027 
# All accented letters bunched together,
# and inverted mapping (base letter for each accented letter).
# Comprehensions are used instead of explicit loops so that no loop
# variable is left behind in the module namespace (do not pollute exports).
_accents_flat = {
    accent for accents in _accents.values() for accent in accents
}
_accents_inverted = {
    accent: base
    for base, accents in _accents.items()
    for accent in accents
}

# Lengths (in code points) of accented spellings to try when matching,
# from the longest down to the shortest.
_max_accent_len = max(map(len, _accents_flat))
_min_accent_len = min(map(len, _accents_flat))
_accent_len_range = list(range(_max_accent_len, _min_accent_len - 1, -1))
0041 
# FIXME: The graphing sequences with slashes and backslashes are far
# too easy to happen accidentally; think of something better.
# Table of accent graphs that is not referenced by any function in the
# visible code (hence the name). Each key is an ASCII marker (one or two
# backslashes, one or two slashes, or a tilde) followed by a base letter;
# each value is that letter carrying the corresponding accent.
# Keys are raw strings, so every backslash in them is literal.
_agraphs_unused = {
    r"\а": r"а̀",
    r"/а": r"а́",
    r"\\а": r"а̏",
    r"//а": r"а̑",
    r"~а": r"а̄",
    r"\А": r"А̀",
    r"/А": r"А́",
    r"\\А": r"А̏",
    r"//А": r"А̑",
    r"~А": r"А̄",

    r"\е": r"ѐ",
    r"/е": r"е́",
    r"\\е": r"е̏",
    r"//е": r"е̑",
    r"~е": r"е̄",
    r"\Е": r"Ѐ",
    r"/Е": r"Е́",
    r"\\Е": r"Е̏",
    r"//Е": r"Е̑",
    r"~Е": r"Е̄",

    r"\и": r"ѝ",
    r"/и": r"и́",
    r"\\и": r"и̏",
    r"//и": r"и̑",
    r"~и": r"ӣ",
    r"\И": r"Ѝ",
    r"/И": r"И́",
    r"\\И": r"И̏",
    r"//И": r"И̑",
    r"~И": r"Ӣ",

    r"\о": r"о̀",
    r"/о": r"о́",
    r"\\о": r"о̏",
    r"//о": r"о̑",
    r"~о": r"о̄",
    r"\О": r"О̀",
    r"/О": r"О́",
    r"\\О": r"О̏",
    r"//О": r"О̑",
    r"~О": r"О̄",

    r"\у": r"у̀",
    r"/у": r"у́",
    r"\\у": r"у̏",
    r"//у": r"у̑",
    r"~у": r"ӯ",
    r"\У": r"У̀",
    r"/У": r"У́",
    r"\\У": r"У̏",
    r"//У": r"У̑",
    r"~У": r"Ӯ",

    r"\р": r"р̀",
    r"/р": r"р́",
    r"\\р": r"р̏",
    r"//р": r"р̑",
    r"~р": r"р̄",
    r"\Р": r"Р̀",
    r"/Р": r"Р́",
    r"\\Р": r"Р̏",
    r"//Р": r"Р̑",
    r"~Р": r"Р̄",
}
0111 
# Accent graphs currently in effect: a caret followed by the base letter,
# mapped to the accented letter it stands for.
# The intended targets are the Cyrillic letters followed by a combining
# circumflex, but Latin NFC forms are used at places for the moment.
_agraphs = {
    r"^а": r"â",
    r"^о": r"ô",
    r"^А": r"Â",
    r"^О": r"Ô",
}

# Graph lengths to try when matching, from the longest down to the shortest.
_max_agraph_len = max(len(graph) for graph in _agraphs)
_min_agraph_len = min(len(graph) for graph in _agraphs)
_agraph_len_range = list(
    reversed(range(_min_agraph_len, _max_agraph_len + 1))
)
0127 
0128 
def resolve_agraphs (text):
    """
    Replace accent graphs in the text with real accented letters
    [type F1A hook].

    Typing accented Cyrillic letters directly on a keyboard is still not
    widely possible, so graphical accent-letter representations
    may be written instead; this hook turns every such representation
    listed in the module's graph table into its Unicode composition.

    @note: At the moment, only genitive endings are supported.

    @param text: text possibly containing accent graphs

    @return: text
    """

    resolved = _apply_mapping(text, _agraphs, _agraph_len_range)
    return resolved
0143 
0144 
def remove_accents (text):
    """
    Strip accents off every accented letter in the text [type F1A hook].

    Working on unaccented text is sometimes more convenient,
    e.g. when checking spelling. Each accented spelling known to
    the module is replaced with its plain base letter.

    @param text: text possibly containing accented letters

    @return: text
    """

    stripped = _apply_mapping(text, _accents_inverted, _accent_len_range)
    return stripped
0156 
0157 
def _apply_mapping (text, mapping, mlenrange):
    """
    Replace substrings of the text according to the given mapping.

    The text is scanned left to right. At each position, substrings of
    the lengths given by C{mlenrange} are looked up in C{mapping}, in the
    order in which the lengths are listed; the first one found is replaced
    and scanning resumes right after it. If none is found, the character
    at that position is kept and scanning moves one position forward.

    A replacement may be the empty string, in which case the matched
    substring is removed from the text.

    @param text: text to process
    @param mapping: replacement for each substring to be replaced
    @param mlenrange: substring lengths to try at each position,
        in order of priority (normally longest first)

    @return: text with replacements applied
    """

    segments = []
    text_len = len(text)
    pos = 0 # current scan position
    seg_start = 0 # start of the pending, not yet emitted plain text
    while pos < text_len:
        for mlen in mlenrange:
            candidate = text[pos:pos + mlen]
            if not candidate:
                # Zero-length candidate could never advance the scan.
                continue
            replacement = mapping.get(candidate)
            # Compare against None, so that empty replacements count too.
            if replacement is not None:
                segments.append(text[seg_start:pos])
                segments.append(replacement)
                # Near the end of the text the candidate may be shorter
                # than mlen, so advance by what was actually consumed.
                pos += len(candidate)
                seg_start = pos
                break
        else:
            pos += 1
    segments.append(text[seg_start:])

    return "".join(segments)
0178