# File indexing completed on 2024-05-12 05:47:03
# -*- coding: UTF-8 -*-

"""
Process letter accents in Serbian Cyrillic text.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

# Combining marks that produce the six accented variants of every base
# letter, in the order in which the variants appear in the _accents tuples.
# They are spelled as escapes on purpose: a decomposed letter and its
# precomposed look-alike are visually identical, so literal glyphs could be
# silently merged by an editor or a tool that normalizes Unicode.
_combining_marks = (
    "\u0300",  # COMBINING GRAVE ACCENT
    "\u0301",  # COMBINING ACUTE ACCENT
    "\u030f",  # COMBINING DOUBLE GRAVE ACCENT
    "\u0311",  # COMBINING INVERTED BREVE
    "\u0304",  # COMBINING MACRON
    "\u0302",  # COMBINING CIRCUMFLEX ACCENT
)

# Single code point (precomposed) accented forms, for a given non-accented
# letter, which are recognized in addition to the combining sequences.
_precomposed_accents = {
    # ...with Latin long-falling/genitive a in NFC, used sometimes as makeshift
    "а": ("\u00e2", "\u0203"),  # Latin a with circumflex, inverted breve
    "А": ("\u00c2", "\u0202"),  # Latin A with circumflex, inverted breve
    "е": ("\u0450",),  # Cyrillic ie with grave
    "Е": ("\u0400",),  # Cyrillic IE with grave
    "и": ("\u045d", "\u04e3"),  # Cyrillic i with grave, with macron
    "И": ("\u040d", "\u04e2"),  # Cyrillic I with grave, with macron
    # ...with Latin long-falling/genitive o in NFC, used sometimes as makeshift
    "о": ("\u020f", "\u00f4"),  # Latin o with inverted breve, circumflex
    "О": ("\u020e", "\u00d4"),  # Latin O with inverted breve, circumflex
    "у": ("\u04ef",),  # Cyrillic u with macron
    "У": ("\u04ee",),  # Cyrillic U with macron
    "р": (),
    "Р": (),
}

# All accented letters in Serbian Cyrillic, for a given non-accented letter:
# first the six combining sequences, then any precomposed forms.
_accents = {
    base: tuple(base + mark for mark in _combining_marks) + precomposed
    for base, precomposed in _precomposed_accents.items()
}

# All accented letters bunched together,
# and inverted mapping (base for each accented letter).
# Comprehensions are used so that no loop variable is left behind
# in the module namespace (do not pollute exports).
_accents_flat = {
    accented for variants in _accents.values() for accented in variants
}
_accents_inverted = {
    accented: base
    for base, variants in _accents.items()
    for accented in variants
}

# Lengths of accented letters to try when matching, longest first.
_max_accent_len = max(map(len, _accents_flat))
_min_accent_len = min(map(len, _accents_flat))
_accent_len_range = list(range(_max_accent_len, _min_accent_len - 1, -1))

# FIXME: The graphing sequences with slashes and backslashes are far
# too easy to happen accidentally; think of something better.
_agraphs_unused = {
    r"\а": r"а̀",
    r"/а": r"а́",
    r"\\а": r"а̏",
    r"//а": r"а̑",
    r"~а": r"а̄",
    r"\А": r"А̀",
    r"/А": r"А́",
    r"\\А": r"А̏",
    r"//А": r"А̑",
    r"~А": r"А̄",

    r"\е": r"ѐ",
    r"/е": r"е́",
    r"\\е": r"е̏",
    r"//е": r"е̑",
    r"~е": r"е̄",
    r"\Е": r"Ѐ",
    r"/Е": r"Е́",
    r"\\Е": r"Е̏",
    r"//Е": r"Е̑",
    r"~Е": r"Е̄",

    r"\и": r"ѝ",
    r"/и": r"и́",
    r"\\и": r"и̏",
    r"//и": r"и̑",
    r"~и": r"ӣ",
    r"\И": r"Ѝ",
    r"/И": r"И́",
    r"\\И": r"И̏",
    r"//И": r"И̑",
    r"~И": r"Ӣ",

    r"\о": r"о̀",
    r"/о": r"о́",
    r"\\о": r"о̏",
    r"//о": r"о̑",
    r"~о": r"о̄",
    r"\О": r"О̀",
    r"/О": r"О́",
    r"\\О": r"О̏",
    r"//О": r"О̑",
    r"~О": r"О̄",

    r"\у": r"у̀",
    r"/у": r"у́",
    r"\\у": r"у̏",
    r"//у": r"у̑",
    r"~у": r"ӯ",
    r"\У": r"У̀",
    r"/У": r"У́",
    r"\\У": r"У̏",
    r"//У": r"У̑",
    r"~У": r"Ӯ",

    r"\р": r"р̀",
    r"/р": r"р́",
    r"\\р": r"р̏",
    r"//р": r"р̑",
    r"~р": r"р̄",
    r"\Р": r"Р̀",
    r"/Р": r"Р́",
    r"\\Р": r"Р̏",
    r"//Р": r"Р̑",
    r"~Р": r"Р̄",
}

# Accent graphs which are currently resolved: caret followed by letter.
_agraphs = {
    # The proper targets would be the Cyrillic letter followed by
    # the combining circumflex ("^а" -> "а̂", "^о" -> "о̂", and same
    # for capitals),
    # ...use Latin NFC forms at places for the moment.
    # Values are spelled as escapes to guarantee the precomposed form.
    "^а": "\u00e2",  # Latin a with circumflex
    "^о": "\u00f4",  # Latin o with circumflex
    "^А": "\u00c2",  # Latin A with circumflex
    "^О": "\u00d4",  # Latin O with circumflex
}

# Lengths of accent graphs to try when matching, longest first.
_max_agraph_len = max(map(len, _agraphs))
_min_agraph_len = min(map(len, _agraphs))
_agraph_len_range = list(range(_max_agraph_len, _min_agraph_len - 1, -1))


def resolve_agraphs(text):
    """
    Convert accent graphs into real accented letters [type F1A hook].

    Accented Cyrillic letters still cannot be widely entered directly
    by keyboard, and in such cases this module allows converting graphical
    accent-letter representations into actual Unicode compositions.

    @note: At the moment, only genitive endings are supported.

    @param text: text in which to resolve accent graphs
    @type text: string

    @return: text
    """

    return _apply_mapping(text, _agraphs, _agraph_len_range)


def remove_accents(text):
    """
    Remove accents from all accented letters [type F1A hook].

    Sometimes it is convenient to operate on text without accents,
    e.g. when checking spelling.

    @param text: text from which to remove accents
    @type text: string

    @return: text
    """

    return _apply_mapping(text, _accents_inverted, _accent_len_range)


def _apply_mapping(text, mapping, mlenrange):
    """
    Replace every substring of the text found in the mapping.

    The text is scanned left to right. At each position the candidate
    lengths from C{mlenrange} are tried in the given order, and the first
    candidate substring which is a key in the mapping is replaced by its
    value; scanning then continues right after the replaced substring.
    Replaced text is never rescanned.

    @param text: text to process
    @type text: string
    @param mapping: replacement for each substring to replace;
        a replacement may be an empty string, which removes the substring
    @type mapping: {string: string}
    @param mlenrange: lengths of substrings to try at each position,
        in order of preference (normally longest first)
    @type mlenrange: sequence of int

    @return: text with replacements applied
    @rtype: string
    """

    segments = []
    textlen = len(text)
    pos = 0  # current scanning position
    copied = 0  # end of the part of the text already put into segments
    while pos < textlen:
        for mlen in mlenrange:
            if pos + mlen > textlen:
                # Not enough text left for a candidate of this length;
                # a truncated slice must not be taken for a shorter key.
                continue
            replacement = mapping.get(text[pos:pos + mlen])
            # Compare to None rather than test truth, so that
            # an empty replacement still counts as a match.
            if replacement is not None:
                segments.append(text[copied:pos])
                segments.append(replacement)
                pos += mlen
                copied = pos
                break
        else:
            # Nothing matched at this position, move on by one character.
            pos += 1
    segments.append(text[copied:])

    return "".join(segments)