File indexing completed on 2024-05-12 05:47:03
# -*- coding: UTF-8 -*-

"""
Reductions of Serbian text convenient in various special uses.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

from itertools import groupby

from pology.lang.sr.accents import remove_accents
from pology.lang.sr.wconv import hictoecq, hictoicq


# Letters of the Serbian Cyrillic alphabet, lower case followed by upper case.
_srcyr = "абвгдђежзијклљмнњопрстћуфхцчџшАБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ"

# Same letters as a set, so that per-character membership tests
# in _words_w() do not scan the whole alphabet string.
_srcyr_set = frozenset(_srcyr)


def words_ec(text):
    """
    Reduce text to space-separated Ekavian Cyrillic words [type F1A hook].

    Words containing only Serbian Cyrillic characters are extracted,
    sorted, and joined by spaces into a string.
    In case the text contains dialect and script hybridization,
    it is passed through L{hictoecq()<lang.sr.wconv.hictoecq>}
    to resolve it into clean Ekavian Cyrillic.
    In case the text contains accent marks, it is passed through
    L{remove_accents()<lang.sr.accents.remove_accents>} to remove them.

    @param text: text to reduce
    @type text: string

    @return: sorted words joined by single spaces
        (empty string if no word qualifies)
    @rtype: string
    """

    return _words_w(remove_accents(hictoecq(text)))


def words_ec_lw(text):
    """
    Reduce text to space-separated Ekavian Cyrillic words, in lower case
    [type F1A hook].

    Like L{words_ec}, but the text is lowercased before it is reduced.

    @param text: text to reduce
    @type text: string

    @return: sorted lowercase words joined by single spaces
    @rtype: string
    """

    return words_ec(text.lower())


def words_ic(text):
    """
    Reduce text to space-separated Ijekavian Cyrillic words [type F1A hook].

    Like L{words_ec}, but if the text was hybrid it is resolved into
    clean Ijekavian Cyrillic (see L{hictoicq()<lang.sr.wconv.hictoicq>}).

    @param text: text to reduce
    @type text: string

    @return: sorted words joined by single spaces
        (empty string if no word qualifies)
    @rtype: string
    """

    return _words_w(remove_accents(hictoicq(text)))


def words_ic_lw(text):
    """
    Reduce text to space-separated Ijekavian Cyrillic words, in lower case
    [type F1A hook].

    Like L{words_ic}, but the text is lowercased before it is reduced.

    @param text: text to reduce
    @type text: string

    @return: sorted lowercase words joined by single spaces
    @rtype: string
    """

    return words_ic(text.lower())


def _dlc_select(w):
    """
    Check whether the word contains Cyrillic 'е' or 'и'.

    @param w: word to check
    @type w: string

    @return: C{True} if at least one of the two letters is present
    @rtype: bool
    """

    # ...no len(w) >= 3 because an accelerator marker may have split the word.
    return "е" in w or "и" in w


def words_ic_lw_dlc(text):
    """
    Reduce text to space-separated Ijekavian Cyrillic words containing
    'е' or 'и', in lower case [type F1A hook].

    Like L{words_ic_lw}, but only words containing at least one
    of the letters 'е' or 'и' are retained.
    No minimum word length is imposed, because an accelerator marker
    may have split a word into shorter pieces.

    @param text: text to reduce
    @type text: string

    @return: sorted lowercase words joined by single spaces
    @rtype: string
    """

    return _words_w(remove_accents(hictoicq(text.lower())),
                    select=_dlc_select)


def _words_w(text, select=None):
    """
    Extract pure Serbian Cyrillic words from text.

    The text is split into maximal runs of alphabetic characters.
    A run is retained if all its characters are Serbian Cyrillic letters
    and, when a selector is given, the selector accepts it.
    Retained words are sorted by default string ordering;
    repeated words are not collapsed.

    @param text: text to extract words from
    @type text: string
    @param select: selector deciding whether a word is retained,
        or C{None} to retain every pure Serbian Cyrillic word
    @type select: (string) -> bool

    @return: sorted words joined by single spaces
    @rtype: string
    """

    words = []
    for is_alpha, run in groupby(text, key=lambda char: char.isalpha()):
        if not is_alpha:
            # Run of separators (spaces, punctuation, digits...).
            continue
        word = "".join(run)
        # Any letter outside the alphabet (e.g. Latin) disqualifies
        # the whole word rather than splitting it.
        if not _srcyr_set.issuperset(word):
            continue
        if select is None or select(word):
            words.append(word)

    return " ".join(sorted(words))