lang/sr/reduce.py

0001 # -*- coding: UTF-8 -*-
0002
0003 """
0004 Reductions of Serbian text convenient in various special uses.
0005
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009
0010 from pology.lang.sr.accents import remove_accents
0011 from pology.lang.sr.wconv import hictoecq, hictoicq
0012
0013
0014 _srcyr = "абвгдђежзијклљмнњопрстћуфхцчџшАБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ"
0015
0016
def words_ec (text):
    """
    Reduce text to space-separated Ekavian Cyrillic words [type F1A hook].

    The text is first passed through
    L{hictoecq()<lang.sr.wconv.hictoecq>}, which is expected to resolve
    any dialect and script hybridization into clean Ekavian Cyrillic,
    and then through
    L{remove_accents()<lang.sr.accents.remove_accents>},
    to strip accent marks.
    From the resulting text, words made up only of Serbian Cyrillic
    letters are picked out, sorted, and joined by single spaces.

    @param text: text to reduce
    @type text: string

    @returns: sorted Serbian Cyrillic words, separated by spaces
    @rtype: string
    """

    resolved = hictoecq(text)
    unaccented = remove_accents(resolved)
    return _words_w(unaccented)
0031
0032
def words_ec_lw (text):
    """
    Reduce text to space-separated Ekavian Cyrillic words, in lower case
    [type F1A hook].

    Same as L{words_ec}, except that the text is converted to lower case
    before it is reduced.

    @param text: text to reduce
    @type text: string

    @returns: sorted Serbian Cyrillic words, separated by spaces
    @rtype: string
    """

    lowered = text.lower()
    return words_ec(lowered)
0042
0043
def words_ic (text):
    """
    Reduce text to space-separated Ijekavian Cyrillic words [type F1A hook].

    Works like L{words_ec}, except that the text is passed through
    L{hictoicq()<lang.sr.wconv.hictoicq>} instead, so that a hybrid text
    is expected to resolve into clean Ijekavian Cyrillic.

    @param text: text to reduce
    @type text: string

    @returns: sorted Serbian Cyrillic words, separated by spaces
    @rtype: string
    """

    resolved = hictoicq(text)
    unaccented = remove_accents(resolved)
    return _words_w(unaccented)
0053
0054
def words_ic_lw (text):
    """
    Reduce text to space-separated Ijekavian Cyrillic words, in lower case
    [type F1A hook].

    Same as L{words_ic}, except that the text is converted to lower case
    before it is reduced.

    @param text: text to reduce
    @type text: string

    @returns: sorted Serbian Cyrillic words, separated by spaces
    @rtype: string
    """

    lowered = text.lower()
    return words_ic(lowered)
0064
0065
0066 def _dlc_select (w):
0067
0068     return "е" in w or "и" in w
0069     # ...no len(w) >= 3 because an accelerator marker may have split the word.
0070
0071
def words_ic_lw_dlc (text):
    """
    Reduce text to space-separated Ijekavian Cyrillic words which contain
    the letter 'е' or 'и', in lower case [type F1A hook].

    The text is converted to lower case, passed through
    L{hictoicq()<lang.sr.wconv.hictoicq>} and
    L{remove_accents()<lang.sr.accents.remove_accents>},
    and then reduced to sorted Serbian Cyrillic words like in L{words_ic},
    but keeping only the words which contain 'е' or 'и'.
    No minimum word length is imposed, since an accelerator marker
    may have split a word into shorter pieces.

    @param text: text to reduce
    @type text: string

    @returns: sorted selected words, separated by spaces
    @rtype: string
    """

    resolved = hictoicq(text.lower())
    unaccented = remove_accents(resolved)
    return _words_w(unaccented, select=_dlc_select)
0083
0084
def _words_w (text, select=None):
    """
    Extract Serbian Cyrillic words from text into a sorted string.

    A word is a maximal run of consecutive alphabetic characters.
    A word is kept only if each of its characters is found in C{_srcyr}
    and, when a selector is given, the selector returns a true value
    for it.

    @param text: text to extract words from
    @type text: string
    @param select: optional selector called with each all-Cyrillic word
    @type select: (string) -> bool

    @returns: kept words, sorted and joined by single spaces
    @rtype: string
    """

    # Split the text into runs of alphabetic characters.
    runs = []
    letters = []
    for ch in text:
        if ch.isalpha():
            letters.append(ch)
        elif letters:
            runs.append("".join(letters))
            letters = []
    if letters:
        # The text ended in the middle of a word.
        runs.append("".join(letters))

    # The selector is consulted only for words that are fully
    # Serbian Cyrillic, in order of their appearance in the text.
    kept = [run for run in runs
            if all(ch in _srcyr for ch in run)
            and (not select or select(run))]

    return " ".join(sorted(kept))
0106