# File indexing completed on 2024-05-12 17:18:07
# -*- coding: UTF-8 -*-

"""
Conversions between character sets in Serbian texts.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

from pology import _, n_
from pology.report import warning


# Characters representable in ISO-8859-5 (Latin/Cyrillic).
# No-break space (0xA0) and soft hyphen (0xAD) are written as escapes
# because they are invisible or ambiguous in an editor.
chset_iso8859_5 = set(
    " !\"#$%&'()*+,-./0123456789:;<=>?@"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
    "abcdefghijklmnopqrstuvwxyz{|}~\u00a0\u00ad"
    "ЁЂЃЄЅІЇЈЉЊЋЌЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
    "абвгдежзийклмнопрстуфхцчшщъыьэюя№ёђѓєѕіїјљњћќ§ўџ"
)

# Transliterations specific to ISO-8859-5; consulted before translit_ascii.
translit_iso8859_5 = {
}

# Characters representable in ISO-8859-2 (Latin-2).
# No-break space (0xA0) and soft hyphen (0xAD) are written as escapes
# because they are invisible or ambiguous in an editor.
chset_iso8859_2 = set(
    " !\"#$%&'()*+,-./0123456789:;<=>?@"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
    "abcdefghijklmnopqrstuvwxyz{|}~\u00a0\u00ad"
    "Ą˘Ł¤ĽŚ§¨ŠŞŤŹŽŻ°"
    "ą˛ł´ľśˇ¸šşťź˝žż"
    "ŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢß"
    "ŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙"
)

# Transliterations specific to ISO-8859-2; consulted before translit_ascii.
translit_iso8859_2 = {
    "×": "×",
}

# Generic fallback transliterations into plain ASCII, used for any target
# character set once its own transliteration table has no entry.
translit_ascii = {
    "—": "--",
    "–": "-",
    "„": "\"",
    "“": "\"",
    "‘": "'",
    "’": "'",
    "€": "EUR",
    "©": "c",
    "×": "x",
    "\u2011": "-", # non-breaking hyphen
    "\u00a0": " ", # no-break space
    "\u2009": "", # thin space
    "\u202f": "", # narrow no-break space
    "\u200b": "", # zero-width space
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",
    # TODO: Add more.
}

# Whitespace control characters that are kept as they are for every target
# character set: they are not listed in the graphic character sets above,
# yet line breaks and tabs must never be reported or turned into "?".
_chset_passthrough = frozenset("\t\n\r")


def limit_to_isocyr (text):
    """
    Limit characters to those available in ISO-8859-5 [type F1A hook].

    A character outside of the target character set is replaced by its
    transliteration if one is known. Otherwise a warning is reported
    and the character is replaced with a question mark.

    @param text: text to limit
    @type text: string

    @return: text
    @rtype: string
    """

    return _limit_to_chset(text,
                           chset_iso8859_5, translit_iso8859_5, "ISO-8859-5")


def limit_to_isolat (text):
    """
    Limit characters to those available in ISO-8859-2 [type F1A hook].

    A character outside of the target character set is replaced by its
    transliteration if one is known. Otherwise a warning is reported
    and the character is replaced with a question mark.

    @param text: text to limit
    @type text: string

    @return: text
    @rtype: string
    """

    return _limit_to_chset(text,
                           chset_iso8859_2, translit_iso8859_2, "ISO-8859-2")


def _limit_to_chset (text, chset, translit, cname):
    """
    Limit characters in the text to the given character set.

    Each character is handled by the first rule that applies:
      - it is in C{chset} or is a tab/newline/carriage return: kept,
      - it has an entry in C{translit}: replaced by that entry,
      - it has an entry in C{translit_ascii}: replaced by that entry,
      - otherwise: a warning is reported and C{"?"} is substituted.

    @param text: text to limit
    @type text: string
    @param chset: characters available in the target character set
    @type chset: set of strings
    @param translit: transliterations specific to the target character set
    @type translit: dict of string to string
    @param cname: name of the target character set, used in the warning
    @type cname: string

    @return: text with only available characters
    @rtype: string
    """

    ltext = []
    for c in text:
        if c in chset or c in _chset_passthrough:
            ltext.append(c)
            continue

        # The charset-specific table must come before translit_ascii,
        # so that it can override the generic ASCII fallback.
        ct = translit.get(c)
        if ct is None:
            ct = translit_ascii.get(c)
        if ct is not None:
            ltext.append(ct)
            continue

        # The message states what actually happens: the character is
        # substituted with "?", not dropped.
        warning(_("@info",
                  "Character '%(char)s' (%(code)s) cannot be transliterated "
                  "into character set %(charset)s, replacing it with '?'.",
                  char=c, code=("U+%X" % ord(c)), charset=cname))
        ltext.append("?")

    return "".join(ltext)