File indexing completed on 2024-05-12 05:47:03

0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Conversions between character sets in Serbian texts.
0005 
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009 
0010 from pology import _, n_
0011 from pology.report import warning
0012 
0013 
# Characters that can be kept verbatim when limiting text to ISO-8859-5.
chset_iso8859_5 = set(
    # ASCII whitespace controls: encodable in ISO-8859-5, and without them
    # every line break or tab in the text would be reported as
    # non-transliterable and replaced.
    "\t\n\r"
    # Printable ASCII (0x20-0x7E), followed by no-break space (0xA0).
    " !\"#$%&'()*+,-./0123456789:;<=>?@"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
    "abcdefghijklmnopqrstuvwxyz{|}~\u00a0"
    # Upper half of the code page (0xA1-0xFF); the soft hyphen (0xAD)
    # is written as an escape because it is invisible in most editors.
    "ЁЂЃЄЅІЇЈЉЊЋЌ\u00adЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
    "абвгдежзийклмнопрстуфхцчшщъыьэюя№ёђѓєѕіїјљњћќ§ўџ"
)
0021 
# Replacements specific to ISO-8859-5, consulted before translit_ascii.
# None are defined at present, so only the generic ASCII fallbacks apply.
translit_iso8859_5 = dict()
0024 
# Characters that can be kept verbatim when limiting text to ISO-8859-2.
chset_iso8859_2 = set(
    # ASCII whitespace controls: encodable in ISO-8859-2, and without them
    # every line break or tab in the text would be reported as
    # non-transliterable and replaced.
    "\t\n\r"
    # Printable ASCII (0x20-0x7E), followed by no-break space (0xA0).
    " !\"#$%&'()*+,-./0123456789:;<=>?@"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
    "abcdefghijklmnopqrstuvwxyz{|}~\u00a0"
    # Upper half of the code page (0xA1-0xFF); the soft hyphen (0xAD)
    # is written as an escape because it is invisible in most editors.
    "Ą˘Ł¤ĽŚ§¨ŠŞŤŹ\u00adŽŻ°"
    "ą˛ł´ľśˇ¸šşťź˝žż"
    "ŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢß"
    "ŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙"
)
0034 
# Replacements specific to ISO-8859-2, consulted before translit_ascii.
# The multiplication sign maps to itself, so that it takes precedence
# over the generic ASCII table, which would turn it into "x".
translit_iso8859_2 = {"×": "×"}
0038 
# Generic fallbacks into plain ASCII, used when a character is neither in
# the target character set nor in its set-specific transliteration table.
translit_ascii = {
    # Dashes and hyphens.
    "—": "--",
    "–": "-",
    "\u2011": "-",  # non-breaking hyphen

    # Quotation marks.
    "„": "\"",
    "“": "\"",
    "‘": "'",
    "’": "'",

    # Symbols.
    "€": "EUR",
    "©": "c",
    "×": "x",

    # Spaces: the no-break space becomes an ordinary space,
    # the thin and zero-width ones are dropped.
    "\u00a0": " ",  # no-break space
    "\u2009": "",  # thin space
    "\u202f": "",  # narrow no-break space
    "\u200b": "",  # zero-width space

    # Umlauts, expanded into digraphs.
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",

    # TODO: Add more.
}
0060 
0061 
def limit_to_isocyr(text):
    """
    Restrict text to the ISO-8859-5 character repertoire [type F1A hook].

    Characters found in C{chset_iso8859_5} are kept as they are.
    Any other character is transliterated if possible, first through
    C{translit_iso8859_5} and then through the generic ASCII table.
    For a character which cannot be transliterated either way,
    a warning is reported and a substitute is put into the result.

    @param text: text to limit
    @type text: string

    @return: text
    @rtype: string
    """

    charset_name = "ISO-8859-5"
    limited = _limit_to_chset(
        text, chset_iso8859_5, translit_iso8859_5, charset_name
    )
    return limited
0075 
0076 
def limit_to_isolat(text):
    """
    Restrict text to the ISO-8859-2 character repertoire [type F1A hook].

    Characters found in C{chset_iso8859_2} are kept as they are.
    Any other character is transliterated if possible, first through
    C{translit_iso8859_2} and then through the generic ASCII table.
    For a character which cannot be transliterated either way,
    a warning is reported and a substitute is put into the result.

    @param text: text to limit
    @type text: string

    @return: text
    @rtype: string
    """

    charset_name = "ISO-8859-2"
    limited = _limit_to_chset(
        text, chset_iso8859_2, translit_iso8859_2, charset_name
    )
    return limited
0090 
0091 
def _limit_to_chset(text, chset, translit, cname):
    """
    Limit characters in text to those of the given character set.

    Each character is handled by the first rule that applies:
      1. it is a member of C{chset}: kept as it is;
      2. it has an entry in C{translit}: replaced by that entry;
      3. it has an entry in C{translit_ascii}: replaced by that entry;
      4. otherwise a warning is reported and "?" is put in its place.

    @param text: text to limit
    @type text: string
    @param chset: characters available in the target character set
    @type chset: set of strings
    @param translit: transliterations specific to the target character set
    @type translit: dict of string to string
    @param cname: name of the target character set, shown in the warning
    @type cname: string

    @return: limited text
    @rtype: string
    """

    ltext = []
    for c in text:
        if c in chset:
            ltext.append(c)
            continue

        # The set-specific table must be consulted before translit_ascii,
        # so that it can override the generic replacements.
        ct = translit.get(c)
        if ct is None:
            ct = translit_ascii.get(c)
        if ct is None:
            # The message states what actually happens to the character:
            # it is substituted with a question mark, not dropped.
            warning(_("@info",
                      "Character '%(char)s' (%(code)s) cannot be "
                      "transliterated into character set %(charset)s, "
                      "replacing it with '?'.",
                      char=c, code=("U+%X" % ord(c)), charset=cname))
            ct = "?"
        ltext.append(ct)

    return "".join(ltext)
0114