# File indexing completed on 2024-05-12 17:18:07
# -*- coding: UTF-8 -*-

"""
Equip text with no-break characters where possibly helpful.

The way text is wrapped in UI, by a general wrapping algorithm,
is sometimes really not appropriate for Serbian orthography.
For example, hyphen-separated case ending should not be wrapped.
This module contains functions to heuristically replace ordinary
with no-break characters, where such bad breaks can be expected.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import re

from pology import PologyError
from pology.lang.sr.wconv import ctol


# U+2011 NON-BREAKING HYPHEN, substituted for the ordinary hyphen-minus.
nobrhyp_char = "\u2011"


def to_nobr_hyphens(mode=0, wchars="", unsafe=False):
    """
    Replace some ordinary hyphens with no-break hyphens [hook factory].

    An ordinary hyphen is replaced in one of the following modes,
    as given by the C{mode} parameter:
      - 0: if the hyphen is in between two letters, and either preceded
            or followed by at most four letters
      - 1: if the hyphen is in between two letters and followed by
            exactly one letter

    Using the C{wchars} parameter, some extra characters other than letters
    can be treated as equal to letters. The characters are taken literally:
    they are escaped before being inserted into the regular expressions,
    so characters like C{]}, C{^} or backslash are safe to pass.
    Any hyphen in C{wchars} is ignored.

    Note that the function by default substitutes the hyphen only if
    there are some Cyrillic letters (or an extra character) in the context,
    as otherwise the hyphen may be a part of URL, command, etc.
    This can be relaxed by setting C{unsafe} to C{True},
    when all letters are treated equally.

    @param mode: replacement mode
    @type mode: int
    @param wchars: extra characters to consider parts of the word
    @type wchars: string
    @param unsafe: whether to replace hyphen even if no Cyrillic letters nearby
    @type unsafe: bool

    @return: type F1A hook
    @rtype: C{(text) -> text}

    @raise PologyError: if C{mode} is not one of the known modes
    """

    # The hyphen is the separator being matched, so it can never be
    # an extra word character.
    wchars = wchars.replace("-", "")
    # Escaped form for use inside a regex character class; without this,
    # characters such as "]" or "^" would alter or break the pattern.
    wchars_rx = re.escape(wchars)
    # Set form for per-character membership tests in the safe-mode check.
    wchars_set = frozenset(wchars)

    if mode == 0:
        nobrhyp_rxstrs = [
            # Catching possible replacement by text before hyphen.
            r"\b(\w{1,4})(-)([\w%s])" % wchars_rx,
            # Catching possible replacement by text after hyphen.
            r"([\w%s])(-)(\w{1,4})\b" % wchars_rx,
        ]
    elif mode == 1:
        nobrhyp_rxstrs = [
            # Catching possible replacement by text after hyphen.
            r"([\w%s])(-)(\w{1})\b" % wchars_rx,
        ]
    else:
        # NOTE(review): "_" is not imported in the visible part of this
        # module; presumably it is provided by pology or installed as
        # a builtin -- confirm, otherwise this path raises NameError.
        raise PologyError(
            _("@info",
              "Unknown hyphen replacement mode %(mode)s.",
              mode=mode))
    nobrhyp_rxs = [re.compile(x, re.U) for x in nobrhyp_rxstrs]

    # Function to produce replacement for matched pattern.
    if not unsafe:
        def nobrhyp_repl(m):
            # Replace hyphen with no-break hyphen only if there is at least
            # one Cyrillic letter in the match (detected by ctol() changing
            # the text), or one of the extra characters on either side
            # of the hyphen.
            matched = m.group()
            has_extra = any(c in wchars_set
                            for c in m.group(1) + m.group(3))
            if has_extra or ctol(matched) != matched:
                return m.group(1) + nobrhyp_char + m.group(3)
            return matched
    else:
        def nobrhyp_repl(m):
            # Replace hyphen with no-break hyphen unconditionally.
            return m.group(1) + nobrhyp_char + m.group(3)

    def hook(text):

        # Quick check, is there any hyphen at all in the string?
        if "-" not in text:
            return text

        # Replace as long as the string changes, as there are situations
        # that the regexes will not catch in one pass (e.g. аб-вг-дђ).
        # Every substitution removes one ordinary hyphen, so this terminates.
        while True:
            text_prev = text
            for nobrhyp_rx in nobrhyp_rxs:
                text = nobrhyp_rx.sub(nobrhyp_repl, text)
            if text == text_prev:
                return text

    return hook