lang/sr/nobr.py

0001 # -*- coding: UTF-8 -*-
0002
0003 """
0004 Equip text with no-break characters where possibly helpful.
0005
0006 The way text is wrapped in UI, by a general wrapping algorithm,
0007 is sometimes really not appropriate for Serbian orthography.
0008 For example, hyphen-separated case ending should not be wrapped.
0009 This module contains functions to heuristically replace ordinary
0010 with no-break characters, where such bad breaks can be expected.
0011
0012 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0013 @license: GPLv3
0014 """
0015
0016 import re
0017
0018 from pology import PologyError
0019 from pology.lang.sr.wconv import ctol
0020
0021
# No-break hyphen (U+2011), substituted for ordinary hyphens by the hook.
nobrhyp_char = "\u2011"


def to_nobr_hyphens (mode=0, wchars="", unsafe=False):
    """
    Replace some ordinary hyphens with no-break hyphens [hook factory].

    An ordinary hyphen is replaced in one of the following modes,
    as given by the C{mode} parameter:
      - 0: if the hyphen is in between two letters, and either preceded
            or followed by at most four letters
      - 1: if the hyphen is in between two letters and followed by
            exactly one letter

    Using the C{wchars} parameter, some extra characters other than letters
    can be treated as equal to letters. They are always taken literally,
    even if they have a special meaning in regular expressions;
    any hyphen among them is ignored.

    Note that the function by default substitutes the hyphen only if
    there are some Cyrillic letters (or an extra character) in the context,
    i.e. in the text matched on either side of the hyphen,
    as otherwise the hyphen may be a part of URL, command, etc.
    This can be relaxed by setting C{unsafe} to C{True},
    when all letters are treated equally.

    @param mode: replacement mode
    @type mode: int
    @param wchars: extra characters to consider parts of the word
    @type wchars: string
    @param unsafe: whether to replace hyphen even if no Cyrillic letters nearby
    @type unsafe: bool

    @return: type F1A hook
    @rtype: C{(text) -> text}

    @raise PologyError: if C{mode} is not one of the known modes
    """

    # The hyphen itself can never act as an extra word character.
    wchars = wchars.replace("-", "")
    # Escape the extra characters before placing them into a regex
    # character class, so that e.g. "]" or "\" are matched literally
    # instead of changing (or breaking) the pattern.
    wchars_rx = re.escape(wchars)

    if mode == 0:
        nobrhyp_rxstrs = [
            # Catching possible replacement by text before hyphen.
            r"\b(\w{1,4})(-)([\w%s])" % wchars_rx,
            # Catching possible replacement by text after hyphen.
            r"([\w%s])(-)(\w{1,4})\b" % wchars_rx,
        ]
    elif mode == 1:
        nobrhyp_rxstrs = [
            # Catching possible replacement by text after hyphen.
            r"([\w%s])(-)(\w{1})\b" % wchars_rx,
        ]
    else:
        # NOTE(review): `_` is not imported in the visible part of this file;
        # presumably it is made available elsewhere -- confirm.
        raise PologyError(
            _("@info",
              "Unknown hyphen replacement mode %(mode)s.",
              mode=mode))
    nobrhyp_rxs = [re.compile(x, re.U) for x in nobrhyp_rxstrs]

    # Function to produce replacement for matched pattern.
    def nobrhyp_repl (m):
        before, after = m.group(1), m.group(3)
        # Replace hyphen with no-break hyphen unconditionally in unsafe mode;
        # otherwise only if one of the extra characters appears on either
        # side of the hyphen, or there is at least one Cyrillic letter
        # in the match (detected by ctol() changing the matched text).
        if (   unsafe
            or any(c in wchars for c in before + after)
            or ctol(m.group()) != m.group()
        ):
            return before + nobrhyp_char + after
        return m.group()

    def hook (text):

        # Quick check, is there any hyphen at all in the string?
        if "-" not in text:
            return text

        # Replace as long as the string changes, as there are situations
        # that the regexes will not catch in one pass (e.g. аб-вг-дђ):
        # a match consumes the letter after the hyphen, which may itself
        # be the start of the next hyphenated segment.
        while True:
            text_prev = text
            for nobrhyp_rx in nobrhyp_rxs:
                text = nobrhyp_rx.sub(nobrhyp_repl, text)
            if text == text_prev:
                return text

    return hook
0106