0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Constructors of syntagma derivators for trapnakron.
0005 
0006 Trapnakron -- transcriptions and translations of names and acronyms --
0007 is a collection of syntagma derivator definitions residing in
0008 C{pology/lang/sr/trapnakron/}.
0009 Its purpose is to support translation efforts in the Serbian language,
0010 where proper nouns and acronyms are frequently transcribed,
0011 and sometimes translated.
0012 For translators, it can be a manual reference, or even directly sourced
0013 in translated material (see below).
0014 For readers, it is a way to obtain original forms of transcribed and
0015 translated phrases.
0016 
0017 Trapnakron web pages are built based on trapnakron source in Pology.
0018 This makes links between original and localized forms readily
0019 available through internet search engines.
0020 Adding the C{trapnakron} or C{трапнакрон} keyword to the search phrase
0021 causes the relevant trapnakron page to appear among the top few hits,
0022 and the desired other form is already shown in the excerpt of the hit,
0023 so that it is not even necessary to follow the link.
0024 This frees translators from the burden of providing original forms
0025 in parentheses at the first mention (or by some similar method),
0026 and frees the text of the clutter which that would cause.
0027 
0028 While trapnakron definitions may be manually collected and imported into
0029 a basic L{Synder<pology.synder.Synder>} object, this module provides
0030 wrappers which free the user from this manual work, as well as appropriate
0031 transformation functions (C{*tf} parameters to the C{Synder} constructor)
0032 to produce various special behaviors on lookups.
0033 Trapnakron constructors are defined by type of textual material,
0034 e.g. for plain text or Docbook documentation.
0035 Documentation of each constructor states what special lookup behaviors
0036 will be available through C{Synder} objects created by it.
0037 
0038 For a short demonstration, consider this derivation of a person's name::
0039 
0040     钱学森, Qián Xuésēn, Tsien Hsue-shen: Ћен| Сјуесен|
0041 
0042 Suppose that a translator wants to source it directly in the text,
0043 rather than to manually copy the transcription (e.g. to avoid having
0044 to update the text should the transcription be modified in the future).
0045 The translator therefore writes, using XML entity syntax::
0046 
0047     ...пројектовању ракета &qianxuesen-g; привукле су идеје...
0048 
0049 where C{-g} denotes genitive case.
0050 This text can be easily processed into the final form (before going out
0051 to readers), using a script based on these few lines::
0052 
0053     >>> from pology.lang.sr.trapnakron import trapnakron_plain
0054     >>> from pology.resolve import resolve_entities_simple as resents
0055     >>> tp = trapnakron_plain()
0056     >>>
0057     >>> s = "...пројектовању ракета &qianxuesen-g; привукле су идеје..."
0058     >>> print(resents(s, tp))
0059     ...пројектовању ракета Ћена Сјуесена привукле су идеје...
0060     >>>
0061 
0062 
0063 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0064 @license: GPLv3
0065 """
0066 
0067 import os
0068 import re
0069 
0070 import pology
0071 from pology import PologyError, _, n_
0072 from pology.lang.sr.nobr import to_nobr_hyphens, nobrhyp_char
0073 from pology.lang.sr.wconv import ctol, cltoa
0074 from pology.lang.sr.wconv import hctoc, hctol, hitoe, hitoi, hctocl
0075 from pology.lang.sr.wconv import cltoh, tohi
0076 from pology.fsops import collect_files_by_ext
0077 from pology.normalize import identify, xentitize, simplify
0078 from pology.report import format_item_list
0079 from pology.resolve import first_to_upper
0080 from pology.synder import Synder
0081 
0082 
0083 # Allowed environment compositions, out of, in order:
0084 # Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, Ijekavian Latin.
0085 # 1 indicates environment present, 0 absent.
0086 _good_eicl_combos = set((
0087     "1000", "0100", "0010", "0001",
0088     "1100", "0011", "1010", "0101",
0089     "1111",
0090 ))
0091 
0092 # Elements for composing alternatives directives.
0093 _alt_sep_scr = "¦|/"
0094 _alt_sep_dlc = "¦|/"
0095 
0096 # Keywords of known target markups.
0097 _known_markups = (
0098     "plain",
0099     "xml",
0100     "docbook4",
0101 )
0102 
0103 # Tags found within people names (groups of synonyms).
0104 _pn_tag_first = ("i", "и")
0105 _pn_tag_last = ("p", "п")
0106 _pn_tag_middle = ("s", "с")
0107 _pn_all_tags = set(sum((_pn_tag_first, _pn_tag_last, _pn_tag_middle), ()))
0108 
0109 # Tag for derivations with unimportant keys.
0110 _nokey_tag = "x"
0111 
0112 # Disambiguation marker.
0113 _disamb_marker = "¤"
0114 
0115 # Enumeration of known derivation key suffixes, for modifying derived values.
0116 _suff_pltext = "_ot" # for "obican tekst"
0117 _suff_pltext_id = 10
0118 _suff_ltmarkup = "_lv" # for "laksa varijanta"
0119 _suff_ltmarkup_id = 20
0120 _suff_gnmatch_m = "_rm" # for "rod muski"
0121 _suff_gnmatch_m_id = 30
0122 _suff_gnmatch_z = "_rz" # for "rod zenski"
0123 _suff_gnmatch_z_id = 31
0124 _suff_gnmatch_s = "_rs" # for "rod srednji"
0125 _suff_gnmatch_s_id = 32
0126 _suff_gnmatch_u = "_ru" # for "rod muski zivi"
0127 _suff_gnmatch_u_id = 33
0128 _suff_gnmatch_mk = "_rmk" # for "rod muski mnozine"
0129 _suff_gnmatch_mk_id = 34
0130 _suff_gnmatch_zk = "_rzk" # for "rod zenski mnozine"
0131 _suff_gnmatch_zk_id = 35
0132 _suff_gnmatch_sk = "_rsk" # for "rod srednji mnozine"
0133 _suff_gnmatch_sk_id = 36
0134 _suff_gnmatch_uk = "_ruk" # for "rod muski zivi mnozine"
0135 _suff_gnmatch_uk_id = 37
0136 _gnmatch_suffs = [_suff_gnmatch_m, _suff_gnmatch_z,
0137                   _suff_gnmatch_s, _suff_gnmatch_u,
0138                   _suff_gnmatch_mk, _suff_gnmatch_zk,
0139                   _suff_gnmatch_sk, _suff_gnmatch_uk]
0140 _gnmatch_suff_ids = [_suff_gnmatch_m_id, _suff_gnmatch_z_id,
0141                      _suff_gnmatch_s_id, _suff_gnmatch_u_id,
0142                      _suff_gnmatch_mk_id, _suff_gnmatch_zk_id,
0143                      _suff_gnmatch_sk_id, _suff_gnmatch_uk_id]
0144 _gnmatch_suff_ids_set = set(_gnmatch_suff_ids)
0145 _gnmatch_suffs_genums = [
0146     (_suff_gnmatch_m_id, ("м", "m"), ("ј", "j")),
0147     (_suff_gnmatch_z_id, ("ж", "ž"), ("ј", "j")),
0148     (_suff_gnmatch_s_id, ("с", "s"), ("ј", "j")),
0149     (_suff_gnmatch_u_id, ("у", "u"), ("ј", "j")),
0150     (_suff_gnmatch_mk_id, ("м", "m"), ("к", "k")),
0151     (_suff_gnmatch_zk_id, ("ж", "ž"), ("к", "k")),
0152     (_suff_gnmatch_sk_id, ("с", "s"), ("к", "k")),
0153     (_suff_gnmatch_uk_id, ("у", "u"), ("к", "k")),
0154 ]
0155 _suff_systr = "_s" # for "sistemska transkripcija"
0156 _suff_systr_id = 40
0157 _systr_ksuff_esuff = (_suff_systr, "сист")
0158 _suff_altdv1 = "_a" # for "alternativno izvodjenje"
0159 _suff_altdv1_id = 50
0160 _suff_altdv2 = "_a2" # second alternative
0161 _suff_altdv2_id = 51
0162 _suff_altdv3 = "_a3" # third alternative
0163 _suff_altdv3_id = 52
0164 _altdv_ksuffs_esuffs = [
0165     (_suff_altdv1, "алт"),
0166     (_suff_altdv2, "алт2"),
0167     (_suff_altdv3, "алт3"),
0168 ]
0169 _aenv_suff_ids = [_suff_systr_id, # order of elements significant
0170                   _suff_altdv1_id, _suff_altdv2_id, _suff_altdv3_id]
0171 _aenv_suff_ids_set = set(_aenv_suff_ids)
0172 _suff_pname_f = "_im" # for "ime"
0173 _suff_pname_f_id = 60
0174 _suff_pname_l = "_pr" # for "prezime"
0175 _suff_pname_l_id = 61
0176 _pname_suffs = [_suff_pname_f, _suff_pname_l]
0177 _pname_suff_ids = [_suff_pname_f_id, _suff_pname_l_id]
0178 _pname_suff_ids_set = set(_pname_suff_ids)
0179 
0180 
0181 def trapnakron (envec="", envel="л", envic="иј", envil="ијл",
0182                 markup="plain", tagmap=None,
0183                 ptsuff=None, ltsuff=None, gnsuff=None,
0184                 stsuff=None, adsuff=None, nmsuff=None,
0185                 npkeyto=None, nobrhyp=False, disamb="",
0186                 runtime=False):
0187     """
0188     Main trapnakron constructor, covering all options.
0189 
0190     The trapnakron constructor sets, either by default or optionally,
0191     various transformations to enhance queries to the resulting derivator.
0192 
0193     Default behavior
0194     ================
0195 
0196     Property values are returned as alternatives/hybridized compositions of
0197     Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, and Ijekavian Latin
0198     forms, as applicable.
0199     Any of these forms can be excluded from derivation by setting
0200     its C{env*} parameter to C{None}.
0201     C{env*} parameters can also be used to change the priority environment
0202     from which the particular form is derived.
0203 
0204     Derivation and property key separator in compound keys is
0205     the ASCII hyphen (C{-}).
0206 
0207     Derivation keys are derived from syntagmas by applying
0208     the L{identify()<normalize.identify>} function.
0209     In derivations where this will result in strange keys,
0210     additional keys should be defined through hidden syntagmas.
0211     Property keys are transliterated into
0212     L{stripped-ASCII<lang.sr.wconv.cltoa>}.
0213 
0214     Conflict resolution for derivation keys is not strict
0215     (see L{derivator constructor<synder.Synder.__init__>}).
0216 
0217     Optional behavior
0218     =================
0219 
0220     Instead of plain text, properties may be reported with some markup.
0221     The markup type is given by C{markup} parameter, and can be one of
0222     C{"plain"}, C{"xml"}, C{"docbook4"}.
0223     The C{tagmap} parameter contains mapping of derivation keys
0224     to tags which should wrap properties of these derivations.
0225 
0226     Derivation keys can have several suffixes which affect how
0227     the properties are reported:
0228       - Presence of the suffix given by C{ptsuff} parameter signals that
0229         properties should be forced to plain text, if another markup is
0230         globally in effect.
0231       - Parameter C{ltsuff} states the suffix which produces lighter version
0232         of the markup, where applicable (e.g. people names in Docbook).
0233       - When fetching a property within a sentence (with keys given e.g.
0234         as XML entities), sentence construction may require that
0235         the resolved value is of certain gender and number; parameter C{gnsuff}
0236         can be used to provide a tuple of 4 suffixes for gender in singular
0237         and 4 suffixes for gender in plural,
0238         such that the property will resolve only if the value of
0239         gender and number matches the gender and number suffix.
0240       - Parameters C{stsuff} and C{adsuff} provide suffixes through
0241         which systematic transcription and alternative derivations
0242         are requested.
0243         They are actually tuples, where the first element is the key suffix,
0244         and the second element the suffix to primary environment
0245         which produces the systematic/alternative environment.
0246         C{adsuff} can also be a tuple of tuples, if several alternative
0247         derivations should be reachable.
0248       - In case the entry is a person's name with tagged first and last name,
0249         parameter C{nmsuff} can provide a tuple of 2 suffixes by which
0250         only the first or the last name is requested, respectively.
0251 
0252     Ordinary hyphens may be converted into non-breaking hyphens
0253     by setting the C{nobrhyp} parameter to C{True}.
0254     Non-breaking hyphens are added heuristically, see
0255     the L{to_nobr_hyphens()<lang.sr.nobr.to_nobr_hyphens>} hook.
0256     Useful e.g. to avoid wrapping on hyphen-separated case endings.
0257 
0258     A property key normally cannot be empty, but the C{npkeyto} parameter
0259     can be used to automatically substitute another property key
0260     when an empty property key is seen in a request for properties.
0261     In the simpler version, the value of C{npkeyto} is just a string,
0262     the key to substitute for the empty one.
0263     In the more complex version, the value is a tuple containing
0264     the key to substitute and the list of two or more supplemental
0265     property keys: empty key is replaced only if all supplemental
0266     property values exist and are equal (see e.g. L{trapnakron_plain}
0267     for usage of this).
0268 
0269     Some property values may have been manually decorated with
0270     disambiguation markers (C{¤}), to differentiate them from
0271     property values of another derivation which would otherwise appear
0272     equal under a certain normalization.
0273     By default such markers are removed, but instead they
0274     can be substituted with a string given by C{disamb} parameter.
0275 
0276     Some derivations are defined only for purposes of obtaining
0277     their properties in scripted translations at runtime.
0278     They are by default not included, but can be by setting
0279     the C{runtime} parameter to C{True}.
0280 
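         For a quick illustration, here is a minimal lookup session
         (a sketch only: the derivation key C{qianxuesen} and the property
         keys C{n} and C{g} are assumed to exist, and the exact hybridized
         strings depend on the current derivation files)::

             >>> from pology.lang.sr.trapnakron import trapnakron
             >>> tp = trapnakron()
             >>> nom = tp.get2("qianxuesen", "n") # nominative, hybridized
             >>> gen = tp.get2("qianxuesen", "g") # genitive, hybridized
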
0281     @param envec: primary environment for Ekavian Cyrillic derivation
0282     @type envec: string or C{None}
0283     @param envel: primary environment for Ekavian Latin derivation
0284     @type envel: string or C{None}
0285     @param envic: primary environment for Ijekavian Cyrillic derivation
0286     @type envic: string or C{None}
0287     @param envil: primary environment for Ijekavian Latin derivation
0288     @type envil: string or C{None}
0289     @param markup: target markup
0290     @type markup: string
0291     @param tagmap: tags to assign to properties by derivation keys
0292     @type tagmap: dict string -> string
0293     @param ptsuff: derivation key suffix to report plain text properties
0294     @type ptsuff: string
0295     @param ltsuff: derivation key suffix to report properties in lighter markup
0296     @type ltsuff: string
0297     @param gnsuff: suffixes by gender and number, such that there is
0298         no resolution if the gender or number does not match
0299     @type gnsuff: [string*]
0300     @param stsuff: derivation key and environment name suffixes
0301         to report systematic transcriptions
0302     @type stsuff: (string, string)
0303     @param adsuff: derivation key and environment name suffixes
0304         to report alternative derivations
0305     @type adsuff: (string, string) or ((string, string)*)
0306     @param nmsuff: suffixes for fetching only first or last name of a person
0307     @type nmsuff: (string, string)
0308     @param npkeyto: property key to substitute for empty key, when given
0309     @type npkeyto: string or (string, [string*])
0310     @param nobrhyp: whether to convert some ordinary hyphens into non-breaking ones
0311     @type nobrhyp: bool
0312     @param disamb: string to replace each disambiguation marker with
0313     @type disamb: string
0314     @param runtime: whether to include runtime-only derivations
0315     @type runtime: bool
0316 
0317     @returns: trapnakron derivator
0318     @rtype: L{Synder<synder.Synder>}
0319     """
0320 
0321     env0s = [envec, envel, envic, envil]
0322     combo = "".join([(x is not None and "1" or "0") for x in env0s])
0323     if combo not in _good_eicl_combos:
0324         raise PologyError(
0325             _("@info",
0326               "Invalid combination of Ekavian/Ijekavian Cyrillic/Latin "
0327               "environments to trapnakron derivator."))
0328 
0329     if markup not in _known_markups:
0330         raise PologyError(
0331             _("@info",
0332               "Unknown markup type '%(mtype)s' to trapnakron derivator "
0333               "(known markups: %(mtypelist)s).",
0334               mtype=markup, mtypelist=format_item_list(_known_markups)))
0335 
0336     # Compose environment fallback chains.
0337     env = []
0338     envprops = [] # [(islatin, isije)*]
0339     vd = lambda e, d: e if e is not None else d
0340     if envec is not None:
0341         env.append((envec,))
0342         envprops.append((False, False))
0343     if envel is not None:
0344         env.append((envel, vd(envec, "")))
0345         envprops.append((True, False))
0346     if envic is not None:
0347         env.append((envic, vd(envec, "")))
0348         envprops.append((False, True))
0349     if envil is not None:
0350         env.append((envil, vd(envel, "л"), vd(envic, "иј"), vd(envec, "")))
0351         envprops.append((True, True))
0352 
0353     # Set up requests by derivation key suffix.
0354     mvends = {}
0355     if ptsuff:
0356         mvends[ptsuff] = _suff_pltext_id
0357     if ltsuff:
0358         mvends[ltsuff] = _suff_ltmarkup_id
0359     if gnsuff:
0360         if len(gnsuff) != 8:
0361             raise PologyError(
0362                 _("@info",
0363                   "Sequence of gender-number suffixes must have "
0364                   "exactly 8 elements."))
0365         mvends.update(list(zip(gnsuff, _gnmatch_suff_ids)))
0366     aenvs = {}
0367     if adsuff or stsuff:
0368         kesuffs = [] # must have same order as _aenv_suff_ids
0369         if stsuff is not None:
0370             kesuffs.append(stsuff)
0371         if adsuff is not None and not isinstance(adsuff[0], tuple):
0372             kesuffs.append(adsuff)
0373         elif adsuff is not None:
0374             kesuffs.extend(adsuff)
0375         for (ksuff, esuff), suff_id in zip(kesuffs, _aenv_suff_ids):
0376             mvends[ksuff] = suff_id
0377             # Compose environment fallback chain for this suffix.
0378             aenv = []
0379             for env1 in env:
0380                 aenv1 = []
0381                 for esuff1 in (esuff, ""):
0382                     for env0 in env1:
0383                         aenv1.append(env0 + esuff1)
0384                 aenv.append(tuple(aenv1))
0385             aenvs[suff_id] = tuple(aenv)
0386     if nmsuff:
0387         if len(nmsuff) != 2:
0388             raise PologyError(
0389                 _("@info",
0390                   "Sequence of person name suffixes must have "
0391                   "exactly 2 elements."))
0392         mvends.update(list(zip(nmsuff, _pname_suff_ids)))
0393 
0394     # Set up substitution of empty property keys.
0395     expkeys = []
0396     if isinstance(npkeyto, tuple):
0397         npkeyto, expkeys = npkeyto
0398 
0399     # Create transformators.
0400     dkeytf = _sd_dkey_transf(mvends, tagmap)
0401     pkeytf = _sd_pkey_transf(npkeyto, expkeys)
0402     pvaltf = _sd_pval_transf(envprops, markup, nobrhyp, disamb)
0403     ksyntf = _sd_ksyn_transf(markup, False, disamb)
0404     envtf = _sd_env_transf(aenvs)
0405 
0406     # Build the derivator.
0407     sd = Synder(env=env,
0408                 ckeysep="-",
0409                 dkeytf=dkeytf, dkeyitf=identify,
0410                 pkeytf=pkeytf, pkeyitf=norm_pkey,
0411                 pvaltf=pvaltf, ksyntf=ksyntf,
0412                 envtf=envtf,
0413                 strictkey=False)
0414 
0415     # Collect synder files composing the trapnakron.
0416     sdfiles = _get_trapnakron_files(runtime)
0417 
0418     # Import into derivator.
0419     for sdfile in sdfiles:
0420         sd.import_file(sdfile)
0421 
0422     return sd
0423 
0424 
0425 def rootdir ():
0426     """
0427     Get root directory to trapnakron derivation files.
0428 
0429     @returns: root directory path
0430     @rtype: string
0431     """
0432 
0433     return os.path.join(pology.datadir(), "lang", "sr", "trapnakron")
0434 
0435 
0436 def _get_trapnakron_files (runtime=False):
0437 
0438     root = rootdir()
0439     files = collect_files_by_ext(root, ["sd"], recurse=False)
0440     if runtime:
0441         rtroot = os.path.join(root, "runtime")
0442         rtfiles = collect_files_by_ext(rtroot, ["sd"], recurse=False)
0443         files.extend(rtfiles)
0444 
0445     return files
0446 
0447 
0448 def trapnakron_plain (envec="", envel="л", envic="иј", envil="ијл"):
0449     """
0450     Constructs trapnakron suitable for application to plain text.
0451 
0452     Calls L{trapnakron} with the following setup:
0453 
0454       - Markup is plain text (C{plain}).
0455 
0456       - Suffixes: C{_rm} ("rod muski") for resolving the property value only
0457         if it is of masculine gender, C{_rz} for feminine, C{_rs} for neuter;
0458         C{_s} for systematic transcription, C{_a}, C{_a2}, C{_a3} for other
0459         alternatives; C{_im} and C{_pr} for first and last name (example below).
0460 
0461       - Ordinary hyphens are heuristically replaced with non-breaking hyphens.
0462 
0463       - Empty property key is converted into C{am} (accusative masculine
0464         descriptive adjective), provided that it is equal to C{gm}
0465         (genitive masculine descriptive adjective);
0466         i.e. if the descriptive adjective is invariable.
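
         For example, resolving an entity with the gender-match suffix
         (a sketch: the derivation key C{firefox} is assumed to exist
         and to be of masculine gender)::

             >>> from pology.resolve import resolve_entities_simple as resents
             >>> tp = trapnakron_plain()
             >>> s = "...нова верзија &firefox_rm-g;..."
             >>> t = resents(s, tp) # resolves only if the gender matches
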
0467     """
0468 
0469     return trapnakron(
0470         envec, envel, envic, envil,
0471         markup="plain",
0472         gnsuff=_gnmatch_suffs,
0473         stsuff=_systr_ksuff_esuff,
0474         adsuff=_altdv_ksuffs_esuffs,
0475         nmsuff=_pname_suffs,
0476         npkeyto=("am", ("am", "gm")),
0477         nobrhyp=True,
0478     )
0479 
0480 
0481 def trapnakron_ui (envec="", envel="л", envic="иј", envil="ијл"):
0482     """
0483     Constructs trapnakron suitable for application to UI texts.
0484 
0485     Like L{trapnakron_plain}, except that disambiguation markers
0486     are not removed but substituted with an invisible character,
0487     and runtime-only derivations are included too.
0488 
0489     Retaining disambiguation markers is useful when a normalized form
0490     (typically nominative) is used at runtime as a key to fetch
0491     other properties of the derivation, and the normalization is such
0492     that it would fold two different derivations to the same key
0493     if the originating forms were left undecorated.
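
         For example, fetching a nominative form and deriving a runtime key
         from it (a sketch: the derivation key C{somekey} and the property
         key C{n} are stand-ins)::

             >>> from pology.lang.sr.trapnakron import trapnakron_ui, norm_rtkey
             >>> tp = trapnakron_ui()
             >>> nom = tp.get2("somekey", "n") # markers retained as U+2060
             >>> rtkey = norm_rtkey(nom) # stays distinct from look-alike forms
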
0494     """
0495 
0496     return trapnakron(
0497         envec, envel, envic, envil,
0498         markup="plain",
0499         gnsuff=_gnmatch_suffs,
0500         stsuff=_systr_ksuff_esuff,
0501         adsuff=_altdv_ksuffs_esuffs,
0502         nmsuff=_pname_suffs,
0503         npkeyto=("am", ("am", "gm")),
0504         nobrhyp=True,
0505         disamb="\u2060",
0506         runtime=True,
0507     )
0508 
0509 
0510 def trapnakron_docbook4 (envec="", envel="л", envic="иј", envil="ијл",
0511                          tagmap=None):
0512     """
0513     Constructs trapnakron suitable for application to Docbook 4 texts.
0514 
0515     Calls L{trapnakron} with the following setup:
0516 
0517       - Markup is Docbook 4 (C{docbook4}).
0518 
0519       - Suffixes: C{_ot} ("obican tekst") for plain-text properties,
0520         C{_lv} ("laksa varijanta") for a lighter variant of the markup
0521         (see the example below the list). Lighter markup currently applies
0522         to people names (no outer C{<personname>}, e.g. when it should be
0523         elided due to particular text segmentation on Docbook->PO extraction).
0524         The suffixes of L{trapnakron_plain} are also available.
0525 
0526       - Non-breaking hyphens and empty property keys
0527         are treated like in L{trapnakron_plain}.
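
         For example, fetching a person's name with and without the outer
         C{<personname>} wrapper (a sketch: the derivation key C{qianxuesen}
         and the nominative property key C{n} are assumed)::

             >>> tp = trapnakron_docbook4()
             >>> full = tp.get2("qianxuesen", "n") # with <personname> wrapper
             >>> light = tp.get2("qianxuesen_lv", "n") # inner name tags only
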
0528     """
0529 
0530     return trapnakron(
0531         envec, envel, envic, envil,
0532         markup="docbook4",
0533         tagmap=tagmap,
0534         ptsuff=_suff_pltext,
0535         ltsuff=_suff_ltmarkup,
0536         gnsuff=_gnmatch_suffs,
0537         stsuff=_systr_ksuff_esuff,
0538         adsuff=_altdv_ksuffs_esuffs,
0539         nmsuff=_pname_suffs,
0540         npkeyto=("am", ("am", "gm")),
0541         nobrhyp=True,
0542         runtime=True, # needed for resolution of UI references
0543     )
0544 
0545 
0546 # Transformation for derivation keys:
0547 # - lowercase first letter if upper-case, and indicate value uppercasing
0548 # - strip special suffixes and indicate value modifications based on them
0549 def _sd_dkey_transf (suffspec, tagmap):
0550 
0551     def transf (dkey, sd):
0552 
0553         # Whether to uppercase the first letter of properties.
0554         fcap = dkey[0:1].isupper()
0555         if fcap:
0556             dkey = dkey[0].lower() + dkey[1:]
0557 
0558         # Collect and strip all known special suffixes.
0559         found_suff_ids = set()
0560         while True:
0561             plen_suff_ids = len(found_suff_ids)
0562             for suff, suff_id in list(suffspec.items()):
0563                 if dkey.endswith(suff):
0564                     dkey = dkey[:-len(suff)]
0565                     found_suff_ids.add(suff_id)
0566             if len(found_suff_ids) == plen_suff_ids:
0567                 break
0568 
0569         # Tag which wraps the property values of this derivation.
0570         tag = tagmap.get(dkey) if tagmap else None
0571 
0572         # Whether to use plain text instead of markup, where applicable.
0573         pltext = _suff_pltext_id in found_suff_ids
0574 
0575         # Whether to use lighter variant of the markup, where applicable.
0576         ltmarkup = _suff_ltmarkup_id in found_suff_ids
0577 
0578         # Whether the gender and number are matching.
0579         if _gnmatch_suff_ids_set.intersection(found_suff_ids):
0580             gstr = sd.get2(dkey, "_rod")
0581             nstr = sd.get2(dkey, "_broj", "j")
0582             genders = list(set(map(ctol, hctocl(gstr)))) if gstr else []
0583             numbers = list(set(map(ctol, hctocl(nstr)))) if nstr else []
0584             if (   not (len(genders) == 1) or not (len(numbers) == 1)
0585                 or not all([(   x[0] not in found_suff_ids
0586                              or (genders[0] in x[1] and numbers[0] in x[2]))
0587                             for x in _gnmatch_suffs_genums])
0588             ):
0589                 dkey = None
0590 
0591         # Whether to use one of alternative environments.
0592         esuffid = None
0593         found_aenv_suff_ids = _aenv_suff_ids_set.intersection(found_suff_ids)
0594         if found_aenv_suff_ids:
0595             esuffid = tuple(found_aenv_suff_ids)[0]
0596 
0597         # Whether to select only first or last name (persons).
0598         nsuffid = None
0599         found_pname_suff_ids = _pname_suff_ids_set.intersection(found_suff_ids)
0600         if found_pname_suff_ids:
0601             nsuffid = tuple(found_pname_suff_ids)[0]
0602 
0603         return dkey, fcap, tag, ltmarkup, pltext, esuffid, nsuffid
0604 
0605     return transf, "self"
0606 
0607 
0608 # Transformation for property keys:
0609 # - try to convert empty into non-empty key
0610 def _sd_pkey_transf (npkeyto, npkey_eqpkeys):
0611 
0612     def transf (pkey, dkey, sd):
0613 
0614         # If key not empty, return it as-is.
0615         if pkey:
0616             return pkey
0617 
0618         # Empty ending allowed if all properties requested
0619         # by supplementary keys are both existing and equal.
0620         # In that case, report the indicated key instead of empty.
0621         alleq = True
0622         ref_pval = None
0623         for tpkey in npkey_eqpkeys:
0624             pval = sd.get2(dkey, tpkey)
0625             if pval is None:
0626                 alleq = False
0627                 break
0628             if ref_pval is None:
0629                 ref_pval = pval
0630             elif ref_pval != pval:
0631                 alleq = False
0632                 break
0633         if alleq:
0634             return npkeyto
0635         else:
0636             return pkey
0637 
0638     return transf, "dkey", "self"
0639 
0640 
0641 # Transformation for property values:
0642 # - capitalize on request from key processing
0643 # - add tags on request from key processing
0644 # - optionally replace ordinary with no-break hyphens
0645 # - resolve known taggings according to selected markup
0646 # - add outer tags according to selected markup
0647 # - replace disambiguation markers with invisible characters
0648 # - construct hybridized forms out of multiple values
0649 # If the property key starts with underscore, capitalization, tagging and markup are skipped.
0650 def _sd_pval_transf (envprops, markup, nobrhyp, disamb):
0651 
0652     def transf (mtsegs, pkey, dkrest, sd):
0653 
0654         fcap, tag, ltmarkup, pltext, d5, nsuffid = dkrest
0655         if pkey.startswith("_"):
0656             fcap = False
0657             tag = None
0658             pltext = True
0659 
0660         pvals = []
0661         for tsegs, (islatin, isije) in zip(mtsegs, envprops):
0662             if tsegs is None:
0663                 return None
0664             pval1 = _compose_text(tsegs, markup, nobrhyp, disamb,
0665                                   fcap, tag, ltmarkup, pltext, nsuffid,
0666                                   pkey, islatin)
0667             if pval1 is None:
0668                 return None
0669             pvals.append(pval1)
0670 
0671         pval = _hybridize(envprops, pvals)
0672 
0673         return pval
0674 
0675     return transf, "pkey", "dkrest", "self"
0676 
0677 
0678 # Transformation for derivation syntagmas.
0679 # Like for property value transformation,
0680 # except for alternatives/hybridization.
0681 def _sd_ksyn_transf (markup, nobrhyp, disamb):
0682 
0683     def transf (tsegs, dkrest, sd):
0684 
0685         fcap, tag, ltmarkup, pltext, d5, nsuffid = dkrest
0686 
0687         ksyn = _compose_text(tsegs, markup, nobrhyp, disamb,
0688                              fcap, tag, ltmarkup, pltext, nsuffid)
0689 
0690         return ksyn
0691 
0692     return transf, "dkrest", "self"
0693 
0694 
0695 # Transformation for derivation environments.
0696 # Returns a non-default environment on request from keys processing.
0697 def _sd_env_transf (aenvs):
0698 
0699     def transf (env, dkrest):
0700 
0701         d1, d2, d3, d4, esuffid, d6 = dkrest
0702 
0703         if esuffid is not None:
0704             return aenvs[esuffid]
0705         else:
0706             return env
0707 
0708     return transf, "dkrest"
0709 
0710 
0711 def _compose_text (tsegs, markup, nobrhyp, disamb,
0712                    fcap, tag, ltmarkup, pltext, nsuffid,
0713                    pkey=None, tolatin=False):
0714 
0715     # Tagging and escaping.
0716     tagsubs="%(v)s"
0717     vescape = None
0718     if markup in ("xml", "docbook4"):
0719         tagsubs = "<%(t)s>%(v)s</%(t)s>"
0720         vescape = xentitize
0721 
0722     # All unique tags to current segments.
0723     atags = set(sum([x[1] for x in tsegs], []))
0724 
0725     if atags.intersection(_pn_all_tags):
0726         # A person name.
0727         markup_mod = markup if not pltext else "plain"
0728         text = _compose_person_name(tsegs, fcap, markup_mod, ltmarkup, nsuffid,
0729                                     pkey)
0730     else:
0731         # Ordinary derivations.
0732         text = simplify("".join([x[0] for x in tsegs]))
0733         if _nokey_tag in atags and " " in text: # before anything else
0734             text = text[text.find(" "):].lstrip()
0735         if fcap: # before adding outer tags
0736             text = first_to_upper(text)
0737         if vescape: # before adding outer tags
0738             text = vescape(text)
0739         if tag and not pltext:
0740             text = tagsubs % dict(t=tag, v=text)
0741 
0742     if text is None:
0743         return None
0744 
0745     text = text.replace(_disamb_marker, disamb or "")
0746     if nobrhyp: # before conversion to Latin
0747         text = to_nobr_hyphens(unsafe=True)(text)
0748     if tolatin:
0749         text = ctol(text)
0750 
0751     return text
0752 
0753 
0754 # Combine Ekavian/Ijekavian Cyrillic/Latin forms
0755 # into hybrid Ijekavian Cyrillic text.
0756 def _hybridize (envprops, pvals):
0757 
0758     if len(envprops) == 4: # different scripts and dialects
0759         cvalc = tohi(pvals[0], pvals[2], delims=_alt_sep_dlc)
0760         cvall = tohi(pvals[1], pvals[3], delims=_alt_sep_dlc)
0761         if ctol(cvalc) != cvall:
0762             cval = cltoh(cvalc, cvall, delims=_alt_sep_scr, full=True)
0763         else:
0764             cval = cvalc
0765     elif len(envprops) == 2:
0766         if envprops[0][0] == envprops[1][0]: # different dialects
0767             cval = tohi(pvals[0], pvals[1], delims=_alt_sep_dlc)
0768         else: # different scripts
0769             cval = cltoh(pvals[0], pvals[1], delims=_alt_sep_scr, full=True)
0770     else:
0771         cval = pvals[0]
0772 
0773     return cval
0774 
0775 
0776 # Convert tagged person name into destination markup.
0777 def _compose_person_name (tsegs, fcap, markup, light, nsuffid, pkey):
0778 
0779     # Reduce the name to one of its elements if requested.
0780     # If the reduction results in empty string, revert to full name.
0781     upperlast = False
0782     if nsuffid is not None:
0783         ntsegs = []
0784         for seg, tags in tsegs:
0785             tag = tags[0] if len(tags) > 0 else None
0786             if (   (tag in _pn_tag_first and nsuffid == _suff_pname_f_id)
0787                 or (tag in _pn_tag_last and nsuffid == _suff_pname_l_id)
0788             ):
0789                 ntsegs.append((seg, tags))
0790         if "".join([seg for seg, tags in ntsegs]).strip():
0791             tsegs = ntsegs
0792             # Take care to uppercase title to last name ("von", "al", etc.)
0793             # if last name alone is selected.
0794             upperlast = nsuffid == _suff_pname_l_id
0795     # Otherwise, if the requested property is of special type,
0796     # cancel the derivation if full name contains several name elements.
0797     # FIXME: Actually do this once decided how the client should supply
0798     # the test for special keys.
0799     elif False: #pkey and len(pkey) > 2:
0800         seentags = set()
0801         for seg, tags in tsegs:
0802             if not seg.strip():
0803                 continue
0804             tag = tags[0] if len(tags) > 0 else None
0805             if tag in _pn_tag_first:
0806                 seentags.add(_pn_tag_first[0])
0807             elif tag in _pn_tag_last:
0808                 seentags.add(_pn_tag_last[0])
0809             elif tag in _pn_tag_middle:
0810                 seentags.add(_pn_tag_middle[0])
0811             else:
0812                 seentags.add(None)
0813         if len(seentags) > 1:
0814             return None
0815 
0816     if markup == "docbook4":
0817         name_segs = []
0818         for seg, tags in tsegs:
0819             seg = xentitize(seg).strip()
0820             if not seg:
0821                 continue
0822             tag = tags[0] if len(tags) > 0 else None
0823             if tag in _pn_tag_first:
0824                 name_segs.append(" <firstname>%s</firstname>" % seg)
0825             elif tag in _pn_tag_last:
0826                 if upperlast:
0827                     seg = seg[0].upper() + seg[1:]
0828                     upperlast = False
0829                 name_segs.append(" <surname>%s</surname>" % seg)
0830             elif tag in _pn_tag_middle:
0831                 name_segs.append(" <othername>%s</othername>" % seg)
0832             else: # untagged
0833                 name_segs.append(" %s" % seg)
0834         name = "".join(name_segs).strip()
0835         if not light:
0836             name = "<personname>%s</personname>" % name
0837 
0838     else:
0839         name = simplify("".join([seg for seg, tags in tsegs]))
0840         if upperlast:
0841             name = name[0].upper() + name[1:]
0842 
0843     return name
0844 
0845 
0846 def norm_pkey (pkey):
0847     """
0848     Normalize internal property keys in trapnakron.
0849 
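         For example (a sketch, assuming the usual Cyrillic property keys
         of the derivation files)::

             >>> norm_pkey("г")        # -> "g"
             >>> norm_pkey(("н", "г")) # -> ("n", "g")
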
0850     @param pkey: property key or keys to normalize
0851     @type pkey: string or (string*) or [string*]
0852 
0853     @returns: normalized keys
0854     @rtype: as input
0855     """
0856 
0857     if isinstance(pkey, str):
0858         return cltoa(pkey)
0859     elif isinstance(pkey, tuple):
0860         return tuple(map(cltoa, pkey))
0861     elif isinstance(pkey, list):
0862         return list(map(cltoa, pkey))
0863     else:
0864         raise PologyError(
0865             _("@info",
0866               "Normalization of property keys requested "
0867               "on unsupported data type '%(type)s'.",
0868               type=type(pkey)))
0869 
0870 
0871 _norm_rtkey_rx = re.compile(r"\s", re.U)
0872 
0873 def norm_rtkey (text):
0874     """
0875     Normalize text into runtime key for translation scripting.
0876 
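         For example (whitespace is removed and the text lowercased)::

             >>> norm_rtkey("Ћен Сјуесен")
             'ћенсјуесен'
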
0877     @param text: text to normalize into runtime key
0878     @type text: string
0879 
0880     @returns: runtime key
0881     @rtype: string
0882     """
0883 
0884     return _norm_rtkey_rx.sub("", text).lower()
0885