# File indexing completed on 2024-05-12 05:47:03
0001 # -*- coding: UTF-8 -*- 0002 0003 """ 0004 Constructors of syntagma derivators for trapnakron. 0005 0006 Trapnakron -- transcriptions and translation of names and acronyms -- 0007 is a collection of syntagma derivator definitions residing in 0008 C{pology/lang/sr/trapnakron/}. 0009 Its purpose is to support translation efforts in Serbian language, 0010 where proper nouns and acronyms are frequently transcribed, 0011 and sometimes translated. 0012 For translators, it can be a manual reference, or even directly sourced 0013 in translated material (see below). 0014 For readers, it is a way to obtain original forms of transcribed and 0015 translated phrases. 0016 0017 Trapnakron web pages are built based on trapnakron source in Pology. 0018 This makes links between original and localized forms readily 0019 available through internet search engines. 0020 Adding C{trapnakron} or C{трапнакрон} keyword to the search phrase 0021 causes the relevant trapnakron page to appear within top few hits, 0022 and the desired other form will be shown already in the excerpt of the hit, 0023 such that is not even necessary to follow it. 0024 This frees translators from the burden of providing original forms 0025 in parenthesis to the first mentioning (or some similar method), 0026 and frees the text of the clutter caused by this. 0027 0028 While trapnakron definitions may be manually collected and imported into 0029 a basic L{Synder<pology.synder.Synder>} object, this module provides 0030 wrappers which free the user of this manual work, as well as appropriate 0031 transformation functions (C{*tf} parameters to C{Synder} constructor) 0032 to produce various special behaviors on lookups. 0033 Trapnakron constructors are defined by type of textual material, 0034 e.g. for plain text or Docbook documentation. 0035 Documentation of each constructor states what special lookup behaviors 0036 will be available through C{Synder} objects created by it. 
0037 0038 For a short demonstration, consider this derivation of a person's name:: 0039 0040 钱学森, Qián Xuésēn, Tsien Hsue-shen: Ћен| Сјуесен| 0041 0042 Suppose that a translator wants to source it directly in the text, 0043 rather than to manually copy the transcription (e.g. to avoid having 0044 to update the text should the transcription be modified in the future). 0045 The translator therefore writes, using XML entity syntax:: 0046 0047 ...пројектовању ракета &qianxuesen-g; привукле су идеје... 0048 0049 where C{-g} denotes genitive case. 0050 This text can be easily processed into the final form (before going out 0051 to readers), using a script based on these few lines:: 0052 0053 >>> from pology.lang.sr.trapnakron import trapnakron_plain 0054 >>> from pology.resolve import resolve_entities_simple as resents 0055 >>> tp = trapnakron_plain() 0056 >>> 0057 >>> s = "...пројектовању ракета &qianxuesen-g; привукле су идеје..." 0058 >>> print resents(s, tp) 0059 ...пројектовању ракета Ћена Сјуесена привукле су идеје... 0060 >>> 0061 0062 0063 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net> 0064 @license: GPLv3 0065 """ 0066 0067 import os 0068 import re 0069 0070 import pology 0071 from pology import PologyError, _, n_ 0072 from pology.lang.sr.nobr import to_nobr_hyphens, nobrhyp_char 0073 from pology.lang.sr.wconv import ctol, cltoa 0074 from pology.lang.sr.wconv import hctoc, hctol, hitoe, hitoi, hctocl 0075 from pology.lang.sr.wconv import cltoh, tohi 0076 from pology.fsops import collect_files_by_ext 0077 from pology.normalize import identify, xentitize, simplify 0078 from pology.report import format_item_list 0079 from pology.resolve import first_to_upper 0080 from pology.synder import Synder 0081 0082 0083 # Allowed environment compositions, out of, in order: 0084 # Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, Ijekavian Latin. 0085 # 1 indicates environment present, 0 absent. 
# Allowed environment compositions, out of, in order:
# Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, Ijekavian Latin.
# 1 indicates environment present, 0 absent.
_good_eicl_combos = set((
    "1000", "0100", "0010", "0001",
    "1100", "0011", "1010", "0101",
    "1111",
))

# Elements for composing alternatives directives.
_alt_sep_scr = "¦|/"  # separators for script alternatives
_alt_sep_dlc = "¦|/"  # separators for dialect alternatives

# Keywords of known target markups.
_known_markups = (
    "plain",
    "xml",
    "docbook4",
)

# Tags found within people names (groups of synonyms).
_pn_tag_first = ("i", "и")   # first name ("ime")
_pn_tag_last = ("p", "п")    # last name ("prezime")
_pn_tag_middle = ("s", "с")  # middle name ("srednje ime")
_pn_all_tags = set(sum((_pn_tag_first, _pn_tag_last, _pn_tag_middle), ()))

# Tag for derivations with unimportant keys.
_nokey_tag = "x"

# Disambiguation marker.
_disamb_marker = "¤"

# Enumeration of known derivation key suffixes, for modifying derived values.
_suff_pltext = "_ot" # for "obican tekst" (plain text)
_suff_pltext_id = 10
_suff_ltmarkup = "_lv" # for "laksa varijanta" (lighter variant)
_suff_ltmarkup_id = 20
_suff_gnmatch_m = "_rm" # for "rod muski" (masculine)
_suff_gnmatch_m_id = 30
_suff_gnmatch_z = "_rz" # for "rod zenski" (feminine)
_suff_gnmatch_z_id = 31
_suff_gnmatch_s = "_rs" # for "rod srednji" (neuter)
_suff_gnmatch_s_id = 32
_suff_gnmatch_u = "_ru" # for "rod muski zivi" (masculine animate)
_suff_gnmatch_u_id = 33
_suff_gnmatch_mk = "_rmk" # for "rod muski mnozine" (masculine plural)
_suff_gnmatch_mk_id = 34
_suff_gnmatch_zk = "_rzk" # for "rod zenski mnozine" (feminine plural)
_suff_gnmatch_zk_id = 35
_suff_gnmatch_sk = "_rsk" # for "rod srednji mnozine" (neuter plural)
_suff_gnmatch_sk_id = 36
_suff_gnmatch_uk = "_ruk" # for "rod muski zivi mnozine" (masc. animate plural)
_suff_gnmatch_uk_id = 37
_gnmatch_suffs = [_suff_gnmatch_m, _suff_gnmatch_z,
                  _suff_gnmatch_s, _suff_gnmatch_u,
                  _suff_gnmatch_mk, _suff_gnmatch_zk,
                  _suff_gnmatch_sk, _suff_gnmatch_uk]
_gnmatch_suff_ids = [_suff_gnmatch_m_id, _suff_gnmatch_z_id,
                     _suff_gnmatch_s_id, _suff_gnmatch_u_id,
                     _suff_gnmatch_mk_id, _suff_gnmatch_zk_id,
                     _suff_gnmatch_sk_id, _suff_gnmatch_uk_id]
_gnmatch_suff_ids_set = set(_gnmatch_suff_ids)
# Per gender-number suffix id: accepted gender letters (Cyrillic, Latin)
# and accepted number letters ("j" singular, "k" plural).
_gnmatch_suffs_genums = [
    (_suff_gnmatch_m_id, ("м", "m"), ("ј", "j")),
    (_suff_gnmatch_z_id, ("ж", "ž"), ("ј", "j")),
    (_suff_gnmatch_s_id, ("с", "s"), ("ј", "j")),
    (_suff_gnmatch_u_id, ("у", "u"), ("ј", "j")),
    (_suff_gnmatch_mk_id, ("м", "m"), ("к", "k")),
    (_suff_gnmatch_zk_id, ("ж", "ž"), ("к", "k")),
    (_suff_gnmatch_sk_id, ("с", "s"), ("к", "k")),
    (_suff_gnmatch_uk_id, ("у", "u"), ("к", "k")),
]
_suff_systr = "_s" # for "sistemska transkripcija" (systematic transcription)
_suff_systr_id = 40
_systr_ksuff_esuff = (_suff_systr, "сист")
_suff_altdv1 = "_a" # for "alternativno izvodjenje" (alternative derivation)
_suff_altdv1_id = 50
_suff_altdv2 = "_a2" # second alternative
_suff_altdv2_id = 51
_suff_altdv3 = "_a3" # third alternative
_suff_altdv3_id = 52
_altdv_ksuffs_esuffs = [
    (_suff_altdv1, "алт"),
    (_suff_altdv2, "алт2"),
    (_suff_altdv3, "алт3"),
]
_aenv_suff_ids = [_suff_systr_id, # order of elements significant
                  _suff_altdv1_id, _suff_altdv2_id, _suff_altdv3_id]
_aenv_suff_ids_set = set(_aenv_suff_ids)
_suff_pname_f = "_im" # for "ime" (first name)
_suff_pname_f_id = 60
_suff_pname_l = "_pr" # for "prezime" (last name)
_suff_pname_l_id = 61
_pname_suffs = [_suff_pname_f, _suff_pname_l]
_pname_suff_ids = [_suff_pname_f_id, _suff_pname_l_id]
_pname_suff_ids_set = set(_pname_suff_ids)


def trapnakron (envec="", envel="л", envic="иј", envil="ијл",
                markup="plain", tagmap=None,
                ptsuff=None, ltsuff=None, gnsuff=None,
                stsuff=None, adsuff=None, nmsuff=None,
                npkeyto=None, nobrhyp=False, disamb="",
                runtime=False):
    """
    Main trapnakron constructor, covering all options.

    The trapnakron constructor sets, either by default or optionally,
    various transformations to enhance queries to the resulting derivator.

    Default Behavior
    ================

    Property values are returned as alternatives/hybridized compositions of
    Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, and Ijekavian Latin
    forms, as applicable.
    Any of these forms can be excluded from derivation by setting
    its C{env*} parameter to C{None}.
    C{env*} parameters can also be used to change the priority environment
    from which the particular form is derived.

    Derivation and property key separator in compound keys is
    the ASCII hyphen (C{-}).

    Derivation keys are derived from syntagmas by applying
    the L{identify()<normalize.identify>} function.
    In derivations where this will result in strange keys,
    additional keys should be defined through hidden syntagmas.
    Property keys are transliterated into
    L{stripped-ASCII<lang.sr.wconv.cltoa>}.

    Conflict resolution for derivation keys is not strict
    (see L{derivator constructor<synder.Synder.__init__>}).

    Optional behavior
    =================

    Instead of plain text, properties may be reported with some markup.
    The markup type is given by C{markup} parameter, and can be one of
    C{"plain"}, C{"xml"}, C{"docbook4"}.
    The C{tagmap} parameter contains mapping of derivation keys
    to tags which should wrap properties of these derivations.

    Derivation keys can have several suffixes which effect how
    the properties are reported:
      - Presence of the suffix given by C{ptsuff} parameter signals that
        properties should be forced to plain text, if another markup is
        globally in effect.
      - Parameter C{ltsuff} states the suffix which produces lighter version
        of the markup, where applicable (e.g. people names in Docbook).
      - When fetching a property within a sentence (with keys given e.g.
        as XML entities), sentence construction may require that
        the resolved value is of certain gender and number; parameter C{gnsuff}
        can be used to provide a tuple of 4 suffixes for gender in singular
        and 4 suffixes for gender in plural,
        such that the property will resolve only if the value of
        gender and number matches the gender and number suffix.
      - Parameters C{stsuff} and C{adsuff} provide suffixes through
        which systematic transcription and alternative derivations
        are requested.
        They are actually tuples, where the first element is the key suffix,
        and the second element the suffix to primary environment
        which produces the systematic/alternative environment.
        C{adsuff} can also be a tuple of tuples, if several alternative
        derivations should be reachable.
      - In case the entry is a person's name with tagged first and last name,
        parameter C{nmsuff} can provide a tuple of 2 suffixes by which
        only the first or last name are requested, respectively.

    Ordinary hyphens may be converted into non-breaking hyphens
    by setting the C{nobrhyp} parameter to C{True}.
    Non-breaking hyphens are added heuristically, see
    the L{to_nobr_hyphens()<lang.sr.nobr.to_nobr_hyphens>} hook.
    Useful e.g. to avoid wrapping on hyphen-separated case endings.

    A property key normally cannot be empty, but C{npkeyto} parameter
    can be used to automatically substitute another property key
    when empty property key is seen in request for properties.
    In the simpler version, value of C{npkeyto} is just a string
    of the key to substitute for empty.
    In the more complex version, the value is a tuple containing
    the key to substitute and the list of two or more supplemental
    property keys: empty key is replaced only if all supplemental
    property values exist and are equal (see e.g. L{trapnakron_plain}
    for usage of this).

    Some property values may have been manually decorated with
    disambiguation markers (C{¤}), to differentiate them from
    property values of another derivation which would otherwise appear
    equal under a certain normalization.
    By default such markers are removed, but instead they
    can be substituted with a string given by C{disamb} parameter.

    Some derivations are defined only for purposes of obtaining
    their properties in scripted translations at runtime.
    They are by default not included, but can be by setting
    the C{runtime} parameter to C{True}.

    @param envec: primary environment for Ekavian Cyrillic derivation
    @type envec: string or C{None}
    @param envel: primary environment for Ekavian Latin derivation
    @type envel: string or C{None}
    @param envic: primary environment for Ijekavian Cyrillic derivation
    @type envic: string or C{None}
    @param envil: primary environment for Ijekavian Latin derivation
    @type envil: string or C{None}
    @param markup: target markup
    @type markup: string
    @param tagmap: tags to assign to properties by derivation keys
    @type tagmap: dict string -> string
    @param ptsuff: derivation key suffix to report plain text properties
    @type ptsuff: string
    @param ltsuff: derivation key suffix to report properties in lighter markup
    @type ltsuff: string
    @param gnsuff: suffixes by gender and number, to have no resolution
        if gender or number do not match
    @type gnsuff: [(string, string)*]
    @param stsuff: derivation key and environment name suffixes
        to report systematic transcriptions
    @type stsuff: (string, string)
    @param adsuff: derivation key and environment name suffixes
        to report alternative derivations
    @type adsuff: (string, string) or ((string, string)*)
    @param nmsuff: suffixes for fetching only first or last name of a person
    @type nmsuff: (string, string)
    @param npkeyto: property key to substitute for empty key, when given
    @type npkeyto: string or (string, [string*])
    @param nobrhyp: whether to convert some ordinary into non-breaking hyphens
    @type nobrhyp: bool
    @param disamb: string to replace each disambiguation marker with
    @type disamb: string
    @param runtime: whether to include runtime-only derivations
    @type runtime: bool

    @returns: trapnakron derivator
    @rtype: L{Synder<synder.Synder>}
    """

    # Validate the combination of requested environments.
    env0s = [envec, envel, envic, envil]
    combo = "".join(["1" if x is not None else "0" for x in env0s])
    if combo not in _good_eicl_combos:
        raise PologyError(
            _("@info",
              "Invalid combination of Ekavian/Ijekavian Cyrillic/Latin "
              "environments to trapnakron derivator."))

    if markup not in _known_markups:
        raise PologyError(
            _("@info",
              "Unknown markup type '%(mtype)s' to trapnakron derivator "
              "(known markups: %(mtypelist)s).",
              mtype=markup, mtypelist=format_item_list(_known_markups)))

    # Compose environment fallback chains.
    env = []
    envprops = [] # [(islatin, isije)*]
    vd = lambda e, d: e if e is not None else d
    if envec is not None:
        env.append((envec,))
        envprops.append((False, False))
    if envel is not None:
        env.append((envel, vd(envec, "")))
        envprops.append((True, False))
    if envic is not None:
        env.append((envic, vd(envec, "")))
        envprops.append((False, True))
    if envil is not None:
        env.append((envil, vd(envel, "л"), vd(envic, "иј"), vd(envec, "")))
        envprops.append((True, True))

    # Set up requests by derivation key suffix.
    mvends = {}
    if ptsuff:
        mvends[ptsuff] = _suff_pltext_id
    if ltsuff:
        mvends[ltsuff] = _suff_ltmarkup_id
    if gnsuff:
        if len(gnsuff) != 8:
            raise PologyError(
                _("@info",
                  "Sequence of gender-number suffixes must have "
                  "exactly 8 elements."))
        mvends.update(list(zip(gnsuff, _gnmatch_suff_ids)))
    aenvs = {}
    if adsuff or stsuff:
        kesuffs = [] # must have same order as _aenv_suff_ids
        if stsuff is not None:
            kesuffs.append(stsuff)
        # Guard against adsuff being None when only stsuff was given;
        # previously adsuff[0] was dereferenced unconditionally here.
        if adsuff is not None:
            if not isinstance(adsuff[0], tuple):
                kesuffs.append(adsuff)
            else:
                kesuffs.extend(adsuff)
        # NOTE(review): when stsuff is None, the first alternative suffix
        # gets paired with the systematic-transcription id below; all
        # in-module callers pass both stsuff and adsuff, so this does not
        # currently bite -- confirm before relying on adsuff-only setups.
        for (ksuff, esuff), suff_id in zip(kesuffs, _aenv_suff_ids):
            mvends[ksuff] = suff_id
            # Compose environment fallback chain for this suffix:
            # suffixed environments first, then the plain ones.
            aenv = []
            for env1 in env:
                aenv1 = []
                for esuff1 in (esuff, ""):
                    for env0 in env1:
                        aenv1.append(env0 + esuff1)
                aenv.append(tuple(aenv1))
            aenvs[suff_id] = tuple(aenv)
    if nmsuff:
        if len(nmsuff) != 2:
            raise PologyError(
                _("@info",
                  "Sequence of person name suffixes must have "
                  "exactly 2 elements."))
        mvends.update(list(zip(nmsuff, _pname_suff_ids)))

    # Set up substitution of empty property keys.
    expkeys = []
    if isinstance(npkeyto, tuple):
        npkeyto, expkeys = npkeyto

    # Create transformators.
    dkeytf = _sd_dkey_transf(mvends, tagmap)
    pkeytf = _sd_pkey_transf(npkeyto, expkeys)
    pvaltf = _sd_pval_transf(envprops, markup, nobrhyp, disamb)
    ksyntf = _sd_ksyn_transf(markup, False, disamb)
    envtf = _sd_env_transf(aenvs)

    # Build the derivator.
    sd = Synder(env=env,
                ckeysep="-",
                dkeytf=dkeytf, dkeyitf=identify,
                pkeytf=pkeytf, pkeyitf=norm_pkey,
                pvaltf=pvaltf, ksyntf=ksyntf,
                envtf=envtf,
                strictkey=False)

    # Collect synder files composing the trapnakron.
    sdfiles = _get_trapnakron_files(runtime)

    # Import into derivator.
    for sdfile in sdfiles:
        sd.import_file(sdfile)

    return sd
def rootdir ():
    """
    Get root directory to trapnakron derivation files.

    @returns: root directory path
    @rtype: string
    """

    parts = (pology.datadir(), "lang", "sr", "trapnakron")
    return os.path.join(*parts)


# Collect all derivation (.sd) files composing the trapnakron,
# optionally including runtime-only derivations from the subdirectory.
def _get_trapnakron_files (runtime=False):

    topdir = rootdir()
    sdfiles = collect_files_by_ext(topdir, ["sd"], recurse=False)
    if runtime:
        rtdir = os.path.join(topdir, "runtime")
        sdfiles.extend(collect_files_by_ext(rtdir, ["sd"], recurse=False))

    return sdfiles


def trapnakron_plain (envec="", envel="л", envic="иј", envil="ијл"):
    """
    Construct trapnakron suitable for application to plain text.

    L{trapnakron} is called with the following setup:

      - Markup is plain text (C{plain}).

      - Suffixes: C{_rm} ("rod muski") to resolve the property value only
        if it is of masculine gender, C{_rz} for feminine, C{_rs} for neuter;
        C{_s} for systematic transcription, C{_a}, C{_a2} and C{_a3} for
        other alternatives; C{_im} and C{_pr} for person's first and
        last name, respectively.

      - Non-breaking hyphens heuristically replace ordinary hyphens.

      - Empty property key is converted into C{am} (accusative masculine
        descriptive adjective), provided that it is equal to C{gm}
        (genitive masculine descriptive adjective);
        i.e. if the descriptive adjective is invariable.
    """

    setup = dict(
        markup="plain",
        gnsuff=_gnmatch_suffs,
        stsuff=_systr_ksuff_esuff,
        adsuff=_altdv_ksuffs_esuffs,
        nmsuff=_pname_suffs,
        npkeyto=("am", ("am", "gm")),
        nobrhyp=True,
    )
    return trapnakron(envec, envel, envic, envil, **setup)
def trapnakron_ui (envec="", envel="л", envic="иј", envil="ијл"):
    """
    Construct trapnakron suitable for application to UI texts.

    Same as L{trapnakron_plain}, except that disambiguation markers
    are substituted with an invisible character instead of being removed,
    and that runtime-only derivations are included as well.

    Keeping the disambiguation markers matters when a normalized form
    (typically nominative) serves at runtime as the key for fetching
    other properties of the derivation: without the markers, such
    normalization could fold two different derivations onto one key.
    """

    setup = dict(
        markup="plain",
        gnsuff=_gnmatch_suffs,
        stsuff=_systr_ksuff_esuff,
        adsuff=_altdv_ksuffs_esuffs,
        nmsuff=_pname_suffs,
        npkeyto=("am", ("am", "gm")),
        nobrhyp=True,
        disamb="\u2060",  # word joiner: invisible, but keeps forms distinct
        runtime=True,
    )
    return trapnakron(envec, envel, envic, envil, **setup)
def trapnakron_docbook4 (envec="", envel="л", envic="иј", envil="ијл",
                         tagmap=None):
    """
    Construct trapnakron suitable for application to Docbook 4 texts.

    L{trapnakron} is called with the following setup:

      - Markup is Docbook 4 (C{docbook4}).

      - Suffixes: C{_ot} ("obican tekst") for plain-text properties,
        C{_lv} ("laksa varijanta") for lighter variant of the markup.
        Lighter markup currently applies to: people names
        (no outer C{<personname>}, e.g. when it should be elided due to
        particular text segmentation on Docbook->PO extraction).
        The suffixes of L{trapnakron_plain} are available as well.

      - Non-breaking hyphens and empty property keys
        are treated like in L{trapnakron_plain}.
    """

    setup = dict(
        markup="docbook4",
        tagmap=tagmap,
        ptsuff=_suff_pltext,
        ltsuff=_suff_ltmarkup,
        gnsuff=_gnmatch_suffs,
        stsuff=_systr_ksuff_esuff,
        adsuff=_altdv_ksuffs_esuffs,
        nmsuff=_pname_suffs,
        npkeyto=("am", ("am", "gm")),
        nobrhyp=True,
        runtime=True,  # needed for resolution of UI references
    )
    return trapnakron(envec, envel, envic, envil, **setup)
# Transformation for derivation keys:
# - lowercase first letter if upper-case, and indicate value uppercasing
# - strip special suffixes and indicate value modifications based on them
def _sd_dkey_transf (suffspec, tagmap):

    def transf (dkey, sd):

        # Whether to uppercase the first letter of properties.
        fcap = dkey[0:1].isupper()
        if fcap:
            dkey = dkey[0].lower() + dkey[1:]

        # Collect and strip all known special suffixes.
        # Repeat full passes until one strips nothing more, so that
        # several stacked suffixes are all removed regardless of the
        # order in which they were appended to the key.
        # NOTE(review): if one configured suffix ends in another, the
        # outcome depends on dict iteration order -- confirm that the
        # configured suffixes are mutually suffix-free.
        found_suff_ids = set()
        while True:
            plen_suff_ids = len(found_suff_ids)
            for suff, suff_id in list(suffspec.items()):
                if dkey.endswith(suff):
                    dkey = dkey[:-len(suff)]
                    found_suff_ids.add(suff_id)
            if len(found_suff_ids) == plen_suff_ids:
                break

        # Tag which wraps the property values of this derivation.
        tag = tagmap.get(dkey) if tagmap else None

        # Whether to use plain text instead of markup, where applicable.
        pltext = _suff_pltext_id in found_suff_ids

        # Whether to use lighter variant of the markup, where applicable.
        ltmarkup = _suff_ltmarkup_id in found_suff_ids

        # Whether the gender and number is matching.
        # Gender ("_rod") and number ("_broj", defaulting to singular "j")
        # are read from the derivation itself; the key is cancelled
        # (set to None) unless both are single-valued and consistent with
        # every requested gender-number suffix.
        if _gnmatch_suff_ids_set.intersection(found_suff_ids):
            gstr = sd.get2(dkey, "_rod")
            nstr = sd.get2(dkey, "_broj", "j")
            genders = list(set(map(ctol, hctocl(gstr)))) if gstr else []
            numbers = list(set(map(ctol, hctocl(nstr)))) if nstr else []
            if (   not (len(genders) == 1) or not (len(numbers) == 1)
                or not all([(    x[0] not in found_suff_ids
                             or (genders[0] in x[1] and numbers[0] in x[2]))
                            for x in _gnmatch_suffs_genums])
            ):
                dkey = None

        # Whether to use one of alternative environments.
        # If several alternative suffixes were present, an arbitrary one
        # wins (set iteration order).
        esuffid = None
        found_aenv_suff_ids = _aenv_suff_ids_set.intersection(found_suff_ids)
        if found_aenv_suff_ids:
            esuffid = tuple(found_aenv_suff_ids)[0]

        # Whether to select only first or last name (persons).
        nsuffid = None
        found_pname_suff_ids = _pname_suff_ids_set.intersection(found_suff_ids)
        if found_pname_suff_ids:
            nsuffid = tuple(found_pname_suff_ids)[0]

        return dkey, fcap, tag, ltmarkup, pltext, esuffid, nsuffid

    # "self" requests the derivator itself as trailing argument to transf.
    return transf, "self"
0621 alleq = True 0622 ref_pval = None 0623 for tpkey in npkey_eqpkeys: 0624 pval = sd.get2(dkey, tpkey) 0625 if pval is None: 0626 alleq = False 0627 break 0628 if ref_pval is None: 0629 ref_pval = pval 0630 elif ref_pval != pval: 0631 alleq = False 0632 break 0633 if alleq: 0634 return npkeyto 0635 else: 0636 return pkey 0637 0638 return transf, "dkey", "self" 0639 0640 0641 # Transformation for property values: 0642 # - capitalize on request from key processing 0643 # - add tags on request from key processing 0644 # - optionally replace ordinary with no-break hyphens 0645 # - resolve known taggings according to selected markup 0646 # - add outer tags according to selected markup 0647 # - replace disambiguation markers with invisible characters 0648 # - construct hybridized forms out of multiple values 0649 # If the property key starts with underscore, only hybridization is performed. 0650 def _sd_pval_transf (envprops, markup, nobrhyp, disamb): 0651 0652 def transf (mtsegs, pkey, dkrest, sd): 0653 0654 fcap, tag, ltmarkup, pltext, d5, nsuffid = dkrest 0655 if pkey.startswith("_"): 0656 fcap = False 0657 tag = None 0658 pltext = True 0659 0660 pvals = [] 0661 for tsegs, (islatin, isije) in zip(mtsegs, envprops): 0662 if tsegs is None: 0663 return None 0664 pval1 = _compose_text(tsegs, markup, nobrhyp, disamb, 0665 fcap, tag, ltmarkup, pltext, nsuffid, 0666 pkey, islatin) 0667 if pval1 is None: 0668 return None 0669 pvals.append(pval1) 0670 0671 pval = _hybridize(envprops, pvals) 0672 0673 return pval 0674 0675 return transf, "pkey", "dkrest", "self" 0676 0677 0678 # Transformation for derivation syntagmas. 0679 # Like for property value transformation, 0680 # except for alternatives/hybridization. 
0681 def _sd_ksyn_transf (markup, nobrhyp, disamb): 0682 0683 def transf (tsegs, dkrest, sd): 0684 0685 fcap, tag, ltmarkup, pltext, d5, nsuffid = dkrest 0686 0687 ksyn = _compose_text(tsegs, markup, nobrhyp, disamb, 0688 fcap, tag, ltmarkup, pltext, nsuffid) 0689 0690 return ksyn 0691 0692 return transf, "dkrest", "self" 0693 0694 0695 # Transformation for derivation environments. 0696 # Returns a non-default environment on request from keys processing. 0697 def _sd_env_transf (aenvs): 0698 0699 def transf (env, dkrest): 0700 0701 d1, d2, d3, d4, esuffid, d6 = dkrest 0702 0703 if esuffid is not None: 0704 return aenvs[esuffid] 0705 else: 0706 return env 0707 0708 return transf, "dkrest" 0709 0710 0711 def _compose_text (tsegs, markup, nobrhyp, disamb, 0712 fcap, tag, ltmarkup, pltext, nsuffid, 0713 pkey=None, tolatin=False): 0714 0715 # Tagging and escaping. 0716 tagsubs="%(v)s" 0717 vescape = None 0718 if markup in ("xml", "docbook4"): 0719 tagsubs = "<%(t)s>%(v)s</%(t)s>" 0720 vescape = xentitize 0721 0722 # All unique tags to current segments. 0723 atags = set(sum([x[1] for x in tsegs], [])) 0724 0725 if atags.intersection(_pn_all_tags): 0726 # A person name. 0727 markup_mod = markup if not pltext else "plain" 0728 text = _compose_person_name(tsegs, fcap, markup_mod, ltmarkup, nsuffid, 0729 pkey) 0730 else: 0731 # Ordinary derivations. 
# Compose the final text for one environment out of tagged text segments:
# escape/wrap per selected markup, apply capitalization and outer tag,
# substitute disambiguation markers, optionally insert no-break hyphens
# and transliterate to Latin.
def _compose_text (tsegs, markup, nobrhyp, disamb,
                   fcap, tag, ltmarkup, pltext, nsuffid,
                   pkey=None, tolatin=False):

    # Tagging and escaping.
    tagsubs="%(v)s"
    vescape = None
    if markup in ("xml", "docbook4"):
        tagsubs = "<%(t)s>%(v)s</%(t)s>"
        vescape = xentitize

    # All unique tags to current segments.
    atags = set(sum([x[1] for x in tsegs], []))

    if atags.intersection(_pn_all_tags):
        # A person name.
        markup_mod = markup if not pltext else "plain"
        text = _compose_person_name(tsegs, fcap, markup_mod, ltmarkup, nsuffid,
                                    pkey)
    else:
        # Ordinary derivations.
        text = simplify("".join([x[0] for x in tsegs]))
        # Unimportant-key derivations: drop the leading word (the key part).
        if _nokey_tag in atags and " " in text: # before anything else
            text = text[text.find(" "):].lstrip()
        if fcap: # before adding outer tags
            text = first_to_upper(text)
        if vescape: # before adding outer tags
            text = vescape(text)
        if tag and not pltext:
            text = tagsubs % dict(t=tag, v=text)

    # Person-name composition may cancel the derivation.
    if text is None:
        return None

    text = text.replace(_disamb_marker, disamb or "")
    if nobrhyp: # before conversion to Latin
        text = to_nobr_hyphens(unsafe=True)(text)
    if tolatin:
        text = ctol(text)

    return text


# Combine Ekavian/Ijekavian Cyrillic/Latin forms
# into hybrid Ijekavian Cyrillic text.
# envprops is the list of (islatin, isije) flags parallel to pvals;
# with all four environments the order is EC, EL, IC, IL
# (as composed in trapnakron()).
def _hybridize (envprops, pvals):

    if len(envprops) == 4: # different scripts and dialects
        # Fold dialects within each script, then scripts if they differ.
        cvalc = tohi(pvals[0], pvals[2], delims=_alt_sep_dlc)
        cvall = tohi(pvals[1], pvals[3], delims=_alt_sep_dlc)
        if ctol(cvalc) != cvall:
            cval = cltoh(cvalc, cvall, delims=_alt_sep_scr, full=True)
        else:
            cval = cvalc
    elif len(envprops) == 2:
        if envprops[0][0] == envprops[1][0]: # different dialects
            cval = tohi(pvals[0], pvals[1], delims=_alt_sep_dlc)
        else: # different scripts
            cval = cltoh(pvals[0], pvals[1], delims=_alt_sep_scr, full=True)
    else:
        # Single environment: nothing to hybridize.
        cval = pvals[0]

    return cval
# Convert tagged person name into destination markup.
def _compose_person_name (tsegs, fcap, markup, light, nsuffid, pkey):

    # Reduce the name to one of its elements if requested.
    # If the reduction results in empty string, revert to full name.
    upperlast = False
    if nsuffid is not None:
        # Keep only the segments tagged as the requested name element.
        ntsegs = []
        for seg, tags in tsegs:
            tag = tags[0] if len(tags) > 0 else None
            if (   (tag in _pn_tag_first and nsuffid == _suff_pname_f_id)
                or (tag in _pn_tag_last and nsuffid == _suff_pname_l_id)
            ):
                ntsegs.append((seg, tags))
        if "".join([seg for seg, tags in ntsegs]).strip():
            tsegs = ntsegs
            # Take care to uppercase title to last name ("von", "al", etc.)
            # if last name alone is selected.
            upperlast = nsuffid == _suff_pname_l_id
    # Otherwise, if the requested property is of special type,
    # cancel the derivation if full name contains several name elements.
    # FIXME: Actually do this once decided how the client should supply
    # the test for special keys.
    elif False: #pkey and len(pkey) > 2:
        seentags = set()
        for seg, tags in tsegs:
            if not seg.strip():
                continue
            tag = tags[0] if len(tags) > 0 else None
            if tag in _pn_tag_first:
                seentags.add(_pn_tag_first[0])
            elif tag in _pn_tag_last:
                seentags.add(_pn_tag_last[0])
            elif tag in _pn_tag_middle:
                seentags.add(_pn_tag_middle[0])
            else:
                seentags.add(None)
        if len(seentags) > 1:
            return None

    if markup == "docbook4":
        # Wrap each name element in the corresponding Docbook tag;
        # untagged segments are passed through bare.
        name_segs = []
        for seg, tags in tsegs:
            seg = xentitize(seg).strip()
            if not seg:
                continue
            tag = tags[0] if len(tags) > 0 else None
            if tag in _pn_tag_first:
                name_segs.append(" <firstname>%s</firstname>" % seg)
            elif tag in _pn_tag_last:
                if upperlast:
                    seg = seg[0].upper() + seg[1:]
                    upperlast = False
                name_segs.append(" <surname>%s</surname>" % seg)
            elif tag in _pn_tag_middle:
                name_segs.append(" <othername>%s</othername>" % seg)
            else: # untagged
                name_segs.append(" %s" % seg)
        name = "".join(name_segs).strip()
        # Light markup: omit the outer <personname> wrapper.
        if not light:
            name = "<personname>%s</personname>" % name

    else:
        # Plain (or XML) markup: just concatenate and normalize whitespace.
        name = simplify("".join([seg for seg, tags in tsegs]))
        if upperlast:
            name = name[0].upper() + name[1:]

    return name
def norm_pkey (pkey):
    """
    Normalize internal property keys in trapnakron.

    Keys are transliterated into stripped ASCII via C{cltoa};
    tuples and lists are normalized element-wise, preserving the type.

    @param pkey: property key or keys to normalize
    @type pkey: string or (string*) or [string*]

    @returns: normalized keys
    @rtype: as input

    @raises PologyError: if C{pkey} is not a string, tuple or list
    """

    if isinstance(pkey, str):
        return cltoa(pkey)
    elif isinstance(pkey, tuple):
        return tuple(map(cltoa, pkey))
    elif isinstance(pkey, list):
        return list(map(cltoa, pkey))
    else:
        raise PologyError(
            _("@info",
              "Normalization of property keys requested "
              "on unsupported data type '%(type)s'.",
              type=type(pkey)))


# Raw string literal: "\s" in a plain string is an invalid escape sequence
# (DeprecationWarning since Python 3.6, to become a syntax error).
_norm_rtkey_rx = re.compile(r"\s", re.U)

def norm_rtkey (text):
    """
    Normalize text into runtime key for translation scripting.

    All whitespace is removed and the text is lowercased.

    @param text: text to normalize into runtime key
    @type text: string

    @returns: runtime key
    @rtype: string
    """

    return _norm_rtkey_rx.sub("", text).lower()