# -*- coding: UTF-8 -*-

"""
Text wrapping, with special handling for typical texts in PO files.

Wrapping turns out to be quite a non-trivial matter.
Gettext itself implements an intricate wrapping algorithm from the Unicode
consortium, with its own tweaks, which is hard to beat in any simpler way.
Thus, do not be surprised if the wrapping quality offered by this module does
not meet your exact needs.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import re
import unicodedata

from pology import PologyError, _, n_


# Regex for splitting C{<...>} into tag name and few other elements.
# Group 1 is the closing slash, group 2 the tag name, group 3 the in-place
# (self-closing) slash.
_tag_split_rx = re.compile(r"^\s*<\s*(/?)\s*(\w+)[^/>]*(/?)\s*>\s*$")

# Characters for "natural" breaks where to wrap the text.
_natbr_after = ".,;/-)]}"
_natbr_before = "%({["

# Strings at which the text should be wrapped before or after.
# A tuple entry in _postbr means: break after the first string,
# unless the text also ends with the second one (escaped backslash).
_prebr = ("|/|",)
_postbr = (("\\n", "\\\\n"), "|/|")
# |/| is the Transcript fence, should break both before and after.

# Tags for normal breaking (after the closed tag)
_tagbr_normal = (
    # HTML
    "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "li",
    "table", "th", "td", "tr", "center", "blockquote", "pre", "dd", "dl", "dt",
    # KUIT
    "title", "subtitle", "para", "list", "item",
    # Docbook
    "calloutlist", "glosslist", "itemizedlist", "orderedlist", "segmentedlist",
    "simplelist", "variablelist", "listitem", "seglistitem", "varlistentry",
)

# Tags usually closed in-place in strict XML, break before and after.
_tagbr_inplace = (
    # HTML
    "br", "hr",
    # KUIT
    "nl",
)


def _tag_split(tag):
    """
    Split tag statement into tag name and a state string.

    State is one of "open" (<foo>), "close" (</foo>), or "inplace" (<foo/>).
    If the given string does not look like a tag at all,
    both the name and the state are returned as empty strings.

    @param tag: the tag proper, C{<...>}
    @type tag: string
    @returns: tag name and state
    @rtype: string, string
    """

    m = _tag_split_rx.match(tag)
    if not m:
        return "", ""

    if m.group(1):
        state = "close"
    elif m.group(3):
        state = "inplace"
    else:
        state = "open"
    return m.group(2), state


def wrap_text(text, wcol=79, lead="", trail="", flead=None, femp=False,
              natbr="", natbr2="", prebr=(), postbr=(), tagbr=(), tagbr2=(),
              wcolmin=0, midbr=True, remtrws=False, endl="\n"):
    """
    Wrap text into lines.

    Wrapping behavior and positions can be controlled by several parameters.
    Trailing and leading strings can be added to each wrapped line, including
    a special lead for the first line.

    If wrapping column is given as less or equal to zero, the lines are split
    only at unconditional breaks.

    This is a very general wrapping function, see the more specialized ones in
    this module for practical use with PO message elements.

    @param text: the text to wrap
    @type text: string
    @param wcol: column to wrap after
    @type wcol: int
    @param lead: prefix for each line
    @type lead: string
    @param trail: suffix for each line
    @type trail: string
    @param flead:
        special prefix for the first line. Normal prefix is used if this is
        given as C{None}
    @type flead: C{None} or string
    @param femp:
        C{True} to leave the first line empty if the complete text would not
        fit into it, C{False} for normal use of the first line
    @type femp: bool
    @param natbr: characters other than space to naturally break at
    @type natbr: string
    @param natbr2: characters other than space to naturally break at,
        also taking the breaking character to the next line
    @type natbr2: string
    @param prebr: character sequences to unconditionally break before
    @type prebr: (string*)
    @param postbr:
        character sequences to unconditionally break after; an element may
        also be a pair of strings, in which case the break is made after
        the first string only if the text does not also end with the second
    @type postbr: (string or (string, string))*
    @param tagbr: tag names to break before opening and after closing
    @type tagbr: (string*)
    @param tagbr2: tag names to always break after (like <br>)
    @type tagbr2: (string*)
    @param wcolmin: minimal column to allow natural breaks after
    @type wcolmin: int
    @param midbr:
        C{True} to allow break in the middle of a word if no usual break
        found before C{wcol} has been exceeded
    @type midbr: bool
    @param remtrws:
        whether to strictly remove any trailing whitespace in wrapped lines
        (otherwise trailing whitespace may be left in under certain conditions)
    @type remtrws: bool
    @param endl: line end marker for each line
    @type endl: string
    @returns: wrapped lines
    @rtype: [string*]
    @raise PologyError:
        if C{wcol} is positive but too small to fit C{lead} and C{trail}
    """

    if flead is None:
        flead = lead

    rlentext = len(text)
    atoms = _atomize(text)[:-1]  # strip sentry
    # The sentry atom's visual position is the visual length of the string.
    vlenlead = _atomize(lead)[-1][2]
    vlentrail = _atomize(trail)[-1][2]
    vlenflead = _atomize(flead)[-1][2]

    if wcol > 0 and vlenlead + vlentrail + 1 >= wcol:
        raise PologyError(
            _("@info",
              "Wrapping is too tight, cannot fit leading and trailing text."))

    lines = []  # list of lines
    nlines = 0
    lenatoms = len(atoms)
    p = 0  # position into original text by atoms
    vtext = "".join(x[0] for x in atoms)
    vposs = tuple(x[2] for x in atoms)
    rvposs = tuple(x[6] for x in atoms)
    while p < lenatoms:
        # Determine effective wrapping column for this line.
        ewcol = wcol - 1 - vlentrail  # -1 for newline character
        if nlines == 0:
            clead = flead
            ewcol -= vlenflead
        else:
            clead = lead
            ewcol -= vlenlead

        # Find where to wrap.
        # NOTE: throughout the scan, ple is the visual width of the first
        # pl - 1 atoms of the current line (it lags pl by one atom),
        # because each pass adds the width of the *previous* atom.
        atbr = False  # immediate break found
        pl = 0  # position into current line
        ple = 0  # apparent position into current line
        pl_ok = 0  # last good position into current line (where wrap was fine)
        ple_ok = 0  # last good apparent position into current line
        pvseg, pvlen = "", 0
        while (
            p + pl < lenatoms
            and (ple <= ewcol or wcol <= 0 or (not midbr and pl_ok == 0))
            and not atbr
        ):
            if pl > 0:
                pvseg, pvlen = atoms[p + pl - 1][:2]
            cvseg = atoms[p + pl][0]
            if postbr or tagbr or tagbr2:  # condition for optimization
                # Visual text of the current line so far.
                backvtext = vtext[rvposs[p]:rvposs[p + pl]]
            if prebr or tagbr:  # condition for optimization
                # Visual text from the current atom onwards.
                forevtext = vtext[rvposs[p + pl]:]

            # Immediate breaks allowed only after
            # at least one visually non-empty atom.
            if vposs[p + pl] > vposs[p]:

                # Check for an immediate break by sequence.
                for br in postbr:
                    if not isinstance(br, tuple):
                        if backvtext.endswith(br):
                            atbr = True
                            break
                    else:
                        br1, br2 = br
                        if (backvtext.endswith(br1)
                                and not backvtext.endswith(br2)):
                            atbr = True
                            break
                if atbr:
                    break
                for br in prebr:
                    if forevtext.startswith(br):
                        atbr = True
                        break
                if atbr:
                    break

                # Check for an immediate break by tag.
                if tagbr or tagbr2:
                    if backvtext.endswith(">"):
                        pt = backvtext.rfind("<", 0, -1)
                        if pt >= 0:
                            tag, state = _tag_split(backvtext[pt:])
                            if (tag in tagbr2
                                    or (tag in tagbr
                                        and state in ("close", "inplace"))):
                                atbr = True
                                break
                if tagbr:
                    if forevtext.startswith("<"):
                        pt = forevtext.find(">", 1)
                        if pt >= 0:
                            tag, state = _tag_split(forevtext[:pt + 1])
                            if tag in tagbr and state == "open":
                                atbr = True
                                break

            # Check for valid natural break.
            if (pvseg in " "
                    or (cvseg != " "
                        and pvseg in natbr and cvseg not in natbr)
                    or cvseg in natbr2):
                pl_ok = pl
                ple_ok = ple

            ple += pvlen
            pl += 1

        # If not unconditional break, still enough text, and break possible.
        if not atbr and ple > ewcol and ewcol > 0:
            # Don't allow too short natural break.
            if ple_ok > wcolmin:
                pl = pl_ok
                ple = ple_ok
            # Backstep any segments still too much if mid-word break allowed.
            if midbr:
                while pl > 1 and ple > ewcol:
                    pl -= 1
                    # Undo the width that the scan loop added last.
                    # Atoms are indexed from the start of the whole text,
                    # so the line-relative pl must be offset by p
                    # (without the offset, widths of unrelated atoms near
                    # the text start were subtracted on all but first line).
                    ple -= atoms[p + pl - 1][1]

        # Never break after non-final backslash.
        if p + pl < lenatoms:
            while pl > 1 and atoms[p + pl - 1][0] == "\\":
                pl -= 1
                # Bookkeeping only, ple is not consulted past this point.
                ple -= atoms[p + pl - 1][1]

        if (nlines == 0
                and ((femp and p + pl < lenatoms)
                     or (ewcol <= 0 and wcol > 0))):
            # leaving first line empty
            lines.append(clead + trail)
            pl = 0
        else:
            # Slice the raw text by raw positions of the bounding atoms.
            p1 = atoms[p][4]
            p2 = atoms[p + pl][4] if p + pl < lenatoms else rlentext
            lines.append(clead + text[p1:p2] + trail)

        nlines += 1
        p += pl

    if lenatoms == 0:  # in case no text given, main loop did not run
        lines.append(flead + trail)

    # Postprocess.
    # Strip trailing whitespace if no trailing string or removal is forced.
    strip_trailing = (not trail) or remtrws
    for i, line in enumerate(lines):
        if strip_trailing:
            # Do not remove trailing whitespace which is part of leading
            # string, unless removal is forced.
            if remtrws:
                clead = ""
            elif i == 0:
                clead = flead
            else:
                clead = lead
            line = clead + line[len(clead):].rstrip()
        if endl:
            line += endl
        lines[i] = line

    return lines


def _atomize(text):
    """
    Split text into atomic segments and compute their visual and raw widths.

    Returns list of tuples
    (visual segment, visual length, visual position, raw length, raw position,
    raw visual length, raw visual position).
    The list always ends with zero-visual length segment,
    so that it is not empty even if the text is empty,
    and that last atom's positions are visual and raw lengths of the string.

    If the text object provides a C{visual_segment} method, it is called with
    the current raw position and expected to return the visual segment and
    the raw length it covers; a returned raw length of zero (or no such
    method) means the single character at that position is its own atom.
    Visual length counts East Asian wide and fullwidth characters as two
    columns, whereas raw visual length is the plain length of the segment.
    """

    atoms = []
    isuc = isinstance(text, str)
    vsegf = getattr(text, "visual_segment", None)
    rpos = 0
    vpos = 0
    rvpos = 0
    rlentext = len(text)
    while rpos < rlentext:
        rlen = 0
        if vsegf:
            vseg, rlen = vsegf(rpos)
        if rlen == 0:
            vseg, rlen = text[rpos], 1
        vlen = len(vseg)
        rvlen = vlen
        if isuc and vlen:
            for c in vseg:
                if unicodedata.east_asian_width(c) in ("W", "F"):
                    vlen += 1  # 1 = 2 minus (1 already counted)
        atoms.append((vseg, vlen, vpos, rlen, rpos, rvlen, rvpos))
        vpos += vlen
        rpos += rlen
        rvpos += rvlen
    # Sentry atom, of the same string type as the text.
    atoms.append((type(text)(""), 0, vpos, 0, rpos, 0, rvpos))

    return atoms


def wrap_field(field, text, preseq=""):
    """
    Wrap fields in PO messages.

    This function can be sent as parameter to L{Message} and L{Catalog}
    methods and constructors.

    @param field: the field keyword (C{"msgctxt"}, C{"msgid"}, ...)
    @type field: string

    @param text: the text of the field
    @type text: string

    @param preseq:
        the prefix to field keyword, usually for previous-value (C{"#|"})
        and obsolete (C{"#~"}) fields
    @type preseq: string

    @returns: wrapped field lines (each ends with a newline)
    @rtype: list of strings
    """

    return wrap_text(text, 79,
                     flead=preseq + field + " \"",
                     lead=preseq + "\"",
                     trail="\"",
                     natbr=_natbr_after,
                     natbr2=_natbr_before,
                     prebr=_prebr,
                     postbr=_postbr,
                     femp=True,
                     wcolmin=39)


def wrap_field_unwrap(field, text, preseq=""):
    """
    Wrap fields in PO messages at unconditional breaks (no column-wrapping).

    This function can be sent as parameter to L{Message} and L{Catalog}
    methods and constructors.

    The parameters and return values are as for L{wrap_field}.

    @see: L{wrap_field}
    """

    return wrap_text(text, 0,
                     flead=preseq + field + " \"",
                     lead=preseq + "\"",
                     trail="\"",
                     prebr=_prebr,
                     postbr=_postbr,
                     femp=True)


def wrap_comment(ctype, text):
    """
    Wrap comments in PO messages.

    Each line is prefixed with C{"#"}, the comment type and a space.

    @param ctype:
        the comment type marker which follows the initial C{#}
        (e.g. C{":"} for C{"#:"} lines, C{"."} for C{"#."} lines,
        empty string for plain C{"# "} comments)
    @type ctype: string

    @param text: the text of the comment
    @type text: string

    @returns: wrapped comment lines (each ends with a newline)
    @rtype: list of strings
    """

    # midbr is False in order to prevent e.g. very long source references
    # being forced split in the middle.
    # remtrws is True in order to remove the trailing space in empty comments.
    return wrap_text(text, 79,
                     lead="#" + ctype + " ",
                     femp=False,
                     midbr=False,
                     remtrws=True)


def wrap_comment_unwrap(ctype, text):
    """
    Wrap comments in PO messages at unconditional breaks (no column-wrapping).

    The parameters and return values are as for L{wrap_comment}.

    @see: L{wrap_comment}
    """

    return wrap_text(text, 0,
                     lead="#" + ctype + " ",
                     femp=False,
                     remtrws=True)


def wrap_field_fine(field, text, preseq=""):
    """
    Wrap fields in PO messages, including breaks at selected markup elements.

    This function can be sent as parameter to L{Message} and L{Catalog}
    methods and constructors.

    The parameters and return values are as for L{wrap_field}.

    @see: L{wrap_field}
    """

    return wrap_text(text, 79,
                     flead=preseq + field + " \"",
                     lead=preseq + "\"",
                     trail="\"",
                     natbr=_natbr_after,
                     natbr2=_natbr_before,
                     prebr=_prebr,
                     postbr=_postbr,
                     tagbr=_tagbr_normal,
                     tagbr2=_tagbr_inplace,
                     femp=True)


def wrap_field_fine_unwrap(field, text, preseq=""):
    """
    Wrap fields in PO messages, including breaks at selected markup elements,
    but only at unconditional breaks (no column-wrapping).

    This function can be sent as parameter to L{Message} and L{Catalog}
    methods and constructors.

    The parameters and return values are as for L{wrap_field}.

    @see: L{wrap_field}
    """

    return wrap_text(text, 0,
                     flead=preseq + field + " \"",
                     lead=preseq + "\"",
                     trail="\"",
                     prebr=_prebr,
                     postbr=_postbr,
                     tagbr=_tagbr_normal,
                     tagbr2=_tagbr_inplace,
                     femp=True)


def select_field_wrapper(wrapkw):
    """
    Select wrap function for PO message fields based on keywords.

    Wrap function is selected by specifying a sequence of keywords,
    from the following set:
      - C{"basic"}: wrapping on column count
      - C{"fine"}: wrapping on logical breaks (such as C{<p>} or C{<para>} tags)
    Wrapping on newline characters is always engaged.
    If C{wrapkw} is given as C{None}, C{"basic"} only is assumed.

    @param wrapkw: wrapping keywords
    @type wrapkw: sequence of strings or C{None}

    @returns: wrapping function
    @rtype: (string, string, string?)->[string]

    @see: L{wrap_field}
    """

    if wrapkw is None:
        wrapkw = ["basic"]

    if "basic" in wrapkw:
        if "fine" in wrapkw:
            wrapf = wrap_field_fine
        else:
            wrapf = wrap_field
    else:
        if "fine" in wrapkw:
            wrapf = wrap_field_fine_unwrap
        else:
            wrapf = wrap_field_unwrap

    return wrapf


def select_field_wrapping(cfgsec=None, cat=None, cmlopt=None):
    """
    Select wrapping keywords for PO message fields based on various inputs.

    There are three possible sources of wrapping information:
      - a user configuration section, possibly containing wrapping fields
      - the catalog to which the wrapping should be applied,
        possibly defining wrapping in its header
      - command line options for wrapping
    This function will examine these three sources with increasing priority,
    and return a tuple of applicable L{wrapping keywords<select_field_wrapper>}.
    Any of these sources can also be omitted;
    if all are omitted, C{("basic",)} is returned.

    @param cfgsec: a section of user configuration
    @type cfgsec: L{section<config.section>}
    @param cat: the catalog to be wrapped
    @type cat: L{Catalog<catalog.Catalog>}
    @param cmlopt:
        command line options; the C{do_wrap} and C{do_fine_wrap} attributes
        are read, each expected to be C{True}, C{False}, or anything else
        (e.g. C{None}) for no override
    @type cmlopt: object with option attributes (e.g. C{optparse.Values})

    @returns: wrapping keywords
    @rtype: (string*)

    @see: L{select_field_wrapper}
    """

    # Default wrapping.
    wrapping = ["basic"]

    # Helper to remove and add wrapping types.
    # Identity checks are deliberate: only exact True/False change anything,
    # any other value (such as None) leaves the current state as it is.
    def waddrem(add, wtype):
        if add is False and wtype in wrapping:
            wrapping.remove(wtype)
        elif add is True and wtype not in wrapping:
            wrapping.append(wtype)

    # Restrict wrapping in following priority of overrides.
    # - configuration
    if cfgsec is not None:
        waddrem(cfgsec.boolean("wrap", None), "basic")
        waddrem(cfgsec.boolean("fine-wrap", None), "fine")
    # - catalog
    wrapping_cat = cat.wrapping() if cat is not None else None
    if wrapping_cat is not None:
        waddrem("basic" in wrapping_cat, "basic")
        waddrem("fine" in wrapping_cat, "fine")
    # - command line
    if cmlopt is not None:
        waddrem(cmlopt.do_wrap, "basic")
        waddrem(cmlopt.do_fine_wrap, "fine")

    return tuple(sorted(wrapping))