pology/pology/wrap.py

0001 # -*- coding: UTF-8 -*-
0002
0003 """
0004 Text wrapping, with special handling for typical texts in PO files.
0005
0006 Wrapping turns out to be quite a non-trivial matter.
0007 Gettext itself implements an intricate wrapping algorithm from the Unicode
0008 consortium, with its own tweaks, which is hard to beat in any simpler way.
0009 Thus, do not be surprised if the wrapping quality offered by this module does
0010 not meet your exact needs.
0011
0012 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0013 @license: GPLv3
0014 """
0015
0016 import re
0017 import unicodedata
0018
0019 from pology import PologyError, _, n_
0020
0021
# Pattern taking a single tag statement C{<...>} apart: group 1 is the
# slash of a closing tag, group 2 is the tag name, and group 3 is the slash
# of a tag closed in-place (groups 1 and 3 are empty when there is no slash).
_tag_split_rx = re.compile(r"^\s*<\s*(/?)\s*(\w+)[^/>]*(/?)\s*>\s*$")

# Characters at which the text may be "naturally" wrapped:
# after any character of the first set, before any of the second set.
_natbr_after = ".,;/-)]}"
_natbr_before = "%({["

# Character sequences at which the text is wrapped unconditionally,
# before the sequence (_prebr) or after the sequence (_postbr).
# |/| is the Transcript fence, which should break both before and after,
# and is therefore listed in both collections.
# A pair in _postbr is treated by wrap_text() as: break after the first
# element, unless the text up to that point ends with the second element.
_prebr = (
    "|/|",
)
_postbr = (
    ("\\n", "\\\\n"),
    "|/|",
)

# Tags for normal breaking (after the closed tag).
_tagbr_normal = (
    # HTML
    "p",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "ul", "ol", "li",
    "table", "th", "td", "tr",
    "center", "blockquote", "pre",
    "dd", "dl", "dt",
    # KUIT
    "title", "subtitle", "para", "list", "item",
    # Docbook
    "calloutlist", "glosslist", "itemizedlist", "orderedlist",
    "segmentedlist", "simplelist", "variablelist",
    "listitem", "seglistitem", "varlistentry",
)

# Tags usually closed in-place in strict XML, break before and after.
_tagbr_inplace = (
    # HTML
    "br", "hr",
    # KUIT
    "nl",
)
0053
0054
def _tag_split (tag):
    """
    Take a tag statement apart into the tag name and its state.

    The state tells how the tag is written: C{"open"} for C{<foo>},
    C{"close"} for C{</foo>}, and C{"inplace"} for C{<foo/>}.
    If the given string is not recognized as a tag statement,
    both the name and the state are returned as empty strings.

    @param tag: the tag proper, C{<...>}
    @type tag: string
    @returns: tag name and state
    @rtype: string, string
    """

    found = _tag_split_rx.match(tag)
    if not found:
        return "", ""

    # The two slash groups are empty strings when the slash is absent;
    # a leading slash takes precedence over a trailing one.
    lead_slash, name, trail_slash = found.groups()
    if lead_slash:
        return name, "close"
    if trail_slash:
        return name, "inplace"
    return name, "open"
0078
0079
def wrap_text (text, wcol=79, lead="", trail="", flead=None, femp=False,
               natbr="", natbr2="", prebr=(), postbr=(), tagbr=(), tagbr2=(),
               wcolmin=0, midbr=True, remtrws=False, endl="\n"):
    """
    Wrap text into lines.

    Wrapping behavior and positions can be controlled by several parameters.
    Trailing and leading strings can be added to each wrapped line, including
    a special lead for the first line.

    If wrapping column is given as less or equal to zero, the lines are split
    only at unconditional breaks.

    This is a very general wrapping function, see the more specialized ones in
    this module for practical use with PO message elements.

    @param text: the text to wrap
    @type text: string
    @param wcol: column to wrap after
    @type wcol: int
    @param lead: prefix for each line
    @type lead: string
    @param trail: suffix for each line
    @type trail: string
    @param flead:
        special prefix for the first line. Normal prefix (C{lead}) is used
        if this is given as C{None}
    @type flead: C{None} or string
    @param femp:
        C{True} to leave the first line empty if the complete text would not
        fit into it, C{False} for normal use of the first line
    @type femp: bool
    @param natbr: characters other than space to naturally break after
    @type natbr: string
    @param natbr2: characters other than space to naturally break at,
        also taking the breaking character to the next line
    @type natbr2: string
    @param prebr: character sequences to unconditionally break before
    @type prebr: (string*)
    @param postbr: character sequences to unconditionally break after;
        an element may also be a pair of strings, in which case the break
        is made after the first string unless the text up to that point
        also ends with the second string
    @type postbr: (string or (string, string))*
    @param tagbr: tag names to break before opening and after closing
    @type tagbr: (string*)
    @param tagbr2: tag names to always break after (like <br>)
    @type tagbr2: (string*)
    @param wcolmin: minimal column to allow natural breaks after
    @type wcolmin: int
    @param midbr:
        C{True} to allow break in the middle of a word if no usual break
        found before C{wcol} has been exceeded
    @type midbr: bool
    @param remtrws:
        whether to strictly remove any trailing whitespace in wrapped lines
        (otherwise trailing whitespace may be left in under certain conditions)
    @type remtrws: bool
    @param endl: line end marker for each line
    @type endl: string
    @returns: wrapped lines
    @rtype: [string*]
    @raise PologyError:
        if column wrapping is requested, but the leading and trailing
        strings alone leave no room for the text
    """

    if flead is None:
        flead = lead

    rlentext = len(text)
    atoms = _atomize(text)[:-1] # strip sentry
    # Visual position of the sentry atom is the visual length of the string.
    vlenlead = _atomize(lead)[-1][2]
    vlentrail = _atomize(trail)[-1][2]
    vlenflead = _atomize(flead)[-1][2]

    if wcol > 0 and vlenlead + vlentrail + 1 >= wcol:
        raise PologyError(
            _("@info",
              "Wrapping is too tight, cannot fit leading and trailing text."))

    lines = [] # list of lines
    nlines = 0
    lenatoms = len(atoms)
    p = 0 # position into original text by atoms
    vtext = "".join(x[0] for x in atoms)
    vposs = tuple(x[2] for x in atoms)
    rvposs = tuple(x[6] for x in atoms)
    while p < lenatoms:
        # Determine effective wrapping column for this line.
        ewcol = wcol - 1 - vlentrail # -1 for newline character
        if nlines == 0:
            clead = flead
            ewcol -= vlenflead
        else:
            clead = lead
            ewcol -= vlenlead

        # Find where to wrap.
        atbr = False # immediate break found
        pl = 0 # position into current line
        ple = 0 # apparent position into current line
        pl_ok = 0 # last good position into current line (where wrap was fine)
        ple_ok = 0 # last good apparent position into current line
        pvseg, pvlen = "", 0
        while (    p + pl < lenatoms
               and (ple <= ewcol or wcol <= 0 or (not midbr and pl_ok == 0))
               and not atbr
        ):
            if pl > 0:
                pvseg, pvlen = atoms[p + pl - 1][:2]
            cvseg, cvlen = atoms[p + pl][:2]
            if postbr or tagbr or tagbr2: # condition for optimization
                backvtext = vtext[rvposs[p]:rvposs[p + pl]]
            if prebr or tagbr: # condition for optimization
                forevtext = vtext[rvposs[p + pl]:]

            # Immediate breaks allowed only after
            # at least one visually non-empty atom.
            if vposs[p + pl] > vposs[p]:

                # Check for an immediate break by sequence.
                for br in postbr:
                    if not isinstance(br, tuple):
                        if backvtext.endswith(br):
                            atbr = True; break
                    else:
                        br1, br2 = br
                        if (    backvtext.endswith(br1)
                            and not backvtext.endswith(br2)
                        ):
                            atbr = True; break
                if atbr: break
                for br in prebr:
                    if forevtext.startswith(br):
                        atbr = True; break
                if atbr: break

                # Check for an immediate break by tag.
                if tagbr or tagbr2:
                    if backvtext.endswith(">"):
                        pt = backvtext.rfind("<", 0, -1)
                        if pt >= 0:
                            tag, state = _tag_split(backvtext[pt:])
                            if (   (tag in tagbr2)
                                or (    tag in tagbr
                                    and state in ("close", "inplace"))
                            ):
                                atbr = True; break
                if tagbr:
                    if forevtext.startswith("<"):
                        pt = forevtext.find(">", 1)
                        if pt >= 0:
                            tag, state = _tag_split(forevtext[:pt+1])
                            if tag in tagbr and state == "open":
                                atbr = True; break

            # Check for valid natural break.
            if (   pvseg in " "
                or (cvseg != " " and pvseg in natbr and cvseg not in natbr)
                or cvseg in natbr2
            ):
                pl_ok = pl
                ple_ok = ple

            ple += pvlen
            pl += 1

        # If not unconditional break, still enough text, and break possible.
        if not atbr and ple > ewcol and ewcol > 0:
            # Don't allow too short natural break.
            if ple_ok > wcolmin:
                pl = pl_ok
                ple = ple_ok
            # Backstep any segments still too much if mid-word break allowed.
            if midbr:
                while pl > 1 and ple > ewcol:
                    pl -= 1
                    # pl counts atoms within the current line, while atoms
                    # holds the whole text, hence the offset by p
                    # (same indexing as in the backslash backstep below).
                    ple -= atoms[p + pl][1]

        # Never break after non-final backslash.
        if p + pl < lenatoms:
            while pl > 1 and atoms[p + pl - 1][0] == "\\":
                pl -= 1
                ple -= atoms[p + pl][1]

        if (    nlines == 0
            and ((femp and p + pl < lenatoms) or (ewcol <= 0 and wcol > 0))
        ):
            # leaving first line empty
            lines.append(clead + trail)
            pl = 0
        else:
            # Slice the original text by raw positions of boundary atoms.
            p1 = atoms[p][4]
            p2 = atoms[p + pl][4] if p + pl < lenatoms else rlentext
            lines.append(clead + text[p1:p2] + trail)

        nlines += 1
        p += pl

    if lenatoms == 0: # in case no text given, main loop did not run
        lines.append(flead + trail)

    # Postprocess the lines.
    # Trailing whitespace is stripped if there is no trailing string,
    # or if the removal is forced.
    strip_trailing = not trail or remtrws
    for i, line in enumerate(lines):
        if strip_trailing:
            # Do not remove trailing whitespace which is part of leading string,
            # unless removal is forced.
            clead = ""
            if not remtrws:
                clead = flead if i == 0 else lead
            line = clead + line[len(clead):].rstrip()
        if endl:
            line += endl
        lines[i] = line

    return lines
0292
0293
def _atomize (text):
    """
    Cut the text into atomic segments, recording their widths and positions.

    Each atom is a tuple
    (visual segment, visual length, visual position, raw length, raw position,
    raw visual length, raw visual position).
    If the text object provides a C{visual_segment} method, it is asked
    for the segment and its raw length at each raw position; whenever it
    is missing or reports zero raw length, a single element of the text
    is taken as the segment.
    For ordinary strings, every character of East Asian width class
    wide (W) or fullwidth (F) counts as two columns in the visual length,
    whereas the raw visual length is the plain length of the segment.

    The returned list always ends with a sentry atom of zero lengths,
    so that it is never empty, and so that the positions in the sentry atom
    are the total visual and raw lengths of the text.
    """

    segments = []
    plain_string = isinstance(text, str)
    segment_at = getattr(text, "visual_segment", None)
    raw_total = len(text)

    raw_at = vis_at = rawvis_at = 0
    while raw_at < raw_total:
        seg, raw_len = segment_at(raw_at) if segment_at else (None, 0)
        if raw_len == 0:
            # No segment reported here, fall back to a single element.
            seg, raw_len = text[raw_at], 1
        rawvis_len = len(seg)
        vis_len = rawvis_len
        if plain_string and rawvis_len:
            # One extra column per wide character (the first column
            # has already been counted by the length).
            vis_len += sum(1 for ch in seg
                           if unicodedata.east_asian_width(ch) in ("W", "F"))
        segments.append((seg, vis_len, vis_at,
                         raw_len, raw_at,
                         rawvis_len, rawvis_at))
        vis_at += vis_len
        raw_at += raw_len
        rawvis_at += rawvis_len

    # The sentry atom, an empty segment of the same type as the text.
    segments.append((type(text)(""), 0, vis_at, 0, raw_at, 0, rawvis_at))

    return segments
0332
0333
def wrap_field (field, text, preseq=""):
    """
    Wrap the text of a PO message field into quoted lines.

    The first line starts with the prefix, the field keyword, a space and
    the opening quote; every following line starts with the prefix and
    the opening quote. All lines end with the closing quote.
    Wrapping is done at column 79, using this module's sets of natural and
    unconditional breaks, with natural breaks allowed only after column 39.
    The first line is left empty if the text does not fit into it.

    This function can be sent as parameter to L{Message} and L{Catalog}
    methods and constructors.

    @param field: the field keyword (C{"msgctxt"}, C{"msgid"}, ...)
    @type field: string

    @param text: the text of the field
    @type text: string

    @param preseq:
        the prefix to field keyword, usually for previous-value (C{"#|"})
        and obsolete (C{"#~"}) fields
    @type preseq: string

    @returns: wrapped field lines (each ends with a newline)
    @rtype: list of strings
    """

    options = dict(
        flead=preseq + field + " \"",
        lead=preseq + "\"",
        trail="\"",
        natbr=_natbr_after,
        natbr2=_natbr_before,
        prebr=_prebr,
        postbr=_postbr,
        femp=True,
        wcolmin=39,
    )
    return wrap_text(text, 79, **options)
0366
0367
def wrap_field_unwrap (field, text, preseq=""):
    """
    Split the text of a PO message field into quoted lines
    at unconditional breaks only (no column-wrapping).

    The wrapping column is passed as zero, and no natural break
    characters are given; the lines are led and trailed as by L{wrap_field}.

    This function can be sent as parameter to L{Message} and L{Catalog}
    methods and constructors.

    The parameters and return values are as for L{wrap_field}.

    @see: L{wrap_field}
    """

    first_lead = preseq + field + " \""
    other_lead = preseq + "\""
    return wrap_text(text, 0, lead=other_lead, trail="\"", flead=first_lead,
                     femp=True, prebr=_prebr, postbr=_postbr)
0387
0388
def wrap_comment (ctype, text):
    """
    Wrap the text of a PO message comment at column 79.

    Every resulting line starts with C{"#"}, followed by the comment type
    and a space, as assembled by this function.

    @param ctype:
        the comment type, placed between the initial C{"#"}
        and the space which precedes the text
    @type ctype: string

    @param text: the text of the comment
    @type text: string

    @returns: wrapped comment lines (each ends with a newline)
    @rtype: list of strings
    """

    line_lead = "#" + ctype + " "
    # Mid-word breaks are off in order to prevent e.g. very long source
    # references being forced split in the middle.
    # Trailing whitespace removal is forced in order to remove
    # the trailing space in empty comments.
    return wrap_text(text, 79, lead=line_lead,
                     femp=False, midbr=False, remtrws=True)
0411
0412
def wrap_comment_unwrap (ctype, text):
    """
    Split the text of a PO message comment into lines
    at unconditional breaks only (no column-wrapping).

    The wrapping column is passed as zero; the lines are led
    as by L{wrap_comment}, with trailing whitespace always removed.

    The parameters and return values are as for L{wrap_comment}.

    @see: L{wrap_comment}
    """

    options = dict(
        lead="#" + ctype + " ",
        femp=False,
        remtrws=True,
    )
    return wrap_text(text, 0, **options)
0426
0427
def wrap_field_fine (field, text, preseq=""):
    """
    Wrap the text of a PO message field into quoted lines at column 79,
    additionally breaking at selected markup elements.

    On top of this module's natural and unconditional breaks, lines are
    broken at the tags listed in this module as normally breaking and as
    closed in-place. No minimal column for natural breaks is requested.

    This function can be sent as parameter to L{Message} and L{Catalog}
    methods and constructors.

    The parameters and return values are as for L{wrap_field}.

    @see: L{wrap_field}
    """

    first_lead = preseq + field + " \""
    other_lead = preseq + "\""
    return wrap_text(
        text, 79,
        lead=other_lead, trail="\"", flead=first_lead, femp=True,
        natbr=_natbr_after, natbr2=_natbr_before,
        prebr=_prebr, postbr=_postbr,
        tagbr=_tagbr_normal, tagbr2=_tagbr_inplace,
    )
0451
0452
def wrap_field_fine_unwrap (field, text, preseq=""):
    """
    Split the text of a PO message field into quoted lines at unconditional
    breaks and at selected markup elements (no column-wrapping).

    The wrapping column is passed as zero, and no natural break
    characters are given; the tags to break at are those listed in this
    module as normally breaking and as closed in-place.

    This function can be sent as parameter to L{Message} and L{Catalog}
    methods and constructors.

    The parameters and return values are as for L{wrap_field}.

    @see: L{wrap_field}
    """

    options = dict(
        flead=preseq + field + " \"",
        lead=preseq + "\"",
        trail="\"",
        prebr=_prebr,
        postbr=_postbr,
        tagbr=_tagbr_normal,
        tagbr2=_tagbr_inplace,
        femp=True,
    )
    return wrap_text(text, 0, **options)
0475
0476
def select_field_wrapper (wrapkw):
    """
    Pick the wrap function for PO message fields according to keywords.

    The wrap function is determined by which of the following keywords
    are present in the given sequence:
      - C{"basic"}: wrapping on column count
      - C{"fine"}: wrapping on logical breaks (such as C{<p>} or C{<para>} tags)
    Wrapping on newline characters is always engaged.
    If C{wrapkw} is given as C{None}, C{"basic"} only is assumed.

    @param wrapkw: wrapping keywords
    @type wrapkw: sequence of strings or C{None}

    @returns: wrapping function
    @rtype: (string, string, string?)->[string]

    @see: L{wrap_field}
    """

    if wrapkw is None:
        wrapkw = ["basic"]

    by_column = "basic" in wrapkw
    by_markup = "fine" in wrapkw

    if by_column and by_markup:
        return wrap_field_fine
    if by_column:
        return wrap_field
    if by_markup:
        return wrap_field_fine_unwrap
    return wrap_field_unwrap
0512
0513
def select_field_wrapping (cfgsec=None, cat=None, cmlopt=None):
    """
    Determine wrapping keywords for PO message fields from various inputs.

    Wrapping information can come from three sources:
      - a user configuration section, queried for the boolean fields
        C{wrap} and C{fine-wrap}
      - the catalog to which the wrapping should be applied,
        queried through its C{wrapping()} method
      - command line options, read from the attributes
        C{do_wrap} and C{do_fine_wrap}
    The sources are examined in this order, each later one overriding
    the earlier ones, starting from C{"basic"} wrapping alone.
    In the configuration and command line sources only exactly C{True}
    and C{False} have effect; any other value (such as C{None})
    leaves the keyword as it was.
    Any of the sources can be omitted;
    if all are omitted, C{("basic",)} is returned.

    @param cfgsec: a section of user configuration
    @type cfgsec: L{section<config.section>}
    @param cat: the catalog to be wrapped
    @type cat: L{Catalog<catalog.Catalog>}
    @param cmlopt: command line options
    @type cmlopt: object with C{do_wrap} and C{do_fine_wrap} attributes

    @returns: sorted L{wrapping keywords<select_field_wrapper>}
    @rtype: (string*)

    @see: L{select_field_wrapper}
    """

    # Wrapping by default.
    selected = ["basic"]

    # Switch a keyword on or off; states other than exactly
    # True or False leave the selection untouched.
    def switch (state, keyword):
        present = keyword in selected
        if state is True:
            if not present:
                selected.append(keyword)
        elif state is False:
            if present:
                selected.remove(keyword)

    # Lowest priority: user configuration.
    if cfgsec is not None:
        switch(cfgsec.boolean("wrap", None), "basic")
        switch(cfgsec.boolean("fine-wrap", None), "fine")

    # Middle priority: the catalog itself.
    from_catalog = None
    if cat is not None:
        from_catalog = cat.wrapping()
    if from_catalog is not None:
        for keyword in ("basic", "fine"):
            switch(keyword in from_catalog, keyword)

    # Highest priority: command line.
    if cmlopt is not None:
        switch(cmlopt.do_wrap, "basic")
        switch(cmlopt.do_fine_wrap, "fine")

    return tuple(sorted(selected))
0567