pology/pology/split.py

0001 # -*- coding: UTF-8 -*-
0002
0003 """
0004 Splitting message fields into syntactical elements.
0005
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009
0010 import re
0011
0012 from pology.resolve import remove_accelerator
0013
0014
# A word is a run of word characters, optionally holding one inner
# apostrophe (as in "don't").
_word_rxp = r"(?:\w+[']\w+|\w+)"
# Plain text: a segment is either a run of non-word characters or a word.
_split_rx = re.compile(r"[^\w]+|%s" % _word_rxp, re.U)
# Markup text: a tag or an entity, together with the non-word characters
# around it, is taken as a single non-word segment.
# Numeric character references may be decimal (&#171;) or hexadecimal
# (&#xAB;); hexadecimal digits must be matched explicitly, since \d alone
# would stop at the first a-f digit and leave a bogus word behind.
_split_rx_markup = re.compile(
    r"[^\w]*(<.*?>|&[\w.:-]+;|&#(?:x[0-9a-fA-F]+|\d+);)[^\w<&]*"
    r"|[^\w]+|%s" % _word_rxp, re.U)
# A segment counts as a word if it starts with a word character.
_word_rx = re.compile(r"^\w", re.U)


def split_text (text, markup=False, format=None):
    """
    Split text into words and intersections.

    The text is split into lists of words and intersections (inter-word
    segments), such that there is always an intersection before the first and
    after the last word, even if empty. That is, there is always one more of
    intersections than of words.

    The text may contain C{<...>} tags, and be of certain format supported
    by Gettext (e.g. C{c-format}). If specified, these elements may influence
    splitting: with C{markup} set, tags and entities (named, decimal and
    hexadecimal) are kept within intersections; with C{format} equal to
    C{"c-format"} or C{"qt-format"}, the words and intersections are
    post-processed by the corresponding format-specific function.

    @param text: the text to split
    @type text: string

    @param markup: whether text contains markup tags
    @type markup: bool

    @param format: Gettext format flag
    @type format: None or string

    @returns: words and intersections
    @rtype: list of strings, list of strings
    """

    split_rx = _split_rx_markup if markup else _split_rx

    words = []
    intrs = []
    lastword = False
    for m in split_rx.finditer(text):
        seg = m.group(0)
        if _word_rx.search(seg):
            # Adjacent word segments are glued into a single word.
            if lastword and words:
                words[-1] += seg
            else:
                words.append(seg)
            lastword = True
        else:
            # Adjacent non-word segments are glued into a single intersection.
            if not lastword and intrs:
                intrs[-1] += seg
            else:
                intrs.append(seg)
            lastword = False

    # Make sure there is an intersection after the last word...
    if lastword:
        intrs.append("")
    # ...and before the first one (also covers the empty text).
    if len(intrs) == len(words):
        intrs.insert(0, "")

    if format == "c-format":
        words, intrs = _mod_on_format_c(words, intrs)
    elif format == "qt-format":
        words, intrs = _mod_on_format_qt(words, intrs)

    return words, intrs
0084
0085
0086 _mf_c_rx = re.compile(r"(?:^|[^%])(% ?)$")
0087
0088 def _mod_on_format_c (words, intrs):
0089
0090     for i in range(len(words)):
0091         m = _mf_c_rx.search(intrs[i])
0092         if m:
0093             dirst = m.group(1)
0094             intrs[i] = intrs[i][:-len(dirst)]
0095             words[i] = dirst + words[i]
0096
0097     return words, intrs
0098
0099
0100 _mf_qt_rx = re.compile(r"^L?\d")
0101
0102 def _mod_on_format_qt (words, intrs):
0103
0104     for i in range(len(words)):
0105         if intrs[i].endswith("%") and _mf_qt_rx.search(words[i]):
0106             intrs[i] = intrs[i][:-1]
0107             words[i] = "%" + words[i]
0108
0109     return words, intrs
0110
0111
0112 # Regexes for text removals to get proper words.
0113 # Second member of the tuple is the replacement string.
0114 _r_url_rx = (re.compile(r"[a-zA-Z0-9.+-]+://[^\s]*"
0115                         r"|www\.[\w.-]{1,250}"
0116                         r"|\b[\w.-]+\.[a-z]{2,3}\b"
0117                        , re.I|re.U), "")
0118 _r_email_rx = (re.compile(r"\b[\w.-]+@[\w.-]+", re.U), "")
0119 _r_shvar_rx = (re.compile(r"\$(\w+|\{.*?\})", re.U), "")
0120 _r_shopt_rx = (re.compile(r"(^|[^\w])(--|-|/)[\w-]+", re.U), "")
0121 _r_tags_rx = (re.compile(r"<.*?>"), " ")
0122 _r_ents_rx = (re.compile(r"&[\w.:-]+;", re.U), " ")
0123 _r_numents_rx = (re.compile(r"&#x?\d+;"), " ")
0124 _r_digits_rx = (re.compile(r"[\d⁰¹²³⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆₇₈₉]+"), " ")
0125
0126 _r_fmtd_c_rx = (re.compile(r"(?<!%)%($\d+)?[+ ]?(\d+)?\.?(\d+)?[a-z]"), "", "c")
0127 _r_fmtd_qt_rx = (re.compile(r"%\d+"), "", "qt")
0128 _r_fmtd_python_rx = (re.compile(r"(?<!%)%\(\w+\)[a-z]"), "", "python")
0129
0130 _remove_xml_rxs = [
0131     _r_tags_rx, # before entities
0132     _r_ents_rx,
0133     _r_numents_rx,
0134 ]
0135 _remove_rxs = [
0136     _r_email_rx, # before URLs
0137     _r_url_rx,
0138     _r_shvar_rx,
0139     _r_shopt_rx,
0140     _r_digits_rx,
0141 ]
0142 _remove_fmtd_rxs = [
0143     _r_fmtd_c_rx,
0144     _r_fmtd_qt_rx,
0145     _r_fmtd_python_rx,
0146 ]
0147
0148 # Pass words when:
0149 # - no underscores
0150 _word_ok_rx = re.compile(r"^[^_]*$", re.U)
0151
0152
def proper_words (text, markup=False, accels=None, format=None):
    """
    Extract proper words from the text.

    Proper words are those that could be expected in a dictionary,
    or that at least have such character (jargon, etc.),
    unlike URLs, email addresses, shell variables, and the like.

    The text may contain XML-like markup (C{<...>} tags, entities...),
    or keyboard accelerator markers.
    It may also be of certain format known to Gettext (e.g. C{c-format}).
    If specified, these elements may influence splitting.

    @param text: the text to split
    @type text: string

    @param markup: whether text contains markup tags
    @type markup: bool

    @param accels: accelerator characters to ignore
    @type accels: sequence

    @param format: Gettext format flag
    @type format: None or string

    @returns: proper words
    @rtype: list of strings
    """

    # Collect removal passes in the order in which they must be applied:
    # markup first, then format directives, then general known non-words.
    passes = []
    if markup:
        passes.extend(_remove_xml_rxs)
    if format:
        passes.extend((rx, repl) for rx, repl, lang in _remove_fmtd_rxs
                      if format.startswith(lang + "-"))
    passes.extend(_remove_rxs)

    for rx, repl in passes:
        text = rx.sub(repl, text)

    # Accelerators must be removed only after all other replacements.
    text = remove_accelerator(text, accels, greedy=True)

    # Only the words of the split are of interest, and of those
    # only the ones accepted by the final word filter.
    candidates, _ = split_text(text)
    return [word for word in candidates if _word_ok_rx.search(word)]