File indexing completed on 2024-11-03 11:24:03

0001 # -*- coding: UTF-8 -*-
0002 
0003 """
0004 Splitting message fields into syntactical elements.
0005 
0006 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0007 @license: GPLv3
0008 """
0009 
0010 import re
0011 
0012 from pology.resolve import remove_accelerator
0013 
0014 
# A word is a run of word characters, optionally containing a single
# inner apostrophe (e.g. "don't").
_word_rxp = r"(?:\w+[']\w+|\w+)"

# Plain-text splitter: each match is either a run of non-word characters
# or a word.
_split_rx = re.compile(r"[^\w]+|" + _word_rxp, re.U)

# Markup-aware splitter: a tag, a named entity or a numeric character
# reference, together with the non-word characters around it, is consumed
# as a single non-word segment.
# Numeric references are matched in both decimal (&#169;) and hexadecimal
# (&#xA9;) form; hexadecimal digits include the letters a-f, which a plain
# \d+ would not accept.
_split_rx_markup = re.compile(
    r"[^\w]*(<.*?>|&[\w.:-]+;|&#(?:x[0-9a-fA-F]+|\d+);)[^\w<&]*"
    r"|[^\w]+|" + _word_rxp,
    re.U)

# A segment is a word if it starts with a word character.
_word_rx = re.compile(r"^\w", re.U)
0020 
0021 
def split_text (text, markup=False, format=None):
    """
    Break text into a list of words and a list of intersections.

    Intersections are the segments between words. There is always one
    intersection before the first word and one after the last word
    (either may be an empty string), so the list of intersections is
    always exactly one element longer than the list of words.

    When C{markup} is true, C{<...>} tags and entities are treated as
    parts of intersections. When C{format} is C{"c-format"} or
    C{"qt-format"}, the format directive marker is moved from
    the intersection onto the word that follows it.

    @param text: the text to split
    @type text: string

    @param markup: whether text contains markup tags
    @type markup: bool

    @param format: Gettext format flag
    @type format: None or string

    @returns: words and intersections
    @rtype: list of strings, list of strings
    """

    splitter = _split_rx_markup if markup else _split_rx

    words = []
    intrs = []
    prev_was_word = False
    for match in splitter.finditer(text):
        segment = match.group(0)
        is_word = _word_rx.search(segment) is not None
        bucket = words if is_word else intrs
        # Two consecutive segments of the same kind are glued together,
        # so that words and intersections strictly alternate.
        if bucket and prev_was_word == is_word:
            bucket[-1] += segment
        else:
            bucket.append(segment)
        prev_was_word = is_word

    # Close with a trailing intersection if the text ended in a word.
    if prev_was_word:
        intrs.append("")
    # Open with a leading intersection if the text started with a word
    # (also covers the empty text, which yields a single intersection).
    if len(intrs) == len(words):
        intrs.insert(0, "")

    if format == "c-format":
        return _mod_on_format_c(words, intrs)
    if format == "qt-format":
        return _mod_on_format_qt(words, intrs)
    return words, intrs
0084 
0085 
# An unescaped percent sign (optionally followed by the space flag) at the
# very end of an intersection. \Z is used instead of $ because $ would also
# match just before a trailing newline, which is not the end of the segment.
_mf_c_rx = re.compile(r"(?:^|[^%])(% ?)\Z")

def _mod_on_format_c (words, intrs):
    """
    Move C format directive markers from intersections onto words.

    If the intersection preceding a word ends with an unescaped C{%}
    (possibly followed by a space), that tail is cut off the intersection
    and prepended to the word. Both lists are modified in place.

    @param words: words of the text
    @type words: list of strings

    @param intrs: intersections of the text, one more than words
    @type intrs: list of strings

    @returns: the same two lists, after modification
    @rtype: list of strings, list of strings
    """

    for pos, word in enumerate(words):
        lead = intrs[pos]
        hit = _mf_c_rx.search(lead)
        if hit is None:
            continue
        # The captured marker extends to the end of the intersection,
        # so the intersection is split exactly where the marker begins.
        cut = hit.start(1)
        intrs[pos] = lead[:cut]
        words[pos] = lead[cut:] + word

    return words, intrs
0098 
0099 
# A word that can be the body of a Qt format directive: a digit,
# optionally preceded by the letter L.
_mf_qt_rx = re.compile(r"^L?\d")

def _mod_on_format_qt (words, intrs):
    """
    Move Qt format directive markers from intersections onto words.

    If the intersection preceding a word ends with C{%} and the word
    starts like the body of a Qt directive, the percent sign is cut off
    the intersection and prepended to the word. Both lists are modified
    in place.

    @param words: words of the text
    @type words: list of strings

    @param intrs: intersections of the text, one more than words
    @type intrs: list of strings

    @returns: the same two lists, after modification
    @rtype: list of strings, list of strings
    """

    for pos, word in enumerate(words):
        lead = intrs[pos]
        if not lead.endswith("%"):
            continue
        if _mf_qt_rx.search(word) is None:
            continue
        intrs[pos] = lead[:-1]
        words[pos] = "%" + word

    return words, intrs
0110 
0111 
# Regexes for text removals to get proper words.
# Each entry is a tuple of the compiled regex and the replacement string.

# URLs: scheme://..., www.-prefixed hosts, and dotted names ending
# in a two or three letter suffix.
_r_url_rx = (re.compile(r"[a-zA-Z0-9.+-]+://[^\s]*"
                        r"|www\.[\w.-]{1,250}"
                        r"|\b[\w.-]+\.[a-z]{2,3}\b"
                       , re.I|re.U), "")
# Email addresses.
_r_email_rx = (re.compile(r"\b[\w.-]+@[\w.-]+", re.U), "")
# Shell variables: $name and ${...}.
_r_shvar_rx = (re.compile(r"\$(\w+|\{.*?\})", re.U), "")
# Command line options: -x, --long-option, /option
# (the non-word character in front of the option is removed as well).
_r_shopt_rx = (re.compile(r"(^|[^\w])(--|-|/)[\w-]+", re.U), "")
# Markup tags.
_r_tags_rx = (re.compile(r"<.*?>"), " ")
# Named entities.
_r_ents_rx = (re.compile(r"&[\w.:-]+;", re.U), " ")
# Numeric character references, decimal (&#169;) or hexadecimal (&#xA9;).
# Hexadecimal digits include the letters a-f, so \d+ alone is not enough.
_r_numents_rx = (re.compile(r"&#(?:x[0-9a-fA-F]+|\d+);"), " ")
# Runs of digits, including superscript and subscript digits.
_r_digits_rx = (re.compile(r"[\d⁰¹²³⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆₇₈₉]+"), " ")
0125 
# Regexes for removal of format directives.
# Each entry is a tuple of the compiled regex, the replacement string,
# and the prefix of the Gettext format flag to which it applies.

# C directives: unescaped %, optional positional argument (digits followed
# by a literal dollar sign, e.g. %1$s), optional + or space flag, optional
# width and precision, optional length modifier (e.g. %ld, %zu),
# and the conversion letter.
_r_fmtd_c_rx = (re.compile(r"(?<!%)%(\d+\$)?[+ ]?(\d+)?\.?(\d+)?"
                           r"(?:hh|ll|[hlLjzt])?[a-z]"), "", "c")
# Qt directives: %1, %2..., and the %L1 form also recognized
# when splitting qt-format text.
_r_fmtd_qt_rx = (re.compile(r"%L?\d+"), "", "qt")
# Python directives with named arguments: %(name)s.
_r_fmtd_python_rx = (re.compile(r"(?<!%)%\(\w+\)[a-z]"), "", "python")
0129 
# Removals applied to text with markup.
# Tags go first, before entities.
_remove_xml_rxs = [_r_tags_rx, _r_ents_rx, _r_numents_rx]

# Removals applied to any text.
# Email addresses go first, before URLs.
_remove_rxs = [
    _r_email_rx, _r_url_rx,
    _r_shvar_rx, _r_shopt_rx,
    _r_digits_rx,
]

# Removals of format directives, applied according to the format flag.
_remove_fmtd_rxs = [_r_fmtd_c_rx, _r_fmtd_qt_rx, _r_fmtd_python_rx]
0147 
# A word is accepted as proper only if it contains no underscores.
_word_ok_rx = re.compile(r"^[^_]*$", re.U)


def proper_words (text, markup=False, accels=None, format=None):
    """
    Extract proper words from the text.

    Proper words are those that could be expected in a dictionary
    (or are at least of that nature, like jargon), in contrast to
    URLs, email addresses, shell variables, and similar.

    The text may contain XML-like markup (C{<...>} tags, entities...)
    and keyboard accelerator markers, and it may be of a format known
    to Gettext (e.g. C{c-format}). When indicated through parameters,
    these elements are removed before the text is split into words.

    @param text: the text to split
    @type text: string

    @param markup: whether text contains markup tags
    @type markup: bool

    @param accels: accelerator characters to ignore
    @type accels: sequence

    @param format: Gettext format flag
    @type format: None or string

    @returns: proper words
    @rtype: list of strings
    """

    # Collect removals in the order in which they must be applied:
    # markup first, then format directives, then general non-words.
    removals = []
    if markup:
        removals.extend(_remove_xml_rxs)
    if format:
        removals.extend((rx, repl) for rx, repl, lang in _remove_fmtd_rxs
                        if format.startswith(lang + "-"))
    removals.extend(_remove_rxs)

    for rx, repl in removals:
        text = rx.sub(repl, text)

    # Accelerators must be removed only after all other replacements.
    text = remove_accelerator(text, accels, greedy=True)

    candidates = split_text(text)[0]
    return [word for word in candidates if _word_ok_rx.search(word)]