# File indexing completed on 2024-11-03 11:24:03
# -*- coding: UTF-8 -*-

"""
Splitting message fields into syntactical elements.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

import re

from pology.resolve import remove_accelerator


# A word is a run of word characters, optionally with one inner apostrophe
# (so that e.g. "don't" stays a single word).
_word_rxp = r"(?:\w+[']\w+|\w+)"

# Plain text: every segment is either a run of non-word characters or a word.
_split_rx = re.compile(r"[^\w]+|%s" % _word_rxp, re.U)

# Text with markup: a tag or entity, together with the non-word characters
# around it, is taken as one non-word segment. Numeric character references
# may be decimal (&#123;) or hexadecimal (&#x1F;); hexadecimal digits include
# the letters A-F, which a plain \d+ would not match.
_split_rx_markup = re.compile(
    r"[^\w]*(<.*?>|&[\w.:-]+;|&#(?:x[0-9a-fA-F]+|\d+);)[^\w<&]*"
    r"|[^\w]+|%s" % _word_rxp, re.U)

# A segment counts as a word if it starts with a word character.
_word_rx = re.compile(r"^\w", re.U)


def split_text (text, markup=False, format=None):
    """
    Split text into words and intersections.

    The text is split into lists of words and intersections (inter-word
    segments), such that there is always an intersection before the first and
    after the last word, even if empty. That is, there is always one more of
    intersections than of words.

    The text may contain C{<...>} tags, and be of certain format supported
    by Gettext (e.g. C{c-format}). If specified, these elements may influence
    splitting.

    @param text: the text to split
    @type text: string

    @param markup: whether text contains markup tags
    @type markup: bool

    @param format: Gettext format flag
    @type format: None or string

    @returns: words and intersections
    @rtype: list of strings, list of strings
    """

    split_rx = _split_rx_markup if markup else _split_rx

    words = []
    intrs = []
    lastword = False
    for m in split_rx.finditer(text):
        seg = m.group(0)
        isword = bool(_word_rx.search(seg))
        segs = words if isword else intrs
        if segs and isword == lastword:
            # Same kind of segment as the previous one: merge, so that
            # words and intersections strictly alternate.
            segs[-1] += seg
        else:
            segs.append(seg)
        lastword = isword

    # Guarantee an intersection after the last and before the first word.
    if lastword:
        intrs.append("")
    if len(intrs) == len(words):
        intrs.insert(0, "")

    if format == "c-format":
        words, intrs = _mod_on_format_c(words, intrs)
    elif format == "qt-format":
        words, intrs = _mod_on_format_qt(words, intrs)

    return words, intrs


# Intersection ending in a lone percent sign (not "%%"), optionally followed
# by a single space.
_mf_c_rx = re.compile(r"(?:^|[^%])(% ?)$")

def _mod_on_format_c (words, intrs):
    """
    Reattach C format directive starts to the words that follow them.

    When the intersection preceding a word ends in an unescaped C{%}
    (possibly followed by one space), that tail is moved from
    the intersection to the front of the word.
    The lists are modified in place and also returned.

    @param words: words as split from the text
    @type words: list of strings

    @param intrs: intersections as split from the text (one more than words)
    @type intrs: list of strings

    @returns: modified words and intersections
    @rtype: list of strings, list of strings
    """

    for i, word in enumerate(words):
        m = _mf_c_rx.search(intrs[i])
        if m:
            dirst = m.group(1)
            intrs[i] = intrs[i][:-len(dirst)]
            words[i] = dirst + word

    return words, intrs


# Word which starts like the tail of a Qt format directive (%1, %L1).
_mf_qt_rx = re.compile(r"^L?\d")

def _mod_on_format_qt (words, intrs):
    """
    Reattach Qt format directive starts to the words that follow them.

    When the intersection preceding a word ends in C{%} and the word starts
    with a digit (optionally preceded by C{L}), the percent sign is moved
    from the intersection to the front of the word.
    The lists are modified in place and also returned.

    @param words: words as split from the text
    @type words: list of strings

    @param intrs: intersections as split from the text (one more than words)
    @type intrs: list of strings

    @returns: modified words and intersections
    @rtype: list of strings, list of strings
    """

    for i, word in enumerate(words):
        if intrs[i].endswith("%") and _mf_qt_rx.search(word):
            intrs[i] = intrs[i][:-1]
            words[i] = "%" + word

    return words, intrs


# Regexes for text removals to get proper words.
# Second member of the tuple is the replacement string.
# URLs with scheme, bare www hosts, and host.tld names.
_r_url_rx = (re.compile(r"[a-zA-Z0-9.+-]+://[^\s]*"
                        r"|www\.[\w.-]{1,250}"
                        r"|\b[\w.-]+\.[a-z]{2,3}\b"
                        , re.I|re.U), "")
_r_email_rx = (re.compile(r"\b[\w.-]+@[\w.-]+", re.U), "")
# Shell variables: $name and ${...}.
_r_shvar_rx = (re.compile(r"\$(\w+|\{.*?\})", re.U), "")
# Command line options: -x, --long-option, /option.
_r_shopt_rx = (re.compile(r"(^|[^\w])(--|-|/)[\w-]+", re.U), "")
_r_tags_rx = (re.compile(r"<.*?>"), " ")
_r_ents_rx = (re.compile(r"&[\w.:-]+;", re.U), " ")
# Numeric character references, decimal (&#123;) or hexadecimal (&#x1F;);
# hexadecimal digits include the letters A-F.
_r_numents_rx = (re.compile(r"&#(?:x[0-9a-fA-F]+|\d+);"), " ")
_r_digits_rx = (re.compile(r"[\d⁰¹²³⁴⁵⁶⁷⁸⁹₀₁₂₃₄₅₆₇₈₉]+"), " ")

# Format directives; the third member of the tuple is the prefix of
# the Gettext format flag (e.g. "c" for "c-format") to which the regex applies.
# C directive: optional positional argument (1$), flags, width, precision,
# length modifier, then the conversion letter. The length modifier must be
# consumed together with the conversion, or e.g. "%ld" would leave "d" behind.
_r_fmtd_c_rx = (re.compile(r"(?<!%)%(?:\d+\$)?[-+ #0]*(?:\d+|\*)?"
                           r"(?:\.(?:\d+|\*)?)?"
                           r"(?:hh|h|ll|l|L|q|j|z|t)?[a-zA-Z]"), "", "c")
_r_fmtd_qt_rx = (re.compile(r"%\d+"), "", "qt")
_r_fmtd_python_rx = (re.compile(r"(?<!%)%\(\w+\)[a-z]"), "", "python")

_remove_xml_rxs = [
    _r_tags_rx, # before entities
    _r_ents_rx,
    _r_numents_rx,
]
_remove_rxs = [
    _r_email_rx, # before URLs
    _r_url_rx,
    _r_shvar_rx,
    _r_shopt_rx,
    _r_digits_rx,
]
_remove_fmtd_rxs = [
    _r_fmtd_c_rx,
    _r_fmtd_qt_rx,
    _r_fmtd_python_rx,
]

# Pass words when:
# - no underscores
_word_ok_rx = re.compile(r"^[^_]*$", re.U)


def proper_words (text, markup=False, accels=None, format=None):
    """
    Mine proper words out of the text.

    The proper words are those one would expect to find in a dictionary,
    or at least having that latent quality (jargon, etc.)
    As opposed to URLs, email addresses, shell variables, etc.

    The text may contain XML-like markup (C{<...>} tags, entities...),
    or keyboard accelerator markers.
    It may also be of certain format known to Gettext (e.g. C{c-format}).
    If specified, these elements may influence splitting.

    @param text: the text to split
    @type text: string

    @param markup: whether text contains markup tags
    @type markup: bool

    @param accels: accelerator characters to ignore
    @type accels: sequence

    @param format: Gettext format flag
    @type format: None or string

    @returns: proper words
    @rtype: list of strings
    """

    # Remove markup.
    # (before format directives)
    if markup:
        for rem_rx, sub in _remove_xml_rxs:
            text = rem_rx.sub(sub, text)

    # Remove format directives.
    # (before general non-words)
    if format:
        for rem_rx, sub, clng in _remove_fmtd_rxs:
            # Only the regex matching the given format flag is applied.
            if format.startswith(clng + "-"):
                text = rem_rx.sub(sub, text)

    # Remove general known non-words.
    for rem_rx, sub in _remove_rxs:
        text = rem_rx.sub(sub, text)

    # Remove accelerators (must come after other replacements).
    text = remove_accelerator(text, accels, greedy=True)

    # Markup and format directives are gone by now,
    # so the text is split as plain.
    rwords = split_text(text)[0]
    words = [x for x in rwords if _word_ok_rx.search(x)]

    return words