lang/es/remove_subs.py

0001 # -*- coding: utf-8 -*-
0002
0003 """
0004 Remove special substrings from parts of the message.
0005
0006 @author: Javier Viñal <fjvinal@gmail.com>
0007 @license: GPLv3
0008 """
0009
0010 import re
0011 #from pology import PologyError, datadir, _, n_
0012 #from pology.report import report, warning, format_item_list
0013
0014
# Capitalized words of the translated text located where Spanish grammar
# allows a capital letter: right after one of the characters . : ? ! > » "
# followed by a whitespace character (middle), or at the very beginning of
# the text (initial).
# Raw strings are required so that \s, \w and \b reach the regex engine
# unchanged instead of being interpreted as string escapes.
_valid_capital_word_middle = re.compile(r'(?<=[.:?!>»"]\s)\w*?[A-ZÁÉÍÓÚÜÑÇ]\w*', re.U)
_valid_capital_word_initial = re.compile(r'^\w*?[A-ZÁÉÍÓÚÜÑÇ]\w*', re.U)

# All words of the original English text containing an ASCII capital letter.
_ent_capital_word = re.compile(r'\w*?[A-Z]\w*', re.U)
# All plural full-capital words (acronyms) without the final "s" or "'s".
# NOTE: in a non-raw string "\b" is a backspace character, not a word
# boundary, which made this pattern unable to match anything.
_ent_capital_word_plural = re.compile(r"[A-Z0-9]+(?='?s\b)", re.U)
0024
def remove_paired_capital_words(msg, cat):
    """
    Replace capital words with "~" in the original and the translated text
    [type F4A hook].

    For every msgstr form, the set of words to replace is the union of:
      - the capital words of the translation that sit in a position where
        Spanish grammar allows a capital letter, and
      - the capital words of the matching original text (msgid for the
        first form, msgid_plural for all the others).
    Each word of that set is replaced, as a whole word, both in the msgstr
    form and in the matching original text.

    @param msg: message to modify in place
    @param cat: not used by this hook

    @return: number of errors (always 0)
    """

    # Collect the English capital words before anything is modified: the
    # loop below rewrites msgid and msgid_plural, so reading them lazily
    # would miss the words already replaced by an earlier plural form.
    orig_words = set(_ent_capital_word.findall(msg.msgid))
    orig_words.update(_ent_capital_word_plural.findall(msg.msgid))

    orig_words_plural = set()
    if msg.msgid_plural:
        orig_words_plural.update(_ent_capital_word.findall(msg.msgid_plural))
        orig_words_plural.update(_ent_capital_word_plural.findall(msg.msgid_plural))

    for i in range(len(msg.msgstr)):
        # Capital words in valid contexts of this translated form.
        ents = set(_valid_capital_word_middle.findall(msg.msgstr[i]))
        ents.update(_valid_capital_word_initial.findall(msg.msgstr[i]))
        # Join them with the capital words of the matching original text.
        ents.update(orig_words if i == 0 else orig_words_plural)

        for ent in ents:
            # Whole-word match only; the word is escaped so that it is
            # always taken literally.
            word_rx = re.compile(r'\b' + re.escape(ent) + r'\b', re.U)
            msg.msgstr[i] = word_rx.sub('~', msg.msgstr[i])
            if i == 0:
                msg.msgid = word_rx.sub('~', msg.msgid)
            elif msg.msgid_plural:
                # Guarded: there is nothing to rewrite when the message
                # has no plural original text.
                msg.msgid_plural = word_rx.sub('~', msg.msgid_plural)

    # The remaining words could have wrong capitalization in the translated message.
    # TODO: Look up the remaining words in a Spanish dictionary.

    return 0
0059
def remove_original_capital_words(msg, cat):
    """
    Replace in the translated text, with "~", every capital word found in
    the original text [type F4A hook].

    Words found in msgid are replaced in the first msgstr form; words found
    in msgid_plural (when it is set) are replaced in all the other forms.
    The original text itself is left unchanged.

    @param msg: message to modify in place
    @param cat: not used by this hook

    @return: number of errors (always 0)
    """

    # The original text is never modified here, so both word sets can be
    # computed once instead of once per msgstr form.
    singular_words = set(_ent_capital_word.findall(msg.msgid))
    singular_words.update(_ent_capital_word_plural.findall(msg.msgid))

    plural_words = set()
    if msg.msgid_plural:
        plural_words.update(_ent_capital_word.findall(msg.msgid_plural))
        plural_words.update(_ent_capital_word_plural.findall(msg.msgid_plural))

    for i in range(len(msg.msgstr)):
        # Remove English capital words from the translated text.
        for ent in (singular_words if i == 0 else plural_words):
            # flags is passed by keyword: positional count/flags arguments
            # to re.sub are deprecated.
            msg.msgstr[i] = re.sub(r'\b' + re.escape(ent) + r'\b', '~',
                                   msg.msgstr[i], flags=re.U)

    return 0
0084
# Placeholders treated as parameters: %<digit> with an optional trailing %,
# ${...}, $word, %d / %s (optionally with a <digit>$ position) and %|...|.
# Written as a raw string so the backslashes reach the regex engine as-is.
_ent_parameter = re.compile(r"(%\d%?|\$\{.+?\}|\$\w+|%(?:\d\$)?[ds]|%\|.+?\|)", re.U)
0086
def remove_paired_parameters(msg, cat):
    """
    Replace with "~" every format parameter that appears both in the
    original text and in its translation [type F4A hook].

    The first msgstr form is paired with msgid, all the other forms with
    msgid_plural. A parameter present on one side only is left untouched.

    @return: number of errors
    """

    singular_params = set(_ent_parameter.findall(msg.msgid))

    plural_params = set()
    if msg.msgid_plural:
        plural_params = set(_ent_parameter.findall(msg.msgid_plural))

    # Index-based loop: the msgstr entries are reassigned in place.
    for form in range(len(msg.msgstr)):
        found = set(_ent_parameter.findall(msg.msgstr[form]))

        if form != 0:
            # Plural forms are matched against the plural original.
            for token in found.intersection(plural_params):
                msg.msgid_plural = msg.msgid_plural.replace(token, "~")
                msg.msgstr[form] = msg.msgstr[form].replace(token, "~")
            continue

        for token in found.intersection(singular_params):
            msg.msgid = msg.msgid.replace(token, "~")
            msg.msgstr[form] = msg.msgstr[form].replace(token, "~")

    return 0
0114
# Matches a bare opening or closing tag without attributes, e.g. <b> or </b>.
# Raw string: the previous "\<", "\/" and "\>" were invalid string escapes.
_ent_xml_entity = re.compile(r"</?\w+>")

# Words that mark a message as not needing translation when one of them
# appears, whitespace-separated, in an automatic comment or in the context
# of the message (see remove_tags_without_translation).
_auto_comment_tag = (
    "trans_comment", "literallayout", "option", "programlisting", "othercredit",
    "author", "email", "holder",
    "surname", "personname", "affiliation", "address", "sect1", "chapter",
    "chapterinfo", "date", "command",
    "refentrytitle", "refentryinfo", "refname", "synopsis", "literal", "varname",
    "term", "glossterm",
    "filename", "entry", "envar", "userinput", "cmdsynopsis", "releaseinfo",
    "language", "Name",
    "City", "Region", "Region/state", "unit", "Query", "Kgm",
)
0123
def _clear_message_texts(msg):
    """
    Blank msgid, msgid_plural (only when it is set) and every msgstr form.
    """
    msg.msgid = ""
    # Left untouched when unset, so that a non-plural message does not end
    # up with an empty plural text.
    if msg.msgid_plural:
        msg.msgid_plural = ""
    for i in range(len(msg.msgstr)):
        msg.msgstr[i] = ""


def remove_tags_without_translation(msg, cat):
    """
    Blank all the texts of messages that belong to contexts which do not
    need translation [type F4A hook].

    A message is blanked when its context is one of the translator-credit
    contexts, or when any whitespace-separated word of its automatic
    comments or of its context is listed in _auto_comment_tag.

    @param msg: message to modify in place
    @param cat: not used by this hook

    @return: number of errors (always 0)
    """

    if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"):
        _clear_message_texts(msg)
        return 0

    # Avoid specially tagged messages: gather the candidate words from the
    # automatic comments first, then from the context when there is one.
    words = [word for tagline in msg.auto_comment for word in tagline.split()]
    if msg.msgctxt:
        words.extend(msg.msgctxt.split())

    if any(word in _auto_comment_tag for word in words):
        _clear_message_texts(msg)

    return 0