File indexing completed on 2024-11-03 08:24:27

0001 #!/usr/bin/env python3
0002 # -*- coding: UTF-8 -*-
0003 
0004 """
0005 Perform machine translation of PO files.
0006 
0007 Documented in C{doc/user/lingo.docbook#sec-lgmtrans}.
0008 
0009 @author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
0010 @license: GPLv3
0011 """
0012 
0013 try:
0014     import fallback_import_paths
0015 except:
0016     pass
0017 
0018 import locale
0019 import subprocess
0020 import sys
0021 import os
0022 
0023 from pology import datadir, version, _, n_
0024 from pology.catalog import Catalog
0025 from pology.colors import ColorOptionParser
0026 import pology.config as pology_config
0027 from pology.entities import read_entities
0028 from pology.fsops import collect_catalogs, collect_system
0029 from pology.fsops import str_to_unicode
0030 from pology.fsops import exit_on_exception
0031 from pology.message import MessageUnsafe
0032 from pology.remove import remove_accel_msg
0033 from pology.report import report, error, warning
0034 from pology.resolve import resolve_entities_simple
0035 
0036 
def main ():
    """
    Command line entry point: parse options and run machine translation.

    The first free argument names the translation service; the remaining
    free arguments are paths to search for PO files. Depending on whether
    a parallel compendium or parallel catalogs were requested, messages are
    translated either directly from the original text or from
    an existing translation into another language.
    """

    locale.setlocale(locale.LC_ALL, "")

    # Get defaults for command line options from global config.
    # NOTE(review): no option default is currently read from this section;
    # the lookup is kept as it was -- confirm whether it can be dropped.
    cfgsec = pology_config.section("pomtrans")

    # Setup options and parse the command line.
    usage = _("@info command usage",
        "%(cmd)s [OPTIONS] TRANSERV PATHS...",
        cmd="%prog")
    desc = _("@info command description",
        "Perform machine translation of PO files.")
    ver = _("@info command version",
        "%(cmd)s (Pology) %(version)s\n"
        "Copyright © 2009, 2010 "
        "Chusslove Illich (Часлав Илић) &lt;%(email)s&gt;",
        cmd="%prog", version=version(), email="caslav.ilic@gmx.net")

    opars = ColorOptionParser(usage=usage, description=desc, version=ver)
    opars.add_option(
        "-a", "--accelerator", dest="accel",
        metavar=_("@info command line value placeholder", "CHAR"),
        help=_("@info command line option description",
               "Accelerator marker character used in messages. "
               "Detected from catalogs if not given."))
    opars.add_option(
        "-c", "--parallel-compendium", dest="parcomp",
        metavar=_("@info command line value placeholder", "FILE"),
        help=_("@info command line option description",
               "Translate from translation to another language, "
               "found in compendium file at the given path."))
    opars.add_option(
        "-l", "--list-transervs",
        action="store_true", dest="list_transervs", default=False,
        # Routed through _() like every other option description,
        # so that it can be translated too.
        help=_("@info command line option description",
               "List available translation services."))
    opars.add_option(
        "-m", "--flag-%s" % _flag_mtrans,
        action="store_true", dest="flag_mtrans", default=False,
        help=_("@info command line option description",
              "Add '%(flag)s' flag to translated messages.",
              flag=_flag_mtrans))
    opars.add_option(
        "-M", "--translation-mode", dest="tmode",
        metavar=_("@info command line value placeholder", "MODE"),
        help=_("@info command line option description",
               "Translation mode for the chosen translation service. "
               "Overrides the default translation mode constructed "
               "based on source and target language. "
               "Mode string format is translation service dependent."))
    opars.add_option(
        "-n", "--no-fuzzy-flag",
        action="store_false", dest="flag_fuzzy", default=True,
        help=_("@info command line option description",
               "Do not add '%(flag)s' flag to translated messages.",
               flag="fuzzy"))
    opars.add_option(
        "-p", "--parallel-catalogs", dest="parcats",
        metavar=_("@info command line value placeholder", "SEARCH:REPLACE"),
        help=_("@info command line option description",
               "Translate from translation to another language "
               "found in parallel catalogs. "
               "For given target catalog path, the path to parallel catalog "
               "is constructed by replacing once SEARCH with REPLACE."))
    opars.add_option(
        "-s", "--source-lang", dest="slang",
        metavar=_("@info command line value placeholder", "LANG"),
        help=_("@info command line option description",
               "Source language code. "
               "Detected from catalogs if not given."))
    opars.add_option(
        "-t", "--target-lang", dest="tlang",
        metavar=_("@info command line value placeholder", "LANG"),
        help=_("@info command line option description",
               "Target language code. "
               "Detected from catalogs if not given."))
    opars.add_option(
        "-T", "--transerv-bin", dest="transerv_bin",
        metavar=_("@info command line value placeholder", "PATH"),
        help=_("@info command line option description",
               "Custom path to translation service executable "
               "(where applicable)."))
    opars.add_option(
        "-d", "--data-directory", dest="data_directory",
        metavar=_("@info command line value placeholder", "FOLDER"),
        help=_("@info command line option description",
               "Custom path to a translation data directory (where applicable)."))

    (op, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:]))

    # Could use some speedup.
    # Optional accelerator module; silently skipped when not installed.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    if op.list_transervs:
        report("\n".join(sorted(_known_transervs.keys())))
        sys.exit(0)

    # The code below relies on error() not returning to the caller.
    if not free_args:
        error(_("@info",
                "Translation service not specified."))
    transervkey = free_args.pop(0)
    if transervkey not in _known_transervs:
        error(_("@info",
                "Translation service '%(serv)s' not known.",
                serv=transervkey))

    # Bind the parsed options, so that the translation functions
    # only need to supply the language pair.
    tsbuilder_wopts = _known_transervs[transervkey]
    def tsbuilder (slang, tlang):
        return tsbuilder_wopts(slang, tlang, op)

    paths = free_args
    if not op.parcomp and not op.parcats:
        translate_direct(paths, tsbuilder, op)
    else:
        translate_parallel(paths, tsbuilder, op)
0158 
0159 
def translate_direct (paths, tsbuilder, options):
    """
    Translate untranslated messages directly from their original text.

    For every catalog collected from the given paths, the C{msgid}
    (and C{msgid_plural}, when present) of each message selected by
    C{to_translate()} is sent to the translation service, with
    accelerator markers removed first. Non-empty results are written
    into C{msgstr} and the message is decorated; then the catalog is synced.

    @param paths: paths from which to collect catalogs
    @param tsbuilder: callable C{(slang, tlang)} creating
        a translation service
    @param options: parsed command line options
    """

    # Source language does not depend on the catalog in direct mode.
    slang = options.slang or "en"

    catpaths = collect_catalogs(paths)
    for catpath in catpaths:

        # Collect messages and texts to translate.
        cat = Catalog(catpath)
        if options.accel is not None: # force explicitly given accelerator
            cat.set_accelerator(options.accel)
        texts = []
        msgs = []
        for msg in cat:
            if not to_translate(msg, options):
                continue
            # Remove accelerators on a copy, to leave the message intact.
            msgf = MessageUnsafe(msg)
            remove_accel_msg(msgf, cat)
            texts.append(msgf.msgid)
            if msg.msgid_plural is not None:
                texts.append(msgf.msgid_plural)
            msgs.append(msg)

        # Translate collected texts.
        transerv = get_transerv(slang, options.tlang, cat, cat, tsbuilder)
        texts_tr = transerv.translate(texts) if texts else []
        if texts_tr is None:
            warning(_("@info",
                      "Translation service failure on '%(file)s'.",
                      file=catpath))
            continue
        enc = cat.encoding()
        # Consume translations through an iterator, in the same order
        # in which the originals were collected (no repeated pop(0)).
        texts_tr_it = iter([reduce_for_encoding(x, enc) for x in texts_tr])

        # Put translated texts into messages.
        singlepls = cat.plural_indices_single()
        for msg in msgs:
            is_plural = msg.msgid_plural is not None
            msgid_tr = next(texts_tr_it)
            msgid_plural_tr = next(texts_tr_it) if is_plural else None
            if not msgid_tr:
                # Nothing usable came back for this message.
                continue
            if is_plural:
                for i in range(len(msg.msgstr)):
                    if i in singlepls:
                        msg.msgstr[i] = msgid_tr
                    else:
                        msg.msgstr[i] = msgid_plural_tr
            else:
                msg.msgstr[0] = msgid_tr
            decorate(msg, options)

        sync_rep(cat, msgs)
0213 
0214 
def translate_parallel (paths, tsbuilder, options):
    """
    Translate untranslated messages from an existing translation
    into another language.

    The source translation of a message is looked up first in the
    parallel catalog (its path is derived from the target catalog path
    by one SEARCH:REPLACE substitution), then in the compendium.
    Only translated source messages are used.

    @param paths: paths from which to collect target catalogs
    @param tsbuilder: callable C{(slang, tlang)} creating
        a translation service
    @param options: parsed command line options
    """

    pathrepl = options.parcats
    comppath = options.parcomp
    slang = options.slang
    tlang = options.tlang

    ccat = None
    if comppath is not None:
        if not os.path.isfile(comppath):
            error(_("@info",
                    "Compendium '%(file)s' does not exist.",
                    file=comppath))
        ccat = Catalog(comppath, monitored=False)

    pathsrch = None
    if pathrepl is not None:
        lst = pathrepl.split(":")
        if len(lst) != 2:
            error(_("@info",
                    "Invalid search and replace specification '%(spec)s'.",
                    spec=pathrepl))
        pathsrch, pathrepl = lst

    catpaths = collect_catalogs(paths)
    for catpath in catpaths:

        # Open parallel catalog if it exists.
        pcat = None
        if pathrepl is not None:
            pcatpath = catpath.replace(pathsrch, pathrepl, 1)
            if catpath == pcatpath:
                error(_("@info",
                        "Parallel catalog and target catalog are same files "
                        "for '%(file)s'.",
                        file=catpath))
            if os.path.isfile(pcatpath):
                pcat = Catalog(pcatpath, monitored=False)

        # If there is neither the parallel catalog nor the compendium,
        # skip processing current target catalog.
        if not pcat and not ccat:
            continue

        # Collect messages and texts to translate.
        cat = Catalog(catpath)
        pmsgs, psmsgs, ptexts = [], [], []
        cmsgs, csmsgs, ctexts = [], [], []
        for msg in cat:
            if not to_translate(msg, options):
                continue
            # Priority: parallel catalog, then compendium.
            for scat, msgs, smsgs, texts in (
                (pcat, pmsgs, psmsgs, ptexts),
                (ccat, cmsgs, csmsgs, ctexts),
            ):
                if scat and msg in scat:
                    smsg = scat[msg]
                    if smsg.translated:
                        msgs.append(msg)
                        smsgs.append(smsg)
                        texts.extend(smsg.msgstr)
                        break

        # Translate collected texts.
        texts_tr = []
        for texts, scat in ((ptexts, pcat), (ctexts, ccat)):
            if not texts:
                # Nothing came from this source. Do not ask for a translation
                # service at all: the source catalog may not even exist, and
                # then its language could not be looked up.
                texts_tr.append([])
                continue
            transerv = get_transerv(slang, tlang, scat, cat, tsbuilder)
            res = transerv.translate(texts)
            if res is None:
                texts_tr = None
                break
            texts_tr.append(res)
        if texts_tr is None:
            warning(_("@info",
                      "Translation service failure on '%(file)s'.",
                      file=catpath))
            continue
        ptexts_tr, ctexts_tr = texts_tr

        # Put translated texts into messages.
        # For plural messages, assume 1-1 match to parallel language.
        enc = cat.encoding()
        for msgs, smsgs, texts in (
            (pmsgs, psmsgs, ptexts_tr),
            (cmsgs, csmsgs, ctexts_tr),
        ):
            # Translations come back in the order in which the source
            # forms were collected; consume them sequentially.
            texts_it = iter(texts)
            for msg, smsg in zip(msgs, smsgs):
                forms = [reduce_for_encoding(next(texts_it), enc)
                         for _i in range(len(smsg.msgstr))]
                for i in range(len(msg.msgstr)):
                    # Reuse the last source form when the target language
                    # has more plural forms than the source. An explicit
                    # conditional is used so that an empty translated form
                    # is not mistaken for a missing one.
                    msg.msgstr[i] = forms[i] if i < len(forms) else forms[-1]
                # Once per message is enough, not once per plural form.
                decorate(msg, options)

        sync_rep(cat, pmsgs + cmsgs)
0307 
0308         sync_rep(cat, pmsgs + cmsgs)
0309 
0310 
def to_translate (msg, options):
    """
    Tell whether the message should be machine-translated.

    Currently only the C{untranslated} state of the message decides;
    the options are accepted but not consulted.

    @param msg: message to check
    @param options: parsed command line options (unused)
    @returns: value of C{msg.untranslated}
    """

    is_pending = msg.untranslated
    return is_pending
0314 
0315 
# Name of the flag marking machine-translated messages.
_flag_mtrans = "mtrans"

def decorate (msg, options):
    """
    Mark a freshly machine-translated message according to options.

    Any earlier fuzzy state is cleared first. After that the message is
    made fuzzy if C{options.flag_fuzzy} is set, and gets
    the machine-translation flag if C{options.flag_mtrans} is set.

    @param msg: message to mark (modified in place)
    @param options: parsed command line options
    """

    # Always start from a non-fuzzy message.
    msg.unfuzzy()

    wants_fuzzy = options.flag_fuzzy
    if wants_fuzzy:
        msg.fuzzy = True

    wants_mtrans = options.flag_mtrans
    if wants_mtrans:
        msg.flag.add(_flag_mtrans)
0325 
0326 
# Translation services created so far, keyed by (source, target) language.
_transervs = {}

def get_transerv (slang, tlang, scat, tcat, tsbuilder):
    """
    Fetch the translation service for a language pair, creating it on
    first use.

    A language that is not given is read from the C{Language} header
    field of the corresponding catalog; if that yields nothing either,
    an error is reported.

    @param slang: source language code, or a false value to detect it
    @param tlang: target language code, or a false value to detect it
    @param scat: catalog consulted for the source language
    @param tcat: catalog consulted for the target language
    @param tsbuilder: callable C{(slang, tlang)} creating
        a translation service
    @returns: the cached or newly created translation service
    """

    if not slang:
        slang = scat.header.get_field_value("Language")
    if not slang:
        error(_("@info",
                "Cannot determine language of source catalog '%(file)s'.",
                file=scat.filename))

    if not tlang:
        tlang = tcat.header.get_field_value("Language")
    if not tlang:
        error(_("@info",
                "Cannot determine language of target catalog '%(file)s'.",
                file=tcat.filename))

    langpair = (slang, tlang)
    try:
        return _transervs[langpair]
    except KeyError:
        pass

    # First request for this pair: build the service and remember it.
    service = tsbuilder(slang, tlang)
    _transervs[langpair] = service
    return service
0353 
0354 
def sync_rep (cat, mmsgs):
    """
    Sync the catalog and report it if the sync says so.

    When C{cat.sync()} returns a true value, one line is reported with
    the catalog file name and the number of given messages.

    @param cat: catalog to sync
    @param mmsgs: messages that were modified (only counted)
    """

    if not cat.sync():
        return
    report("! %s (%s)" % (cat.filename, len(mmsgs)))
0359 
0360 
def reduce_for_encoding (text, enc):
    """
    Replace characters which cannot be encoded with question marks.

    Each run of characters rejected by the encoder is replaced with
    as many C{?} characters, until the whole text can be encoded.

    @param text: text to reduce
    @type text: string
    @param enc: name of the target encoding
    @type enc: string
    @returns: text which is encodable in the given encoding
    @rtype: string
    """

    while True:
        try:
            text.encode(enc)
        except UnicodeEncodeError as e:
            # Exceptions cannot be indexed; the offending span is
            # available through the start/end attributes.
            start, end = e.start, e.end
            repl = "?" * (end - start)
            if text[start:end] == repl:
                # No progress possible (the replacement itself is rejected
                # or the span is empty); give up instead of looping forever.
                break
            text = text[:start] + repl + text[end:]
        else:
            # Encoded cleanly, nothing left to replace.
            break
    return text
0372 
0373 
0374 # ----------------------------------------
0375 # Apertium -- a free/open-source machine translation platform
0376 # http://www.apertium.org/
0377 
class Translator_apertium (object):
    """
    Translation service which runs the Apertium command line tool.
    """

    def __init__ (self, slang, tlang, options):
        """
        Check that the executable can be started and assemble
        the command line.

        @param slang: source language code
        @param tlang: target language code
        @param options: parsed command line options; C{transerv_bin},
            C{tmode} and C{data_directory} are consulted
        """

        cmdpath = options.transerv_bin or "apertium"
        try:
            # Only checking that the executable can be started.
            # Output is discarded rather than piped: a pipe which is
            # never read can block the child once its buffer fills up.
            subprocess.call(cmdpath,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
        except OSError:
            error(_("@info Apertium is machine translation software",
                    "Apertium executable not found at '%(path)s'.",
                    path=cmdpath))

        if options.tmode is not None:
            mode = options.tmode
        else:
            mode = "%s-%s" % (slang, tlang)

        optional_parameters = ""
        if options.data_directory:
            optional_parameters = "-d %s" % options.data_directory

        self.cmdline = "%s -u -f html-noent %s %s" % (
            cmdpath, optional_parameters, mode)

        entpath = os.path.join(datadir(), "spec", "html.entities")
        self.htmlents = read_entities(entpath)


    def translate (self, texts):
        """
        Translate the texts in a single run of Apertium.

        @param texts: texts to translate
        @type texts: list of strings
        @returns: translated texts in the same order, or C{None} on failure
        @rtype: list of strings or C{None}
        """

        # Nothing to send; an empty input would otherwise come back
        # as one empty text and be reported as a count mismatch.
        if not texts:
            return []

        # Serialize texts to send to Apertium in one go.
        # Separate texts with an inplace tag followed by dot,
        # to have each text interpreted as standalone sentence.
        # FIXME: Any way to really translate each text in turn,
        # without it being horribly slow?
        tag = "<br>"
        nsep = 1
        sep = tag * nsep + "."
        # Determine shortest separator which appears in none of the texts.
        while any(sep in text for text in texts):
            nsep += 1
            sep = tag * nsep + "."
        stext = sep.join(texts)

        # Translate empty string first to test language pair.
        # Otherwise, if a lot of text is sent and language pair not good,
        # Apertium may just signal broken pipe.
        for instr in ("", stext):
            res = collect_system(self.cmdline, instr=instr)
            if res[2] != 0:
                warning(_("@info",
                          "Executing Apertium failed:\n%(output)s",
                          output=res[0]))
                # ...really res[0], error is output to stdout. Tsk.
                return None

        texts_tr = res[0].split(sep)
        if len(texts_tr) != len(texts):
            warning(_("@info",
                      "Apertium reported wrong number of translations, "
                      "%(num1)d instead of %(num2)d.",
                      num1=len(texts_tr), num2=len(texts)))
            return None

        texts_tr = [resolve_entities_simple(x, self.htmlents) for x in texts_tr]

        return texts_tr
0456 
0457 
0458 # ----------------------------------------
0459 # Google Translate
0460 # http://translate.google.com
0461 
0462 # Communication code derived from py-gtranslate library
0463 # http://code.google.com/p/py-gtranslate/
0464 
0465 # Updated for v2.0 API by Víctor R. Rodríguez Domínguez
0466 # http://vrdominguez.es
0467 
0468 
class Translator_google (object):
    """
    Translation service which queries the Google Translate v2 web API.
    """

    def __init__ (self, slang, tlang, options):
        """
        Set the language pair and read the API key from configuration.

        @param slang: source language code
        @param tlang: target language code
        @param options: parsed command line options; if C{tmode} is given,
            it is split on C{|} into source and target language and
            overrides the two language arguments
        """

        if options.tmode is not None:
            ( self.lang_in, self.lang_out ) = options.tmode.split('|')
        else:
            self.lang_in = slang
            self.lang_out = tlang

        # NOTE(review): presumably this is None when the key is not
        # configured, in which case building the request fails -- confirm.
        self.apikey = pology_config.section("pomtrans").string("google-api-key")


    def translate (self, texts):
        """
        Translate the texts, one web request per text.

        A text whose request or response handling fails
        is translated as empty string.

        @param texts: texts to translate
        @type texts: list of strings
        @returns: translated texts in the same order
        @rtype: list of strings
        """

        import urllib.request, urllib.parse, urllib.error
        # simplejson is used when installed; the standard json module
        # provides the same load() otherwise.
        try:
            import simplejson as json
        except ImportError:
            import json

        baseurl = "https://www.googleapis.com/language/translate/v2"
        # "target" is given exactly once; a second "target" parameter with
        # a non-language value would conflict with the real target language.
        baseparams = (("key", self.apikey), ("source", self.lang_in),
                      ("target", self.lang_out))

        texts_tr = []
        for text in texts:
            params = baseparams + (("q", text.encode("utf8")),)
            parfmt = "&".join(["%s=%s" % (p, urllib.parse.quote_plus(v))
                               for p, v in params])
            execurl = "%s?%s" % (baseurl, parfmt)
            try:
                with urllib.request.urlopen(execurl) as resp:
                    res = json.load(resp)
                text_tr = str(res["data"]["translations"][0]["translatedText"])
            except Exception:
                # Best effort: network, HTTP and response format problems
                # all result in empty translation for this text only.
                text_tr = ""
            texts_tr.append(text_tr)

        return texts_tr
0511 
0512 
0513 # ----------------------------------------
0514 
# Translation services defined in this module, by name
# (the class name with the "Translator_" prefix removed).
_known_transervs = {}
def _init ():
    """
    Fill C{_known_transervs} from module globals named C{Translator_*}.
    """
    prefix = "Translator_"
    # Iterate over a snapshot of the globals.
    found = dict(
        (name[len(prefix):], obj)
        for name, obj in list(globals().items())
        if name.startswith(prefix)
    )
    _known_transervs.update(found)
_init()
0523 
0524 
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    exit_on_exception(main)