File indexing completed on 2024-11-03 08:24:27
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

"""
Perform machine translation of PO files.

Documented in C{doc/user/lingo.docbook#sec-lgmtrans}.

@author: Chusslove Illich (Часлав Илић) <caslav.ilic@gmx.net>
@license: GPLv3
"""

try:
    import fallback_import_paths
except Exception:
    # Best effort: the module is optional, so any failure to load it
    # is ignored (but Ctrl-C and interpreter exit are not swallowed).
    pass

import locale
import subprocess
import sys
import os

from pology import datadir, version, _, n_
from pology.catalog import Catalog
from pology.colors import ColorOptionParser
import pology.config as pology_config
from pology.entities import read_entities
from pology.fsops import collect_catalogs, collect_system
from pology.fsops import str_to_unicode
from pology.fsops import exit_on_exception
from pology.message import MessageUnsafe
from pology.remove import remove_accel_msg
from pology.report import report, error, warning
from pology.resolve import resolve_entities_simple


def main ():
    """
    Parse the command line and run machine translation on given paths.

    The first free argument names the translation service (a key of
    C{_known_transervs}); remaining free arguments are catalog paths.
    Direct translation is used unless a parallel compendium (C{-c}) or
    parallel catalogs (C{-p}) are requested.

    NOTE(review): the code relies on C{error()} not returning (e.g. the
    service name is popped from the free arguments right after the
    "not specified" error) -- confirm against C{pology.report}.
    """

    locale.setlocale(locale.LC_ALL, "")

    # Get defaults for command line options from global config.
    # NOTE(review): the section is fetched but not consulted below.
    cfgsec = pology_config.section("pomtrans")

    # Setup options and parse the command line.
    usage = _("@info command usage",
        "%(cmd)s [OPTIONS] TRANSERV PATHS...",
        cmd="%prog")
    desc = _("@info command description",
        "Perform machine translation of PO files.")
    ver = _("@info command version",
        "%(cmd)s (Pology) %(version)s\n"
        "Copyright © 2009, 2010 "
        "Chusslove Illich (Часлав Илић) <%(email)s>",
        cmd="%prog", version=version(), email="caslav.ilic@gmx.net")

    opars = ColorOptionParser(usage=usage, description=desc, version=ver)
    opars.add_option(
        "-a", "--accelerator", dest="accel",
        metavar=_("@info command line value placeholder", "CHAR"),
        help=_("@info command line option description",
               "Accelerator marker character used in messages. "
               "Detected from catalogs if not given."))
    opars.add_option(
        "-c", "--parallel-compendium", dest="parcomp",
        metavar=_("@info command line value placeholder", "FILE"),
        help=_("@info command line option description",
               "Translate from translation to another language, "
               "found in compendium file at the given path."))
    opars.add_option(
        "-l", "--list-transervs",
        action="store_true", dest="list_transervs", default=False,
        help=_("@info command line option description",
               "List available translation services."))
    opars.add_option(
        "-m", "--flag-%s" % _flag_mtrans,
        action="store_true", dest="flag_mtrans", default=False,
        help=_("@info command line option description",
               "Add '%(flag)s' flag to translated messages.",
               flag=_flag_mtrans))
    opars.add_option(
        "-M", "--translation-mode", dest="tmode",
        metavar=_("@info command line value placeholder", "MODE"),
        help=_("@info command line option description",
               "Translation mode for the chosen translation service. "
               "Overrides the default translation mode constructed "
               "based on source and target language. "
               "Mode string format is translation service dependent."))
    opars.add_option(
        "-n", "--no-fuzzy-flag",
        action="store_false", dest="flag_fuzzy", default=True,
        help=_("@info command line option description",
               "Do not add '%(flag)s' flag to translated messages.",
               flag="fuzzy"))
    opars.add_option(
        "-p", "--parallel-catalogs", dest="parcats",
        metavar=_("@info command line value placeholder", "SEARCH:REPLACE"),
        help=_("@info command line option description",
               "Translate from translation to another language "
               "found in parallel catalogs. "
               "For given target catalog path, the path to parallel catalog "
               "is constructed by replacing once SEARCH with REPLACE."))
    opars.add_option(
        "-s", "--source-lang", dest="slang",
        metavar=_("@info command line value placeholder", "LANG"),
        help=_("@info command line option description",
               "Source language code. "
               "Detected from catalogs if not given."))
    opars.add_option(
        "-t", "--target-lang", dest="tlang",
        metavar=_("@info command line value placeholder", "LANG"),
        help=_("@info command line option description",
               "Target language code. "
               "Detected from catalogs if not given."))
    opars.add_option(
        "-T", "--transerv-bin", dest="transerv_bin",
        metavar=_("@info command line value placeholder", "PATH"),
        help=_("@info command line option description",
               "Custom path to translation service executable "
               "(where applicable)."))
    opars.add_option(
        "-d", "--data-directory", dest="data_directory",
        metavar=_("@info command line value placeholder", "FOLDER"),
        help=_("@info command line option description",
               "Custom path to a translation data directory (where applicable)."))

    (op, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:]))

    # Could use some speedup.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    if op.list_transervs:
        report("\n".join(sorted(_known_transervs)))
        sys.exit(0)

    if not free_args:
        error(_("@info",
                "Translation service not specified."))
    transervkey = free_args.pop(0)
    if transervkey not in _known_transervs:
        error(_("@info",
                "Translation service '%(serv)s' not known.",
                serv=transervkey))

    tsbuilder_wopts = _known_transervs[transervkey]

    def tsbuilder (slang, tlang):
        # Bind parsed options, so that callers only supply the languages.
        return tsbuilder_wopts(slang, tlang, op)

    paths = free_args
    if not op.parcomp and not op.parcats:
        translate_direct(paths, tsbuilder, op)
    else:
        translate_parallel(paths, tsbuilder, op)


def translate_direct (paths, tsbuilder, options):
    """
    Translate untranslated messages directly from their original text.

    For every catalog collected from C{paths}, the C{msgid} (and
    C{msgid_plural}, when present) of each message selected by
    C{to_translate()} is sent to the translation service, with the
    accelerator marker removed first. Results are written into C{msgstr},
    messages are decorated, and the catalog is synced.

    @param paths: paths from which catalogs are collected
    @param tsbuilder: callable C{(slang, tlang)} creating a translation service
    @param options: parsed command line options
    """

    catpaths = collect_catalogs(paths)
    for catpath in catpaths:

        # Collect messages and texts to translate.
        cat = Catalog(catpath)
        if options.accel is not None: # force explicitly given accelerator
            cat.set_accelerator(options.accel)
        texts = []
        msgs = []
        for msg in cat:
            if to_translate(msg, options):
                msgf = MessageUnsafe(msg)
                remove_accel_msg(msgf, cat)
                texts.append(msgf.msgid)
                if msg.msgid_plural is not None:
                    texts.append(msgf.msgid_plural)
                msgs.append(msg)

        # Translate collected texts.
        slang = options.slang or "en"
        transerv = get_transerv(slang, options.tlang, cat, cat, tsbuilder)
        texts_tr = transerv.translate(texts) if texts else []
        if texts_tr is None:
            warning(_("@info",
                      "Translation service failure on '%(file)s'.",
                      file=catpath))
            continue
        enc = cat.encoding()
        texts_tr = [reduce_for_encoding(text, enc) for text in texts_tr]

        # Put translated texts into messages.
        # Texts are consumed in the same order in which they were collected:
        # one per message, plus one more for plural messages.
        singlepls = cat.plural_indices_single()
        texts_it = iter(texts_tr)
        for msg in msgs:
            msgid_tr = next(texts_it)
            if msg.msgid_plural is not None:
                msgid_plural_tr = next(texts_it)
            if msgid_tr:
                if msg.msgid_plural is not None:
                    for i in range(len(msg.msgstr)):
                        if i in singlepls:
                            msg.msgstr[i] = msgid_tr
                        else:
                            msg.msgstr[i] = msgid_plural_tr
                else:
                    msg.msgstr[0] = msgid_tr
                decorate(msg, options)

        sync_rep(cat, msgs)


def translate_parallel (paths, tsbuilder, options):
    """
    Translate untranslated messages from translations in another language.

    Source texts are taken from a parallel catalog (path derived from the
    target catalog path by the C{SEARCH:REPLACE} specification) and/or
    from a compendium; the parallel catalog has priority. Catalogs for
    which neither source is available are skipped.

    @param paths: paths from which target catalogs are collected
    @param tsbuilder: callable C{(slang, tlang)} creating a translation service
    @param options: parsed command line options
    """

    pathrepl = options.parcats
    comppath = options.parcomp
    slang = options.slang
    tlang = options.tlang

    ccat = None
    if comppath is not None:
        if not os.path.isfile(comppath):
            error(_("@info",
                    "Compendium '%(file)s' does not exist.",
                    file=comppath))
        ccat = Catalog(comppath, monitored=False)

    if pathrepl is not None:
        lst = pathrepl.split(":")
        if len(lst) != 2:
            error(_("@info",
                    "Invalid search and replace specification '%(spec)s'.",
                    spec=pathrepl))
        pathsrch, pathrepl = lst

    catpaths = collect_catalogs(paths)
    for catpath in catpaths:

        # Open parallel catalog if it exists.
        pcat = None
        if pathrepl is not None:
            pcatpath = catpath.replace(pathsrch, pathrepl, 1)
            if catpath == pcatpath:
                error(_("@info",
                        "Parallel catalog and target catalog are same files "
                        "for '%(file)s'.",
                        file=catpath))
            if os.path.isfile(pcatpath):
                pcat = Catalog(pcatpath, monitored=False)

        # If there is neither the parallel catalog nor the compendium,
        # skip processing current target catalog.
        if not pcat and not ccat:
            continue

        # Collect messages and texts to translate.
        cat = Catalog(catpath)
        pmsgs, psmsgs, ptexts = [], [], []
        cmsgs, csmsgs, ctexts = [], [], []
        for msg in cat:
            if to_translate(msg, options):
                # Priority: parallel catalog, then compendium.
                for scat, msgs, smsgs, texts in (
                    (pcat, pmsgs, psmsgs, ptexts),
                    (ccat, cmsgs, csmsgs, ctexts),
                ):
                    if scat and msg in scat:
                        smsg = scat[msg]
                        if smsg.translated:
                            msgs.append(msg)
                            smsgs.append(smsg)
                            texts.extend(smsg.msgstr)
                            break

        # Translate collected texts.
        # The service is only requested for a source that actually
        # contributed texts: the other source catalog may be None, and
        # then its language could not be looked up.
        texts_tr = []
        for texts, scat in ((ptexts, pcat), (ctexts, ccat)):
            if texts:
                transerv = get_transerv(slang, tlang, scat, cat, tsbuilder)
                texts_tr_1 = transerv.translate(texts)
            else:
                texts_tr_1 = []
            if texts_tr_1 is None:
                texts_tr = None
                break
            texts_tr.append(texts_tr_1)
        if texts_tr is None:
            warning(_("@info",
                      "Translation service failure on '%(file)s'.",
                      file=catpath))
            continue
        ptexts_tr, ctexts_tr = texts_tr

        # Put translated texts into messages.
        # For plural messages, assume 1-1 match to parallel language.
        enc = cat.encoding()
        for msgs, smsgs, texts in (
            (pmsgs, psmsgs, ptexts_tr),
            (cmsgs, csmsgs, ctexts_tr),
        ):
            texts_it = iter(texts)
            for msg, smsg in zip(msgs, smsgs):
                # One translated text was requested per source msgstr.
                mtexts = []
                for dummy in range(len(smsg.msgstr)):
                    mtexts.append(reduce_for_encoding(next(texts_it), enc))
                # If the target has more plural forms than the source,
                # the extra forms get the last translated text.
                for i in range(len(msg.msgstr)):
                    msg.msgstr[i] = mtexts[i] if i < len(mtexts) else mtexts[-1]
                decorate(msg, options)

        sync_rep(cat, pmsgs + cmsgs)


def to_translate (msg, options):
    """
    Tell whether the message should be machine-translated.

    Only untranslated messages are selected; C{options} is not consulted.
    """

    return msg.untranslated


_flag_mtrans = "mtrans"

def decorate (msg, options):
    """
    Flag a freshly machine-translated message according to options.

    The fuzzy flag is set unless disabled by C{options.flag_fuzzy}, and
    the C{mtrans} flag is added if C{options.flag_mtrans} is set.
    """

    msg.unfuzzy() # clear any previous fuzzy stuff
    if options.flag_fuzzy:
        msg.fuzzy = True
    if options.flag_mtrans:
        msg.flag.add(_flag_mtrans)


# Cache of translation services by (source, target) language pair.
_transervs = {}


def get_transerv (slang, tlang, scat, tcat, tsbuilder):
    """
    Return translation service for the (slang, tlang) pair.

    If the service was not created yet, it is created by calling
    C{tsbuilder(slang, tlang)} and cached. If C{slang} or C{tlang} is
    empty, the C{Language} header field of the corresponding catalog
    (C{scat} or C{tcat}) is used instead; C{error()} is reported if
    that field is empty too.

    @param slang: source language code, or empty to detect from C{scat}
    @param tlang: target language code, or empty to detect from C{tcat}
    @param scat: source catalog (only accessed when C{slang} is empty)
    @param tcat: target catalog (only accessed when C{tlang} is empty)
    @param tsbuilder: callable C{(slang, tlang)} creating the service
    @return: cached or newly created translation service
    """

    if not slang:
        slang = scat.header.get_field_value("Language")
        if not slang:
            error(_("@info",
                    "Cannot determine language of source catalog '%(file)s'.",
                    file=scat.filename))
    if not tlang:
        tlang = tcat.header.get_field_value("Language")
        if not tlang:
            error(_("@info",
                    "Cannot determine language of target catalog '%(file)s'.",
                    file=tcat.filename))

    trdir = (slang, tlang)
    if trdir not in _transervs:
        _transervs[trdir] = tsbuilder(slang, tlang)

    return _transervs[trdir]


def sync_rep (cat, mmsgs):
    """
    Sync the catalog and, if C{cat.sync()} returns true, report
    the catalog file name with the number of messages in C{mmsgs}.
    """

    if cat.sync():
        report("! %s (%s)" % (cat.filename, len(mmsgs)))


def reduce_for_encoding (text, enc):
    """
    Replace characters not representable in the encoding by question marks.

    Every run of characters which cannot be encoded in C{enc} is replaced
    by as many C{?} characters, until the whole text can be encoded.

    @param text: text to reduce
    @param enc: name of the target encoding
    @return: text which can be encoded in C{enc}
    """

    while True:
        try:
            text.encode(enc)
        except UnicodeEncodeError as e:
            start, end = e.start, e.end
            patch = "?" * (end - start)
            if end <= start or text[start:end] == patch:
                # No progress possible (even the placeholder cannot be
                # encoded); give up rather than loop forever.
                break
            text = text[:start] + patch + text[end:]
        else:
            break
    return text


# ----------------------------------------
# Apertium -- a free/open-source machine translation platform
# http://www.apertium.org/

class Translator_apertium (object):
    """
    Translation service which runs the Apertium command line tool.
    """

    def __init__ (self, slang, tlang, options):
        """
        Check that the executable can be started and compose
        the command line used for translation.

        @param slang: source language code
        @param tlang: target language code
        @param options: parsed command line options; C{transerv_bin},
            C{tmode} and C{data_directory} are consulted
        """

        cmdpath = options.transerv_bin or "apertium"
        try:
            # Output is of no interest here, only whether the executable
            # starts; discard it instead of piping it, since nobody
            # would read the pipes.
            subprocess.call(cmdpath,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
        except OSError:
            error(_("@info Apertium is machine translation software",
                    "Apertium executable not found at '%(path)s'.",
                    path=cmdpath))

        if options.tmode is not None:
            mode = options.tmode
        else:
            mode = "%s-%s" % (slang, tlang)

        optional_parameters = ""
        if options.data_directory:
            optional_parameters = "-d %s" % options.data_directory

        self.cmdline = "%s -u -f html-noent %s %s" % (
            cmdpath, optional_parameters, mode)

        entpath = os.path.join(datadir(), "spec", "html.entities")
        self.htmlents = read_entities(entpath)


    def translate (self, texts):
        """
        Translate the texts in a single Apertium run.

        @param texts: list of texts to translate
        @return: list of translated texts in the same order, or C{None}
            if Apertium failed or returned a wrong number of texts
        """

        # Serialize texts to send to Apertium in one go.
        # Separate texts with an inplace tag followed by dot,
        # to have each text interpreted as standalone sentence.
        # FIXME: Any way to really translate each text in turn,
        # without it being horribly slow?
        tag = "<br>"
        nsep = 1
        sep = tag * nsep + "."
        # Determine shortest separator which appears in none of the texts.
        while any(sep in text for text in texts):
            nsep += 1
            sep = tag * nsep + "."
        stext = sep.join(texts)

        # Translate empty string to test language pair.
        # Otherwise, if a lot of text is sent and language pair not good,
        # Apertium may just signal broken pipe.
        res = collect_system(self.cmdline, instr="")
        if res[2] != 0:
            warning(_("@info",
                      "Executing Apertium failed:\n%(output)s",
                      output=res[0]))
            # ...really res[0], error is output to stdout. Tsk.
            return None

        res = collect_system(self.cmdline, instr=stext)
        if res[2] != 0:
            warning(_("@info",
                      "Executing Apertium failed:\n%(output)s",
                      output=res[0]))
            # ...really res[0], error is output to stdout. Tsk.
            return None

        texts_tr = res[0].split(sep)
        if len(texts_tr) != len(texts):
            warning(_("@info",
                      "Apertium reported wrong number of translations, "
                      "%(num1)d instead of %(num2)d.",
                      num1=len(texts_tr), num2=len(texts)))
            return None

        texts_tr = [resolve_entities_simple(x, self.htmlents) for x in texts_tr]

        return texts_tr


# ----------------------------------------
# Google Translate
# http://translate.google.com

# Communication code derived from py-gtranslate library
# http://code.google.com/p/py-gtranslate/

# Updated for v2.0 API by Víctor R. Rodríguez Domínguez
# http://vrdominguez.es


class Translator_google (object):
    """
    Translation service which queries the Google Translate web API.
    """

    def __init__ (self, slang, tlang, options):
        """
        Set the language pair and read the API key from user configuration.

        @param slang: source language code
        @param tlang: target language code
        @param options: parsed command line options; if C{tmode} is given,
            it must have the form C{SOURCE|TARGET} and overrides languages
        """

        if options.tmode is not None:
            ( self.lang_in, self.lang_out ) = options.tmode.split('|')
        else:
            self.lang_in = slang
            self.lang_out = tlang

        self.apikey = pology_config.section("pomtrans").string("google-api-key")


    def translate (self, texts):
        """
        Translate the texts, one web request per text.

        @param texts: list of texts to translate
        @return: list of translated texts in the same order;
            a text whose request failed is translated as empty string
        """

        import urllib.request, urllib.parse, urllib.error
        try:
            import simplejson
        except ImportError:
            error(_("@info",
                    "Python module '%(mod)s' not available. "
                    "Try installing the '%(pkg)s' package.",
                    mod="simplejson", pkg="python-simplejson"))

        baseurl = "https://www.googleapis.com/language/translate/v2"
        # NOTE(review): "target" appears twice below; the second pair
        # looks like it was meant to be a different parameter -- confirm
        # against the service documentation before changing.
        baseparams = (("key", self.apikey), ("source", self.lang_in),
                      ("target", self.lang_out), ("target","json"))

        texts_tr = []
        for text in texts:
            params = baseparams + (("q", text.encode("utf8")),)
            parfmt = "&".join(["%s=%s" % (p, urllib.parse.quote_plus(v))
                               for p, v in params])
            execurl = "%s?%s" % (baseurl, parfmt)
            try:
                with urllib.request.urlopen(execurl) as resp:
                    res = simplejson.load(resp)
                text_tr = str(res["data"]["translations"][0]["translatedText"])
            except Exception:
                # Best effort: any network, HTTP or response format
                # problem leaves this text untranslated.
                text_tr = ""
            texts_tr.append(text_tr)

        return texts_tr


# ----------------------------------------

# Collect defined translation services by name.
_known_transervs = {}
def _init ():
    # Every module-level name of the form Translator_<name>
    # is registered as translation service <name>.
    tspref = "Translator_"
    for locvar, locval in list(globals().items()):
        if locvar.startswith(tspref):
            _known_transervs[locvar[len(tspref):]] = locval
_init()


if __name__ == '__main__':
    exit_on_exception(main)