sdk/kde-dev-scripts/grantlee_strings_extractor.py

0001 #! /usr/bin/env python
0002 # -*- coding: utf-8 -*-
0003
0004 ##
0005 # Copyright 2010,2011 Stephen Kelly <steveire@gmail.com>
0006 #
0007 # Redistribution and use in source and binary forms, with or without
0008 # modification, are permitted provided that the following conditions
0009 # are met:
0010 #
0011 # 1. Redistributions of source code must retain the above copyright
0012 #    notice, this list of conditions and the following disclaimer.
0013 # 2. Redistributions in binary form must reproduce the above copyright
0014 #    notice, this list of conditions and the following disclaimer in the
0015 #    documentation and/or other materials provided with the distribution.
0016 #
0017 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
0018 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
0019 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
0020 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
0021 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
0022 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
0023 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
0024 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0025 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
0026 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0027 ##
0028
0029 ## Parts of this file are reproduced from the Django framework. The Django license appears below.
0030
0031 ##
0032 # Copyright (c) Django Software Foundation and individual contributors.
0033 # All rights reserved.
0034 #
0035 # Redistribution and use in source and binary forms, with or without modification,
0036 # are permitted provided that the following conditions are met:
0037 #
0038 #     1. Redistributions of source code must retain the above copyright notice,
0039 #        this list of conditions and the following disclaimer.
0040 #
0041 #     2. Redistributions in binary form must reproduce the above copyright
0042 #        notice, this list of conditions and the following disclaimer in the
0043 #        documentation and/or other materials provided with the distribution.
0044 #
0045 #     3. Neither the name of Django nor the names of its contributors may be used
0046 #        to endorse or promote products derived from this software without
0047 #        specific prior written permission.
0048 #
0049 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
0050 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
0051 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0052 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
0053 # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
0054 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
0055 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
0056 # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0057 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0058 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0059 ##
0060
0061 import os, sys, glob, operator
0062 import re
0063 import os.path
0064
0065 # == Introduction to the template syntax ==
0066 #
0067 # The template syntax looks like this:
0068 # (For more see here: http://grantlee.org/apidox/for_themers.html )
0069 #
0070 # This is plain text
0071 # This is text with a {{ value }} substitution
0072 # This is {% if condition_is_met %}a conditional{% endif %}
0073 # {# This is a comment #}
0074 # This is a {% comment %} multi-line
0075 # comment
0076 # {% endcomment %}
0077 #
0078 # That is, we have plain text.
0079 # We have value substitution with {{ }}
0080 # We have comments with {# #}
0081 # We have control tags with {% %}
0082 #
0083 # The first token inside {% %} syntax is called a tag name. Above, we have
0084 # an if tag and a comment tag.
0085 #
0086 # The 'value' in {{ value }} is called a filter expression. In the above case
0087 # the filter expression is a simple value which was inserted into the context.
0088 # In other cases it can be {{ value|upper }}, that is the value can be passed
0089 # through a filter called 'upper' with the '|', or filter expression can
0090 # be {{ value|join:"-" }}, that is it can be passed through the join filter
0091 # which takes an argument. In this case, the 'value' would actually be a list,
0092 # and the join filter would concatenate them with a dash. A filter can have
0093 # either no arguments, like upper, or it can take one argument, delimited by
# a colon (':'). A filter expression can consist of a value followed by a
0095 # chain of filters, such as {{ value|join:"-"|upper }}. A filter expression
0096 # can appear one time inside {{ }} but may appear multiple times inside {% %}
0097 # For example {% cycle foo|upper bar|join:"-" bat %} contains 3 filter
0098 # expressions, 'foo|upper', 'bar|join:"-"' and 'bat'.
0099 #
0100 # Comments are ignored in the templates.
0101 #
0102 # == i18n in templates ==
0103 #
0104 # The purpose of this script is to extract translatable strings from templates
0105 # The aim is to allow template authors to write templates like this:
0106 #
0107 # This is a {{ _("translatable string") }} in the template.
0108 # This is a {% i18n "translatable string about %1" something %}
0109 # This is a {% i18nc "Some context information" "string about %1" something %}
0110 # This is a {% i18np "%1 string about %2" numthings something %}
0111 # This is a {% i18ncp "some context" "%1 string about %2" numthings something %}
0112 #
0113 # That is, simple translation with _(), and i18n* tags to allow for variable
0114 # substitution, context messages and plurals. Translatable strings may appear
# in a filter expression, either as the value being filtered, or as the argument
0116 # or both:
0117 #
0118 # {{ _("hello")|upper }}
0119 # {{ list|join:_("and") }}
0120 #
0121 # == How the strings are extracted ==
0122 #
0123 # The strings are extracted by parsing the template with regular expressions.
0124 # The tag_re regular expression breaks the template into a stream of tokens
0125 # containing plain text, {{ values }} and {% tags %}.
0126 # That work is done by the tokenize method with the create_token method.
0127 # Each token is then processed to extract the translatable strings from
0128 # the filter expressions.
0129
0130
0131 # The original context of much of this script is in the django template system:
0132 # https://github.com/django/django/blob/master/django/template/base.py
0133
0134
# Kinds of token that create_token() can produce.
TOKEN_TEXT = 0
TOKEN_VAR = 1
TOKEN_BLOCK = 2
TOKEN_COMMENT = 3

# template syntax constants
FILTER_SEPARATOR = '|'
FILTER_ARGUMENT_SEPARATOR = ':'
BLOCK_TAG_START = '{%'
BLOCK_TAG_END = '%}'
VARIABLE_TAG_START = '{{'
VARIABLE_TAG_END = '}}'
COMMENT_TAG_START = '{#'
COMMENT_TAG_END = '#}'

# Match a block tag, a variable tag or a comment tag and capture the entire
# tag, including its start/end delimiters. Comment tags have to be matched
# here too: if they were not, a "{{ ... }}" or "{% ... %}" written inside a
# "{# ... #}" comment would be split out as a live tag and any _("...") in it
# would be extracted, although comments are meant to be ignored.
tag_re = re.compile('(%s.*?%s|%s.*?%s|%s.*?%s)' % (
    re.escape(BLOCK_TAG_START), re.escape(BLOCK_TAG_END),
    re.escape(VARIABLE_TAG_START), re.escape(VARIABLE_TAG_END),
    re.escape(COMMENT_TAG_START), re.escape(COMMENT_TAG_END)))
0153
0154
# Matches one whitespace-delimited word. A word may embed one or more quoted
# phrases (single or double quotes, with backslash escapes); whitespace inside
# such a phrase does not end the word, so some_token="with spaces" stays whole.
smart_split_re = re.compile(r"""
    ((?:
        [^\s'"]*
        (?:
            (?:"(?:[^"\\]|\\.)*" | '(?:[^'\\]|\\.)*')
            [^\s'"]*
        )+
    ) | \S+)
""", re.VERBOSE)

def smart_split(text):
    r"""
    Lazily split text on whitespace while keeping quoted phrases in one piece.

    Both quote styles are understood and a quote character can be escaped
    with a backslash. Pieces are yielded exactly as they appear in text: the
    surrounding quote marks and any backslash escapes are left untouched.

    >>> list(smart_split(r'This is "a person\'s" test.'))
    ['This', 'is', '"a person\\\'s"', 'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    ['Another', "'person\\'s'", 'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    ['A', '"\\"funky\\" style"', 'test.']
    """
    words = smart_split_re.finditer(text)
    for word in words:
        yield word.group()
0184
0185
# This only matches constant *strings*: a double- or single-quoted literal in
# which characters may be backslash-escaped.

constant_string = r"(?:%(strdq)s|%(strsq)s)" % {
    'strdq': r'"[^"\\]*(?:\\.[^"\\]*)*"', # double-quoted string
    'strsq': r"'[^'\\]*(?:\\.[^'\\]*)*'", # single-quoted string
    }

# Finds every _("...") / _('...') in a filter expression, wherever it occurs,
# so that both {{ _("hello")|upper }} and {{ list|join:_("and") }} yield their
# string through the 'l10nable' group. (Anchoring the pattern at the start of
# the expression would miss translatable filter arguments.)
#
# The first alternative consumes a plain quoted literal without setting
# 'l10nable', so text that merely looks like _("...") inside an ordinary
# string is not reported. The lookbehind rejects an identifier that happens to
# end in an underscore, e.g. foo_("x").
filter_raw_string = r"""
    (?P<quoted>%(constant_string)s)
    |
    (?<!\w)%(i18n_open)s(?P<l10nable>%(constant_string)s)%(i18n_close)s
""" % {
    'constant_string': constant_string,
    'i18n_open' : re.escape("_("),
    'i18n_close' : re.escape(")"),
  }

filter_re = re.compile(filter_raw_string, re.UNICODE|re.VERBOSE)
0201
class TemplateSyntaxError(Exception):
    """Exception type for template syntax problems.

    Nothing in the code visible in this script raises it.
    """
0204
class TranslatableString:
    """
    Plain record describing one extracted message.

    The attributes below are class-level defaults; code that builds a record
    overrides the ones it needs on the instance.
    """
    # The message text itself.
    _string = ''
    # Disambiguating context; empty when the message has none.
    context = ''
    # Plural form of the message; empty when the message has none.
    plural = ''
    # Template line the message was found on; -1 until it is set.
    line_number = -1

    def __repr__(self):
        # line_number is deliberately not part of the representation.
        shown = (self._string, self.context, self.plural)
        return "String('%s', '%s', '%s')" % shown
0213
class Token(object):
    """A piece of a template: the kind of piece and its text."""

    def __init__(self, token_type, contents):
        # token_type must be TOKEN_TEXT, TOKEN_VAR, TOKEN_BLOCK or TOKEN_COMMENT.
        self.token_type = token_type
        self.contents = contents

    def __str__(self):
        # Human-readable kind followed by at most the first 20 characters of
        # the contents, with newlines removed.
        kind = {
            TOKEN_TEXT: 'Text',
            TOKEN_VAR: 'Var',
            TOKEN_BLOCK: 'Block',
            TOKEN_COMMENT: 'Comment',
        }[self.token_type]
        preview = self.contents[:20].replace('\n', '')
        return '<%s token: "%s...">' % (kind, preview)
0223
def create_token(token_string, in_tag):
    """
    Convert the given token string into a new Token object and return it.

    If in_tag is True, token_string is something that matched a tag and its
    start delimiter decides the kind of token:

    * variable tag: TOKEN_VAR holding the stripped text between the delimiters
    * block tag: TOKEN_BLOCK holding the stripped text between the delimiters
    * comment tag: TOKEN_COMMENT with empty contents

    Otherwise token_string is treated as a literal string and returned
    unchanged in a TOKEN_TEXT token. The same happens when in_tag is True but
    token_string starts with none of the known delimiters; without that
    fallback such input would fail with an UnboundLocalError.
    """
    if in_tag:
        if token_string.startswith(VARIABLE_TAG_START):
            inner = token_string[len(VARIABLE_TAG_START):-len(VARIABLE_TAG_END)]
            return Token(TOKEN_VAR, inner.strip())
        if token_string.startswith(BLOCK_TAG_START):
            inner = token_string[len(BLOCK_TAG_START):-len(BLOCK_TAG_END)]
            return Token(TOKEN_BLOCK, inner.strip())
        if token_string.startswith(COMMENT_TAG_START):
            # The text of a comment is never needed, so it is dropped.
            return Token(TOKEN_COMMENT, '')
    return Token(TOKEN_TEXT, token_string)
0240
def tokenize(template_string):
    """
    Break template_string into a list of Token objects.

    tag_re.split() returns the pieces between tags and, because the pattern
    has a capturing group, the tags themselves: pieces at even positions are
    literal text and pieces at odd positions are tags. Empty pieces produce
    no token.
    """
    tokens = []
    for position, piece in enumerate(tag_re.split(template_string)):
        if piece:
            tokens.append(create_token(piece, position % 2 == 1))
    return tokens
0250
class TranslationOutputter:
    """
    Extracts the translatable strings of a template file and writes them out
    as C-like gettext calls, each preceded by a comment naming the template
    file and line (presumably to be read by a gettext extraction tool).

    translate() is the entry point; the other methods are the steps it uses.
    """
    # Class-level defaults kept for backward compatibility. __init__() gives
    # every instance its own list so that instances never share results.
    translatable_strings = []
    line_number = 0

    # Tag name -> TranslatableString attributes that are filled, in order,
    # from the quoted literals following the tag name:
    # {% i18n "A one %1, a two %2" var1 var2 %}
    # {% i18nc "Some context" "%1 Failed!" var1 %}
    # {% i18np "%1 email failed." "%1 emails failed." count %}
    # {% i18ncp "Some context" "%1 email failed." "%1 emails failed." count %}
    # The *_var variants take the same literals and end with "as result".
    _CONTEXTUAL_TAGS = {
        "i18n": ("_string",),
        "i18n_var": ("_string",),
        "i18nc": ("context", "_string"),
        "i18nc_var": ("context", "_string"),
        "i18np": ("_string", "plural"),
        "i18np_var": ("_string", "plural"),
        "i18ncp": ("context", "_string", "plural"),
        "i18ncp_var": ("context", "_string", "plural"),
    }

    # A backslash escape (kept as it is) or a bare double quote (to escape).
    _UNESCAPED_QUOTE_RE = re.compile(r'(\\.)|"')

    def __init__(self):
        self.translatable_strings = []
        self.line_number = 0

    def _unquote(self, bit):
        """
        Return bit without its surrounding quotes, or None when bit is not a
        literal that starts and ends with the same quote character.
        """
        if not bit or len(bit) < 2:
            return None
        quote = bit[0]
        if quote not in ('"', "'") or not bit.endswith(quote):
            return None
        return bit[1:-1]

    def _c_literal(self, text):
        """
        Return text as a double-quoted C string literal.

        Existing backslash escapes are left alone. Bare double quotes, which
        occur when the template used a single-quoted string, are escaped so
        that they do not end the literal early.
        """
        escaped = self._UNESCAPED_QUOTE_RE.sub(
            lambda match: match.group(1) or '\\"', text)
        return '"' + escaped + '"'

    def get_translatable_filter_args(self, token):
        """
        Find the filter expressions in token and extract the strings in it.

        token is one whitespace-delimited piece of a tag, as a string. Each
        match of filter_re that captured a quoted 'l10nable' literal is
        recorded as a TranslatableString on the current line.
        """
        for match in filter_re.finditer(token):
            text = self._unquote(match.group("l10nable"))
            if text is None:
                continue
            ts = TranslatableString()
            ts._string = text
            ts.line_number = self.line_number
            self.translatable_strings.append(ts)

    def get_contextual_strings(self, token):
        """
        Record the message of an i18n, i18nc, i18np or i18ncp block tag (or
        of its *_var variant) and the _("...") strings among its arguments.

        token is a block Token. Nothing is recorded, and no exception is
        raised, when the tag is some other tag, when it is empty, or when one
        of the literals it requires is missing or not a quoted string.
        Arguments are scanned up to, but not including, the "as" keyword.
        """
        bits = smart_split(token.contents)
        # An empty tag such as {% %} has no bits at all.
        fields = self._CONTEXTUAL_TAGS.get(next(bits, None))
        if fields is None:
            return

        translatable_string = TranslatableString()
        for field in fields:
            text = self._unquote(next(bits, None))
            if text is None:
                # Incomplete or malformed tag: there is nothing sensible to
                # extract from it.
                return
            setattr(translatable_string, field, text)
        translatable_string.line_number = self.line_number
        self.translatable_strings.append(translatable_string)

        for bit in bits:
            if bit == "as":
                return
            self.get_translatable_filter_args(bit)

    def get_plain_strings(self, token):
        """
        Record the _("...") strings found in any piece of token's contents.
        """
        for bit in smart_split(token.contents):
            self.get_translatable_filter_args(bit)

    def translate(self, template_file, outputfile):
        """
        Extract the strings of template_file and write them to outputfile.

        template_file must provide readlines() and a name attribute;
        outputfile must provide write(). Results of a previous call are
        discarded first. The template is tokenized one line at a time, so a
        tag has to start and end on the same line to be recognized.
        """
        self.translatable_strings = []
        self.line_number = 0
        for template_string_line in template_file.readlines():
            self.line_number += 1
            for token in tokenize(template_string_line):
                if token.token_type in (TOKEN_VAR, TOKEN_BLOCK):
                    self.get_plain_strings(token)
                # NOTE: the arguments of an i18n* tag were already scanned by
                # get_plain_strings() above, so a _("...") argument of such a
                # tag is recorded twice.
                if token.token_type == TOKEN_BLOCK:
                    self.get_contextual_strings(token)
        self.createOutput(template_file.name, self.translatable_strings, outputfile)

    def createOutput(self, template_filename, translatable_strings, outputfile):
        """
        Write one location comment and one gettext-style call per string.

        The function name depends on which parts are set:
        gettext(msg), ngettext(msg, plural), pgettext(context, msg) or
        npgettext(context, msg, plural).
        """
        for translatable_string in translatable_strings:
            outputfile.write("// i18n: file: " + template_filename + ":" + str(translatable_string.line_number) + "\n")

            arguments = []
            if translatable_string.context:
                arguments.append(translatable_string.context)
            arguments.append(translatable_string._string)
            if translatable_string.plural:
                arguments.append(translatable_string.plural)

            function = "gettext"
            if translatable_string.context:
                function = "p" + function
            if translatable_string.plural:
                function = "n" + function

            literals = ", ".join([self._c_literal(argument) for argument in arguments])
            outputfile.write(function + "(" + literals + ");\n")
0389
0390
if __name__ == "__main__":
    # Usage: grantlee_strings_extractor.py TEMPLATE [TEMPLATE ...]
    # The strings extracted from every template are written to standard
    # output, followed by one final newline.
    ex = TranslationOutputter()

    outputfile = sys.stdout

    files = sys.argv[1:]

    for filename in files:
        # The with statement closes each template as soon as it has been
        # processed instead of leaving the file open until the script exits.
        with open(filename, "r") as f:
            ex.translate(f, outputfile)

    outputfile.write("\n")