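# Regex-driven lexer for QML source. Each Tokenizer pairs a token type with a
# regular expression; Lexer.tokenize() skips whitespace (advance), tries the
# line-start tokenizers when at the beginning of a line and the general
# tokenizers otherwise (apply_tokenizers), then post-processes comment tokens
# (fixup_tokens). A usage sketch appears at the bottom of the file.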
from collections import namedtuple
import re


COMMENT = "comment"
ICOMMENT = "inline_comment"
STRING = "string"
ELEMENT = "element"
BLOCK_START = "block_start"
BLOCK_END = "block_end"
ARRAY_START = "array_start"
ARRAY_END = "array_end"
CHAR = "char"
KEYWORD = "keyword"
IMPORT = "import"
PRAGMA = "pragma"
COMPONENT = "component"
ATTRIBUTE = "attribute"
ELLIPSES = "ellipses"

# not a doxy comment
PLAIN_COMMENT_RX = re.compile("/[/*][^/!*]")


def is_doxy_comment_token(token):
    return token.type == COMMENT and not PLAIN_COMMENT_RX.match(token.value)


class LexerError(Exception):
    def __init__(self, msg, idx):
        Exception.__init__(self, msg)
        self.idx = idx


Token = namedtuple("Token", ["type", "value", "idx", "column"])


class Tokenizer(object):
    def __init__(self, token_type, rx):
        self.token_type = token_type
        self.rx = rx

    def __call__(self, lexer, matched_str):
        lexer.append_token(self.token_type, matched_str)


class Lexer(object):
    def __init__(self, text):
        # Tokens that start at the first non-whitespace character in a line
        self.tokenizers_newline = [
            Tokenizer(COMPONENT, re.compile(r"([-\w\.]+)\s*{")),  # a component
            Tokenizer(ATTRIBUTE, re.compile(r"([-\w\.]+)\s*:")),  # an attribute
        ]

        self.tokenizers = [
            Tokenizer(ICOMMENT, re.compile(r"/\*[!*]<.*?\*/", re.DOTALL)),
            Tokenizer(ICOMMENT, re.compile(r"//[/!]<[^\n]*(?:\n[ \t]*//[/!]<[^\n]*)*")),
            Tokenizer(COMMENT, re.compile(r"/\*.*?\*/", re.DOTALL)),
            Tokenizer(COMMENT, re.compile(r"//[^\n]*(?:\n[ \t]*//[^\n]*)*")),
            # A double or single quote, then either:
            # - anything but a double quote or a backslash
            # - an escaped char (\n, \t...)
            # then a double or single quote
            Tokenizer(STRING, re.compile(r'("([^\\"]|(\\.))*"|\'([^\\\']|(\\.))*\')')),
            Tokenizer(BLOCK_START, re.compile(r"(?<!')\{(?!')")),
            Tokenizer(BLOCK_END, re.compile(r"(?<!')\}(?!')")),
            Tokenizer(ARRAY_START, re.compile(r"\[")),
            Tokenizer(ARRAY_END, re.compile(r"\]")),
            Tokenizer(IMPORT, re.compile(r"import\s+.*")),
            Tokenizer(PRAGMA, re.compile(r"pragma\s+\w.*")),
            Tokenizer(KEYWORD, re.compile(r"(default\s+property|property|readonly\s+property|signal|enum)\s+")),
            Tokenizer(KEYWORD, re.compile(r"(function)\s+[^(]")),  # a named function
            Tokenizer(ELLIPSES, re.compile(r"\.\.\.")),
            Tokenizer(ELEMENT, re.compile(r"\w[\w.<>]*")),
            Tokenizer(CHAR, re.compile(".")),
        ]
        self.text = text.replace('\\\n', '\n')
        self.idx = 0
        self.column = 0
        self.newline = False
        self.tokens = []

    def tokenize(self):
        while True:
            self.advance()
            if self.idx == len(self.text):
                break
            self.apply_tokenizers()
        self.fixup_tokens()

    def advance(self):
        self.newline = False
        if self.idx == 0:
            # Process start-of-file as newline.
            self.newline = True

        while self.idx < len(self.text):
            if self.text[self.idx] == '\n':
                self.newline = True
                self.idx += 1
                self.column = 0
            elif self.text[self.idx].isspace():
                self.idx += 1
                self.column += 1
            else:
                break

    def apply_tokenizers(self):
        if self.newline:
            for tokenizer in self.tokenizers_newline:
                match = tokenizer.rx.match(self.text, self.idx)

                if not match:
                    continue

                if len(match.groups()) > 0:
                    tokenizer(self, match.group(1))
                    self.set_position(match.end(1))
                    return
                else:
                    tokenizer(self, match.group(0))
                    self.set_position(match.end(0))
                    return

        for tokenizer in self.tokenizers:
            match = tokenizer.rx.match(self.text, self.idx)

            if not match:
                continue

            if len(match.groups()) > 0:
                tokenizer(self, match.group(1))
                self.set_position(match.end(1))
                return
            else:
                tokenizer(self, match.group(0))
                self.set_position(match.end(0))
                return

        raise LexerError("No lexer matched", self.idx)

    def fixup_tokens(self):
        for idx, token in enumerate(self.tokens):
            # Fix tokenization of a property named "property". For example:
            #   property string property: "foo"
            if (token.type == KEYWORD and token.value == "property" and idx > 1 and
                    self.tokens[idx - 1].type == ELEMENT and
                    self.tokens[idx - 2].type == KEYWORD and self.tokens[idx - 2].value.endswith("property")):
                self.tokens[idx] = Token(ELEMENT, token.value, token.idx, token.column)
            if token.type == COMMENT or token.type == ICOMMENT:
                self.left_shift_comment(idx)
            if token.type == ICOMMENT and idx > 1:
                self.move_inline_comments(idx)

    def left_shift_comment(self, idx):
        """
        Change the value of multiline-tokens so they look like they were
        defined on column 1 instead of wherever they were.
        """
        token = self.tokens.pop(idx)
        if token.column < 1:
            self.tokens.insert(idx, token)
            return
        rx = re.compile(r"^[ \t]{{{}}}".format(token.column), re.MULTILINE)
        newval = rx.sub("", token.value)
        self.tokens.insert(idx, Token(token.type, newval, token.idx, token.column))

    def move_inline_comments(self, start_idx):
        """
        Move inline comments ahead of their parent KEYWORD. This way they get
        properly handed over to the Qml* object type handlers which can do with
        them as they wish.
        """
        # Iterate backwards looking for a KEYWORD. As a sanity measure we only
        # search back up to 20 tokens or until an "invalid" token is found.
        end_idx = max(start_idx - 20, 0)
        for idx, token in enumerate(self.tokens[start_idx - 1:end_idx:-1]):
            if token.type == KEYWORD:
                ins_idx = start_idx - idx - 1
                if ins_idx <= 0:
                    return
                break
            if token.type in (COMMENT, ICOMMENT, IMPORT, PRAGMA):
                return
        else:
            # No parent KEYWORD found within range: leave the comment where it
            # is (and avoid reading ins_idx while it is unbound).
            return

        # Final sanity check for a misplaced inline comment
        previous_token = self.tokens[ins_idx - 1]
        if previous_token.type == ICOMMENT or is_doxy_comment_token(previous_token):
            return

        self.tokens.insert(ins_idx, self.tokens.pop(start_idx))

    def append_token(self, type, value):
        self.tokens.append(Token(type, value, self.idx, self.column))

    def set_position(self, idx):
        self.idx = idx
        newline = self.text.rfind("\n", 0, idx)
        if newline == -1:
            self.column = idx
        else:
            self.column = idx - newline - 1
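
# Minimal usage sketch (not part of the original module): a hypothetical
# driver showing how the Lexer above can be exercised. The sample QML snippet
# and the printed output format are illustrative assumptions only.
if __name__ == "__main__":
    sample = (
        "import QtQuick 2.0\n"
        "\n"
        "Rectangle {\n"
        "    /// The rectangle side length, in pixels\n"
        "    property int side: 100\n"
        "}\n"
    )
    lexer = Lexer(sample)
    try:
        lexer.tokenize()
    except LexerError as exc:
        # LexerError carries the text offset at which no tokenizer matched.
        print("Lexing failed at offset {}".format(exc.idx))
    else:
        for token in lexer.tokens:
            print("{}:{} {} {!r}".format(token.idx, token.column, token.type, token.value))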