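# Regex-driven lexer for QML source. Each Tokenizer pairs a token type with a
# regular expression; Lexer.tokenize() skips whitespace (advance), tries the
# line-start tokenizers when at the beginning of a line and the general
# tokenizers otherwise (apply_tokenizers), then post-processes comment tokens
# (fixup_tokens). A usage sketch appears at the bottom of the file.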
from collections import namedtuple
import re


COMMENT = "comment"
ICOMMENT = "inline_comment"
STRING = "string"
ELEMENT = "element"
BLOCK_START = "block_start"
BLOCK_END = "block_end"
ARRAY_START = "array_start"
ARRAY_END = "array_end"
CHAR = "char"
KEYWORD = "keyword"
IMPORT = "import"
PRAGMA = "pragma"
COMPONENT = "component"
ATTRIBUTE = "attribute"
ELLIPSES = "ellipses"

# not a doxy comment
PLAIN_COMMENT_RX = re.compile("/[/*][^/!*]")


def is_doxy_comment_token(token):
    return token.type == COMMENT and not PLAIN_COMMENT_RX.match(token.value)


class LexerError(Exception):
    def __init__(self, msg, idx):
        Exception.__init__(self, msg)
        self.idx = idx


Token = namedtuple("Token", ["type", "value", "idx", "column"])


class Tokenizer(object):
    def __init__(self, token_type, rx):
        self.token_type = token_type
        self.rx = rx

    def __call__(self, lexer, matched_str):
        lexer.append_token(self.token_type, matched_str)


class Lexer(object):
    def __init__(self, text):
        # Tokens that start at the first non-whitespace character in a line
        self.tokenizers_newline = [
            Tokenizer(COMPONENT, re.compile(r"([-\w\.]+)\s*{")),  # a component
            Tokenizer(ATTRIBUTE, re.compile(r"([-\w\.]+)\s*:")),  # an attribute
        ]

        self.tokenizers = [
            Tokenizer(ICOMMENT, re.compile(r"/\*[!*]<.*?\*/", re.DOTALL)),
            Tokenizer(ICOMMENT, re.compile(r"//[/!]<[^\n]*(?:\n[ \t]*//[/!]<[^\n]*)*")),
            Tokenizer(COMMENT, re.compile(r"/\*.*?\*/", re.DOTALL)),
            Tokenizer(COMMENT, re.compile(r"//[^\n]*(?:\n[ \t]*//[^\n]*)*")),
            # A double or single quote, then either:
            # - anything but a double quote or a backslash
            # - an escaped char (\n, \t...)
            # then a double or single quote
            Tokenizer(STRING, re.compile(r'("([^\\"]|(\\.))*"|\'([^\\\']|(\\.))*\')')),
            Tokenizer(BLOCK_START, re.compile(r"(?<!')\{(?!')")),
            Tokenizer(BLOCK_END, re.compile(r"(?<!')\}(?!')")),
            Tokenizer(ARRAY_START, re.compile(r"\[")),
            Tokenizer(ARRAY_END, re.compile(r"\]")),
            Tokenizer(IMPORT, re.compile(r"import\s+.*")),
            Tokenizer(PRAGMA, re.compile(r"pragma\s+\w.*")),
            Tokenizer(KEYWORD, re.compile(r"(default\s+property|property|readonly\s+property|signal|enum)\s+")),
            Tokenizer(KEYWORD, re.compile(r"(function)\s+[^(]")),  # a named function
            Tokenizer(ELLIPSES, re.compile(r"\.\.\.")),
            Tokenizer(ELEMENT, re.compile(r"\w[\w.<>]*")),
            Tokenizer(CHAR, re.compile(".")),
        ]
        self.text = text.replace('\\\n', '\n')
        self.idx = 0
        self.column = 0
        self.newline = False
        self.tokens = []

    def tokenize(self):
        while True:
            self.advance()
            if self.idx == len(self.text):
                break
            self.apply_tokenizers()
        self.fixup_tokens()

    def advance(self):
        self.newline = False
        if self.idx == 0:
            # Process start-of-file as newline.
            self.newline = True

        while self.idx < len(self.text):
            if self.text[self.idx] == '\n':
                self.newline = True
                self.idx += 1
                self.column = 0
            elif self.text[self.idx].isspace():
                self.idx += 1
                self.column += 1
            else:
                break

    def apply_tokenizers(self):
        if self.newline:
            for tokenizer in self.tokenizers_newline:
                match = tokenizer.rx.match(self.text, self.idx)

                if not match:
                    continue

                if len(match.groups()) > 0:
                    tokenizer(self, match.group(1))
                    self.set_position(match.end(1))
                    return
                else:
                    tokenizer(self, match.group(0))
                    self.set_position(match.end(0))
                    return

        for tokenizer in self.tokenizers:
            match = tokenizer.rx.match(self.text, self.idx)

            if not match:
                continue

            if len(match.groups()) > 0:
                tokenizer(self, match.group(1))
                self.set_position(match.end(1))
                return
            else:
                tokenizer(self, match.group(0))
                self.set_position(match.end(0))
                return

        raise LexerError("No lexer matched", self.idx)

    def fixup_tokens(self):
        for idx, token in enumerate(self.tokens):
            # Fix tokenization of a property named "property". For example:
            #   property string property: "foo"
            if (token.type == KEYWORD and token.value == "property" and idx > 1 and
                    self.tokens[idx - 1].type == ELEMENT and
                    self.tokens[idx - 2].type == KEYWORD and self.tokens[idx - 2].value.endswith("property")):
                self.tokens[idx] = Token(ELEMENT, token.value, token.idx, token.column)
            if token.type == COMMENT or token.type == ICOMMENT:
                self.left_shift_comment(idx)
            if token.type == ICOMMENT and idx > 1:
                self.move_inline_comments(idx)

    def left_shift_comment(self, idx):
        """
        Change the value of multiline-tokens so they look like they were
        defined on column 1 instead of wherever they were.
        """
        token = self.tokens.pop(idx)
        if token.column < 1:
            self.tokens.insert(idx, token)
            return
        rx = re.compile(r"^[ \t]{{{}}}".format(token.column), re.MULTILINE)
        newval = rx.sub("", token.value)
        self.tokens.insert(idx, Token(token.type, newval, token.idx, token.column))

    def move_inline_comments(self, start_idx):
        """
        Move inline comments ahead of their parent KEYWORD. This way they get
        properly handed over to the Qml* object type handlers which can do with
        them as they wish.
        """
        # Iterate backwards looking for a KEYWORD. As a sanity measure we only
        # search back up to 20 tokens or until an "invalid" token is found.
        end_idx = max(start_idx - 20, 0)
        for idx, token in enumerate(self.tokens[start_idx - 1:end_idx:-1]):
            if token.type == KEYWORD:
                ins_idx = start_idx - idx - 1
                if ins_idx <= 0:
                    return
                break
            if token.type in (COMMENT, ICOMMENT, IMPORT, PRAGMA):
                return
        else:
            # No parent KEYWORD found within range: leave the comment where it
            # is (and avoid reading ins_idx while it is unbound).
            return

        # Final sanity check for a misplaced inline comment
        previous_token = self.tokens[ins_idx - 1]
        if previous_token.type == ICOMMENT or is_doxy_comment_token(previous_token):
            return

        self.tokens.insert(ins_idx, self.tokens.pop(start_idx))

    def append_token(self, type, value):
        self.tokens.append(Token(type, value, self.idx, self.column))

    def set_position(self, idx):
        self.idx = idx
        newline = self.text.rfind("\n", 0, idx)
        if newline == -1:
            self.column = idx
        else:
            self.column = idx - newline - 1
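
# Minimal usage sketch (not part of the original module): a hypothetical
# driver showing how the Lexer above can be exercised. The sample QML snippet
# and the printed output format are illustrative assumptions only.
if __name__ == "__main__":
    sample = (
        "import QtQuick 2.0\n"
        "\n"
        "Rectangle {\n"
        "    /// The rectangle side length, in pixels\n"
        "    property int side: 100\n"
        "}\n"
    )
    lexer = Lexer(sample)
    try:
        lexer.tokenize()
    except LexerError as exc:
        # LexerError carries the text offset at which no tokenizer matched.
        print("Lexing failed at offset {}".format(exc.idx))
    else:
        for token in lexer.tokens:
            print("{}:{} {} {!r}".format(token.idx, token.column, token.type, token.value))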