File indexing completed on 2024-12-08 04:20:20
# -*- coding: utf-8 -*-
#     Copyright 2007-8 Jim Bublitz <jbublitz@nwinternet.com>
#     Copyright 2009 Simon Edwards <simon@simonzone.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

import ply.lex as lex


class CppLexerClass(object):
    """Token rules for a PLY lexer that tokenizes C++ source.

    The class is handed to ``lex.lex(object=...)`` (see ``CppLexer``), which
    collects every ``t_*`` attribute as a rule.  NOTE: for rule *functions*
    PLY takes the regular expression from the function's docstring, so those
    functions are documented with ``#`` comments instead of docstrings.
    """

    def __init__(self):
        """Create the per-lexer bookkeeping used by the rule functions."""
        # Lexer states saved when a '{' opens a statement body; popped on '}'.
        self.codeStack = []
        # One bool per open '(' seen in the 'macro' state; True marks the
        # parenthesis that opened the macro call itself.
        self.parenStack = []
        # Identifiers reported as BAREMACRO / MACROCALL by t_ID.
        self._bareMacros = []
        self._macros = []

    def setBareMacros(self, macroList):
        """Set the identifiers that t_ID reports as BAREMACRO.

        macroList - List of strings
        """
        self._bareMacros = macroList

    def setMacros(self, macroList):
        """Set the identifiers that t_ID reports as MACROCALL.

        macroList - List of strings
        """
        self._macros = macroList

    states = (('function', 'inclusive'), ('macro', 'exclusive'),
              ('operator', 'inclusive'), ('variable', 'inclusive'),
              ('stmt', 'exclusive'), ('enum', 'inclusive'))

    accessSpecifiers = ("private", "protected", "public", "slots", "signals")

    edges = ("class", "struct", "union", "template", "enum", "namespace",
             "typedef", "operator", 'typename')

    storageQualifiers = ("auto", "register", "static", "extern", "mutable")

    functionQualifiers = ("virtual", "explicit")

    cvQualifiers = ("const", "volatile")

    cppScalarTypes = ("int", "char", "float", "double", "long", "short", "unsigned",
                      "signed", "bool", "void", "wchar_t")

    # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=)
    operators = ('PLUS', 'MINUS', 'SLASH', 'PERCENT', 'VBAR', 'CARET',  #'LSHIFT', 'RSHIFT',
                 'LOR', 'LAND', 'BANG', 'LE', 'GE', 'EQ', 'NE',
                 # Increment/decrement (++,--)
                 'PLUSPLUS', 'MINUSMINUS',
                 # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=)
                 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL',
                 'LSHIFTEQUAL',  #'RSHIFTEQUAL',
                 'ANDEQUAL', 'XOREQUAL', 'OREQUAL', 'new', 'static_cast'
                 )

    doc = ('DOC', 'UPDOC')

    tokens = accessSpecifiers + edges + functionQualifiers + cppScalarTypes + operators + doc + (
        # Literals (identifier, integer constant, float constant, string constant, char const)
        'ID', 'ICONST', 'HEXCONST', 'FCONST', 'SCONST', 'CCONST', 'CVQUAL',
        'STORAGE', 'PURESFX', 'CODE_TOKEN', 'STMT_BEGIN', 'STMT_END', 'CODE_STMT_BEGIN',

        # Expressions we don't parse
        'ARRAYOP', 'FUNCPTR', 'BAREMACRO', 'MACROCALL', 'MACRO_ELEMENT', 'MACRO_CALL_BEGIN', 'MACRO_CALL_END',

        # Structure dereference (->)
        'ARROW',

        # Treat separately from other operators
        'EQUALS', 'ASTERISK', 'AMPERSAND', 'TILDE', 'LT', 'GT',

        # Delimiters ( ) [ ] { } , . ; : ::
        'LPAREN', 'RPAREN',
        #'LBRACKET', 'RBRACKET',
        'LBRACE', 'RBRACE',
        'COMMA', 'PERIOD', 'SEMI', 'COLON', 'COLON2',

        # Ellipsis (...)
        'ELLIPSIS'
    )

    # Completely ignored characters
    t_ANY_ignore = ' \t\x0c'
    #t_ANY_ignore_typename = 'typename'

    # Operators
    t_PLUS             = r'\+'
    t_MINUS            = r'-'
    t_ASTERISK         = r'\*'
    t_SLASH            = r'/'
    t_PERCENT          = r'%'
    t_VBAR             = r'\|'
    t_AMPERSAND        = r'&'
    t_TILDE            = r'~'
    t_CARET            = r'\^'
    #t_LSHIFT           = r'<<'
    #t_RSHIFT           = r'>>'
    t_LOR              = r'\|\|'
    t_LAND             = r'&&'
    t_BANG             = r'!'
    t_LT               = r'<'
    t_GT               = r'>'
    t_LE               = r'<='
    t_GE               = r'>='
    t_EQ               = r'=='
    t_NE               = r'!='

    # Assignment operators

    t_EQUALS           = r'='
    t_TIMESEQUAL       = r'\*='
    t_DIVEQUAL         = r'/='
    t_MODEQUAL         = r'%='
    t_PLUSEQUAL        = r'\+='
    t_MINUSEQUAL       = r'-='
    t_LSHIFTEQUAL      = r'<<='
    #t_RSHIFTEQUAL      = r'>>='
    t_ANDEQUAL         = r'&='
    t_OREQUAL          = r'\|='
    # The caret must be escaped: a bare '^' is the start-of-input anchor, so
    # the unescaped pattern could never match the two characters "^=".
    t_XOREQUAL         = r'\^='

    # Increment/decrement
    t_PLUSPLUS         = r'\+\+'
    t_MINUSMINUS       = r'--'

    # ->
    t_ARROW            = r'->'

    # Delimiters
    t_LPAREN           = r'\('

    # '(' while in the 'macro' state: the first one opens the macro call
    # (MACRO_CALL_BEGIN); nested ones are plain LPAREN.  The pushed flag tells
    # t_macro_RPAREN which kind of ')' closes it.
    def t_macro_LPAREN(self, t):
        r'\('
        if not self.parenStack:
            t.type = 'MACRO_CALL_BEGIN'
            self.parenStack.append(True)
        else:
            t.type = 'LPAREN'
            self.parenStack.append(False)

        return t

    t_RPAREN           = r'\)'

    # ')' while in the 'macro' state: closing the outermost parenthesis ends
    # the macro call and switches the lexer to the 'variable' state.
    # NOTE(review): pop() raises IndexError if ')' arrives with no '(' recorded
    # in this state; left unchanged because the intended recovery is unclear.
    def t_macro_RPAREN(self, t):
        r'\)'
        if self.parenStack.pop():
            t.type = 'MACRO_CALL_END'
            t.lexer.begin('variable')
        else:
            t.type = 'RPAREN'
        return t

    # Any run of non-parenthesis, non-whitespace text inside a macro call.
    def t_macro_MACRO_ELEMENT(self, t):
        r'[^\(\)\s]+'
        # The pattern excludes whitespace, so this adds 0; kept for parity
        # with the other rules.
        t.lexer.lineno += t.value.count('\n')
        return t

    #t_LBRACKET         = r'\['
    #t_RBRACKET         = r'\]'

    # '{' in any state.  In 'operator'/'function' it starts a statement body:
    # the current state is saved and the lexer enters 'stmt'.  Inside 'stmt' it
    # is a nested block.  Everywhere else it is an ordinary LBRACE.
    def t_ANY_LBRACE(self, t):
        r'\{'
        if t.lexer.lexstate in ['operator', 'function']:
            t.type = 'STMT_BEGIN'
            self.codeStack.append(t.lexer.lexstate)
            t.lexer.begin('stmt')
        elif t.lexer.lexstate == 'stmt':
            t.type = 'CODE_STMT_BEGIN'
            self.codeStack.append('stmt')
        else:
            t.type = 'LBRACE'
        return t

    # '}' in any state.  When a saved state exists (and we are in a code-ish
    # state) restore it and emit STMT_END; otherwise emit a plain RBRACE.
    def t_ANY_RBRACE(self, t):
        r'\}'
        if t.lexer.lexstate in ['stmt', 'function', 'operator']:
            if self.codeStack:
                t.lexer.begin(self.codeStack.pop())
                t.type = 'STMT_END'
            else:
                t.type = 'RBRACE'
        else:
            t.type = 'RBRACE'
        return t

    # Any run of non-brace, non-whitespace text inside a statement body.
    def t_stmt_CODE_TOKEN(self, t):
        r'[^{}\s]+'
        if '\n' in t.value:
            t.lexer.lineno += t.value.count('\n')
        return t

    t_COMMA            = r','
    t_PERIOD           = r'\.'
    t_SEMI             = r';'
    t_COLON            = r':'
    t_ELLIPSIS         = r'\.\.\.'
    t_COLON2           = r'::'

    # Hex Literal
    # Uses [xX] rather than [x|X]: inside a character class '|' is a literal
    # character, so the old pattern also accepted text such as "0|1f".
    def t_HEXCONST(self, t):
        r'0[xX][\da-fA-F]+'
        t.type = 'HEXCONST'
        return t

    # Integer literal
    t_ICONST = r'\d+([uUlL])?([uUlL])?'  #r'(0(?![x|X])|[1-9])\d*([uU]|[lL]|[uU][lL]|[lL][uU])?'

    # Floating literal
    t_FCONST = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'

    t_function_PURESFX = r'0;'

    t_operator_PURESFX = r'0;'

    # Array operator
    t_ARRAYOP = r'[[].*[]]'

    # Function pointer
    t_FUNCPTR = r'\(\s*\*'

    # String literal
    t_SCONST = r'\"([^\\\n]|(\\.))*?\"'

    # Character constant 'c' or L'c'
    t_CCONST = r'(L)?\'([^\\\n]|(\\.))*?\''
    # Newlines

    # some things we ignore (entire line)
    # These rules return nothing, so PLY discards the matched text.
    def t_friend_class(self, t):
        r'friend\s+class\s+[^;]*;?'
        t.lexer.lineno += t.value.count("\n")

    def t_friend(self, t):
        r'friend\s'
        pass

    def t_using(self, t):
        r'using\s+.*;?'
        t.lexer.lineno += t.value.count('\n')

    def t_inline(self, t):
        r'inline\s+'

    # Newlines (optionally backslash-continued) only advance the line counter.
    def t_ANY_NEWLINE(self, t):
        r'(\\\n|\n)+'
        t.lexer.lineno += t.value.count("\n")

    # Identifiers and keywords.  The matched word is reclassified by looking
    # it up in the keyword tuples above and in the configured macro lists;
    # some keywords also switch the lexer state.
    def t_ID(self, t):
        r'[A-Za-z_][\w_]*'
        if t.value in ['class', 'namespace']:
            t.lexer.begin('variable')

        if t.value in CppLexerClass.edges or t.value in ['new', 'static_cast']:
            t.type = t.value
            if t.type == 'operator':
                t.lexer.begin('operator')
            if t.type == 'enum' and t.lexer.lexstate != 'function':
                t.lexer.begin('enum')
        elif t.value in CppLexerClass.cppScalarTypes:
            t.type = t.value
        elif t.value in CppLexerClass.accessSpecifiers:
            t.type = t.value
        elif t.value in CppLexerClass.storageQualifiers:
            t.type = "STORAGE"
        elif t.value in CppLexerClass.functionQualifiers:
            t.type = t.value
        elif t.value in CppLexerClass.cvQualifiers:
            t.type = "CVQUAL"
        elif t.value in self._bareMacros:
            t.type = "BAREMACRO"
        elif t.value in self._macros:
            t.type = "MACROCALL"
            t.lexer.begin('macro')
        return t

    # Capture inline documentation
    # '/**< ... */' inside an enum documents the preceding item (UPDOC).
    def t_enum_BackDO2COMMENT(self, t):
        r'/\*\*\<(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')
        t.type = 'UPDOC'
        return t

    # '///< ...' inside an enum, same meaning as above.
    def t_enum_BackDO2CPPCOMMENT(self, t):
        r'///\<[^\n]*\n'
        t.lexer.lineno += t.value.count('\n')
        if t.lexer.lexstate == 'enum':
            t.type = 'UPDOC'
            return t

    # '/** ... */' documentation comment in any state.
    def t_ANY_DO2COMMENT(self, t):
        r'/\*\*(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')
        t.type = 'DOC'
        return t

    # '/// ...' is reported as DOC only inside an enum; elsewhere nothing is
    # returned and the comment is dropped.
    def t_ANY_DO2CPPCOMMENT(self, t):
        r'///[^\n]*\n'
        t.lexer.lineno += t.value.count('\n')
        if t.lexer.lexstate == 'enum':
            t.type = 'DOC'
            return t

    # Comments
    def t_ANY_comment(self, t):
        r' /\*(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')

    def t_ANY_ignore_cppcomment(self, t):
        r'//[^\n]*\n'
        t.lexer.lineno += t.value.count('\n')

    # Preprocessor directive (ignored)
#    def t_preprocessor(t):
#        r'\#(.)*?\n'
#        if t.value.endswith ('\\\n'):
#            data = t.lexer.lexdata [t.lexer.lexpos]
#            for i in range (t.lexer.lexpos, len (data)):
#                if data [i] == '\n':
#                    t.lineno += 1
#                    if data [i] == '\n' and data [i - 1] != '\\':
#                        break
#            t.lexer.lexpos = i + 1
#        t.lineno += 1

    # Report an unmatched character and skip over it so lexing can continue.
    def t_ANY_error(self, t):
        print("Illegal character %s" % repr(t.value[0]))
        t.lexer.skip(1)


# Module-level alias of the token names (presumably for a parser module that
# imports `tokens` from here -- confirm against callers).
tokens = CppLexerClass.tokens


def CppLexer():
    """Build and return a PLY lexer backed by a fresh CppLexerClass instance.

    The rule object is also stored on the returned lexer as ``lexmodule`` so
    callers can reach ``setMacros`` / ``setBareMacros``.
    """
    lexerClass = CppLexerClass()
    lexer = lex.lex(object=lexerClass)
    lexer.lexmodule = lexerClass
    return lexer