File indexing completed on 2024-07-14 15:05:09

0001 # -*- coding: utf-8 -*-
0002 #     Copyright 2007-8 Jim Bublitz <jbublitz@nwinternet.com>
0003 #     Copyright 2009 Simon Edwards <simon@simonzone.com>
0004 #
0005 # This program is free software; you can redistribute it and/or modify
0006 # it under the terms of the GNU General Public License as published by
0007 # the Free Software Foundation; either version 2 of the License, or
0008 # (at your option) any later version.
0009 #
0010 # This program is distributed in the hope that it will be useful,
0011 # but WITHOUT ANY WARRANTY; without even the implied warranty of
0012 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0013 # GNU General Public License for more details.
0014 #
0015 # You should have received a copy of the GNU General Public License
0016 # along with this program; if not, write to the
0017 # Free Software Foundation, Inc.,
0018 # 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
0019 
0020 import ply.lex as lex
0021 
class CppLexerClass(object):
    """Rule set for a PLY lexer that tokenizes C++ header source.

    An instance is passed to ``ply.lex.lex(object=...)``, which collects the
    ``tokens`` and ``states`` tuples plus every rule attribute whose name
    starts with ``t_``.

    Two PLY conventions shape this class:

    * In a rule *method* the leading string literal is the rule's regular
      expression (PLY reads it from ``__doc__``).  Explanations for those
      methods are therefore written as ``#`` comments, never as docstrings.
    * Rule methods are tried in definition order, before all string rules;
      string rules are tried longest pattern first.  Do not reorder the
      rule methods without checking which rule should win.
    """

    def __init__(self):
        """Start with empty brace/paren bookkeeping and no known macros."""
        # Lexer states to restore when the '}' matching a code block is seen.
        self.codeStack = []
        # One bool per open '(' seen in the 'macro' state; True marks the
        # parenthesis that opened the macro call itself.
        self.parenStack = []
        # Identifiers that t_ID reports as BAREMACRO.
        self._bareMacros = []
        # Identifiers that t_ID reports as MACROCALL (entering 'macro' state).
        self._macros = []

    def setBareMacros(self, macroList):
        """Set the identifiers lexed as BAREMACRO.

        macroList -- list of strings
        """
        self._bareMacros = macroList

    def setMacros(self, macroList):
        """Set the identifiers lexed as MACROCALL.

        macroList -- list of strings
        """
        self._macros = macroList

    # Additional lexer states (PLY always provides 'INITIAL' as well).
    states = (
        ('function', 'inclusive'),
        ('macro', 'exclusive'),
        ('operator', 'inclusive'),
        ('variable', 'inclusive'),
        ('stmt', 'exclusive'),
        ('enum', 'inclusive'),
    )

    # Keyword groups.  t_ID uses these tuples to classify identifiers.
    accessSpecifiers = ("private", "protected", "public", "slots", "signals")

    # Keywords reported under their own name as the token type.
    edges = ("class", "struct", "union", "template", "enum", "namespace",
             "typedef", "operator", "typename")

    # Reported as a single STORAGE token type.
    storageQualifiers = ("auto", "register", "static", "extern", "mutable")

    functionQualifiers = ("virtual", "explicit")

    # Reported as a single CVQUAL token type.
    cvQualifiers = ("const", "volatile")

    cppScalarTypes = ("int", "char", "float", "double", "long", "short",
                      "unsigned", "signed", "bool", "void", "wchar_t")

    # Operator token names.  LSHIFT, RSHIFT and RSHIFTEQUAL are not defined,
    # so '<<' and '>>' are lexed as two LT / two GT tokens.
    operators = (
        # + - / % | ^ || && ! <= >= == !=
        'PLUS', 'MINUS', 'SLASH', 'PERCENT', 'VBAR', 'CARET',
        'LOR', 'LAND', 'BANG', 'LE', 'GE', 'EQ', 'NE',
        # Increment/decrement (++, --)
        'PLUSPLUS', 'MINUSMINUS',
        # Assignment (*=, /=, %=, +=, -=, <<=, &=, ^=, |=)
        'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL',
        'ANDEQUAL', 'XOREQUAL', 'OREQUAL', 'new', 'static_cast',
    )

    # Documentation comment tokens.
    doc = ('DOC', 'UPDOC')

    tokens = accessSpecifiers + edges + functionQualifiers + cppScalarTypes + operators + doc + (
        # Literals (identifier, integer, hex, float, string and char constants)
        'ID', 'ICONST', 'HEXCONST', 'FCONST', 'SCONST', 'CCONST', 'CVQUAL',
        'STORAGE', 'PURESFX', 'CODE_TOKEN', 'STMT_BEGIN', 'STMT_END', 'CODE_STMT_BEGIN',

        # Expressions we don't parse
        'ARRAYOP', 'FUNCPTR', 'BAREMACRO', 'MACROCALL', 'MACRO_ELEMENT',
        'MACRO_CALL_BEGIN', 'MACRO_CALL_END',

        # Structure dereference (->)
        'ARROW',

        # Treated separately from the other operators
        'EQUALS', 'ASTERISK', 'AMPERSAND', 'TILDE', 'LT', 'GT',

        # Delimiters ( ) { } , . ; : ::
        # (no bracket tokens: '[...]' is consumed whole by ARRAYOP)
        'LPAREN', 'RPAREN',
        'LBRACE', 'RBRACE',
        'COMMA', 'PERIOD', 'SEMI', 'COLON', 'COLON2',

        # Ellipsis (...)
        'ELLIPSIS',
    )

    # Characters skipped in every state: space, tab, form feed.
    t_ANY_ignore = ' \t\x0c'

    # Operators
    t_PLUS = r'\+'
    t_MINUS = r'-'
    t_ASTERISK = r'\*'
    t_SLASH = r'/'
    t_PERCENT = r'%'
    t_VBAR = r'\|'
    t_AMPERSAND = r'&'
    t_TILDE = r'~'
    t_CARET = r'\^'
    t_LOR = r'\|\|'
    t_LAND = r'&&'
    t_BANG = r'!'
    t_LT = r'<'
    t_GT = r'>'
    t_LE = r'<='
    t_GE = r'>='
    t_EQ = r'=='
    t_NE = r'!='

    # Assignment operators
    t_EQUALS = r'='
    t_TIMESEQUAL = r'\*='
    t_DIVEQUAL = r'/='
    t_MODEQUAL = r'%='
    t_PLUSEQUAL = r'\+='
    t_MINUSEQUAL = r'-='
    t_LSHIFTEQUAL = r'<<='
    t_ANDEQUAL = r'&='
    t_OREQUAL = r'\|='
    # The caret must be escaped: a bare '^' is the start-of-input anchor, so
    # the pattern would never match '^=' inside the input and would instead
    # match a lone '=' at offset 0.
    t_XOREQUAL = r'\^='

    # Increment/decrement
    t_PLUSPLUS = r'\+\+'
    t_MINUSMINUS = r'--'

    # ->
    t_ARROW = r'->'

    # Delimiters
    t_LPAREN = r'\('

    def t_macro_LPAREN(self, t):
        r'\('
        # The first '(' after a macro name opens the call; any deeper one is
        # an ordinary parenthesis.  The flag pushed here tells
        # t_macro_RPAREN which kind of ')' closes it.
        opensCall = not self.parenStack
        t.type = 'MACRO_CALL_BEGIN' if opensCall else 'LPAREN'
        self.parenStack.append(opensCall)
        return t

    t_RPAREN = r'\)'

    def t_macro_RPAREN(self, t):
        r'\)'
        if not self.parenStack:
            # Unbalanced ')': there is no open macro call to close.  Report
            # an ordinary RPAREN and leave the 'macro' state the same way a
            # finished call does, instead of raising IndexError.
            t.type = 'RPAREN'
            t.lexer.begin('variable')
            return t

        if self.parenStack.pop():
            # This ')' matches the '(' that opened the macro call.
            t.type = 'MACRO_CALL_END'
            t.lexer.begin('variable')
        else:
            t.type = 'RPAREN'
        return t

    def t_macro_MACRO_ELEMENT(self, t):
        r'[^\(\)\s]+'
        # The pattern excludes all whitespace, so the value never contains a
        # newline and there is no line count to update here.
        return t

    def t_ANY_LBRACE(self, t):
        r'\{'
        currentState = t.lexer.lexstate
        if currentState in ('operator', 'function'):
            # Start of a function/operator body: remember the state to
            # return to and lex the body in the 'stmt' state.
            t.type = 'STMT_BEGIN'
            self.codeStack.append(currentState)
            t.lexer.begin('stmt')
        elif currentState == 'stmt':
            # Nested block inside a body.
            t.type = 'CODE_STMT_BEGIN'
            self.codeStack.append('stmt')
        else:
            t.type = 'LBRACE'
        return t

    def t_ANY_RBRACE(self, t):
        r'\}'
        inCodeState = t.lexer.lexstate in ('stmt', 'function', 'operator')
        if inCodeState and self.codeStack:
            # Close a block opened by t_ANY_LBRACE and restore its state.
            t.lexer.begin(self.codeStack.pop())
            t.type = 'STMT_END'
        else:
            t.type = 'RBRACE'
        return t

    def t_stmt_CODE_TOKEN(self, t):
        r'[^{}\s]+'
        # Whitespace is excluded by the pattern, so no newline can occur in
        # the value; line counting is left to t_ANY_NEWLINE.
        return t

    t_COMMA = r','
    t_PERIOD = r'\.'
    t_SEMI = r';'
    t_COLON = r':'
    t_ELLIPSIS = r'\.\.\.'
    t_COLON2 = r'::'

    # Hex literal.  The prefix class is [xX]; a '|' inside the brackets
    # would be a literal character and make '0|1F' lex as a hex constant.
    def t_HEXCONST(self, t):
        r'0[xX][\da-fA-F]+'
        t.type = 'HEXCONST'
        return t

    # Integer literal with up to two optional u/U/l/L suffix characters
    t_ICONST = r'\d+([uUlL])?([uUlL])?'

    # Floating literal.  The blanks around '|' are insignificant because PLY
    # compiles rule patterns with re.VERBOSE by default.
    t_FCONST = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'

    # '0;' in the 'function' and 'operator' states
    t_function_PURESFX = r'0;'

    t_operator_PURESFX = r'0;'

    # Array operator: from '[' up to the last ']' on the same line
    t_ARRAYOP = r'[[].*[]]'

    # Function pointer
    t_FUNCPTR = r'\(\s*\*'

    # String literal
    t_SCONST = r'\"([^\\\n]|(\\.))*?\"'

    # Character constant 'c' or L'c'
    t_CCONST = r'(L)?\'([^\\\n]|(\\.))*?\''

    # Some things we ignore.  These rules return nothing, so PLY discards
    # the matched text.
    def t_friend_class(self, t):
        r'friend\s+class\s+[^;]*;?'
        t.lexer.lineno += t.value.count("\n")

    def t_friend(self, t):
        r'friend\s'
        # Drop the keyword itself; whatever follows is lexed normally.

    def t_using(self, t):
        r'using\s+.*;?'
        t.lexer.lineno += t.value.count('\n')

    def t_inline(self, t):
        r'inline\s+'
        # Drop the keyword itself; whatever follows is lexed normally.

    # Newlines (plain or backslash-continued) only advance the line counter.
    def t_ANY_NEWLINE(self, t):
        r'(\\\n|\n)+'
        t.lexer.lineno += t.value.count("\n")

    def t_ID(self, t):
        r'[A-Za-z_][\w_]*'
        word = t.value
        if word in ('class', 'namespace'):
            t.lexer.begin('variable')

        if word in CppLexerClass.edges or word in ('new', 'static_cast'):
            # Keywords are reported under their own name.
            t.type = word
            if word == 'operator':
                t.lexer.begin('operator')
            elif word == 'enum' and t.lexer.lexstate != 'function':
                t.lexer.begin('enum')
        elif word in CppLexerClass.cppScalarTypes:
            t.type = word
        elif word in CppLexerClass.accessSpecifiers:
            t.type = word
        elif word in CppLexerClass.storageQualifiers:
            t.type = "STORAGE"
        elif word in CppLexerClass.functionQualifiers:
            t.type = word
        elif word in CppLexerClass.cvQualifiers:
            t.type = "CVQUAL"
        elif word in self._bareMacros:
            t.type = "BAREMACRO"
        elif word in self._macros:
            # The macro's argument list is lexed in the 'macro' state.
            t.type = "MACROCALL"
            t.lexer.begin('macro')
        # Anything else keeps the default token type of this rule (ID).
        return t

    # Capture inline documentation.  The '<' forms are reported as UPDOC and
    # are only active in the 'enum' state; they are defined before the
    # general doc-comment rules so that they are tried first.
    def t_enum_BackDO2COMMENT(self, t):
        r'/\*\*\<(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')
        t.type = 'UPDOC'
        return t

    def t_enum_BackDO2CPPCOMMENT(self, t):
        r'///\<[^\n]*\n'
        t.lexer.lineno += t.value.count('\n')
        if t.lexer.lexstate == 'enum':
            t.type = 'UPDOC'
            return t

    def t_ANY_DO2COMMENT(self, t):
        r'/\*\*(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')
        t.type = 'DOC'
        return t

    def t_ANY_DO2CPPCOMMENT(self, t):
        r'///[^\n]*\n'
        t.lexer.lineno += t.value.count('\n')
        # Only the 'enum' state reports '///' comments; elsewhere nothing is
        # returned and the comment is discarded.
        if t.lexer.lexstate == 'enum':
            t.type = 'DOC'
            return t

    # Ordinary comments are discarded; only the line counter is updated.
    def t_ANY_comment(self, t):
        r'/\*(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')

    def t_ANY_ignore_cppcomment(self, t):
        r'//[^\n]*\n'
        t.lexer.lineno += t.value.count('\n')

    # NOTE: preprocessor directives have no rule of their own (an earlier
    # t_preprocessor rule was disabled), so '#' falls through to the error
    # handler below.

    def t_ANY_error(self, t):
        # Report the offending character and resume lexing just after it.
        print("Illegal character %s" % repr(t.value[0]))
        t.lexer.skip(1)
0339 
# Module-level alias of the lexer's token-name tuple.
# NOTE(review): presumably imported by the companion PLY parser module,
# which needs a `tokens` list -- confirm against callers.
tokens = CppLexerClass.tokens
0341 
def CppLexer():
    """Build and return a PLY lexer backed by a fresh CppLexerClass.

    The rule object is also stored on the returned lexer as ``lexmodule``,
    so callers can reach it (for example to call ``setMacros`` or
    ``setBareMacros``) through the lexer they were given.
    """
    rules = CppLexerClass()
    builtLexer = lex.lex(object=rules)
    # Keep a handle on the rule object alongside the lexer built from it.
    builtLexer.lexmodule = rules
    return builtLexer
0347