documentation_src/introspection/introspect.py

0001 #!/usr/bin/env python
0002 # -*- Coding:utf-8 -*-
0003
0004 # SPDX-FileCopyrightText: 2013 Sven Brauch <svenbrauch@googlemail.com>
0005 # SPDX-License-Identifier: GPL-2.0-or-later
0006 # The script output is not copyrighted, use it for whatever you want.
0007
0008 # WARNING: This script does things which can cause bad stuff to happen
0009 # to your system in case you execute it on a module which has been
# engineered to be malicious.
0011 # I thus recommend to run this script as a user with minimal privileges
0012 # (i.e. not as yourself), or even in a chroot.
0013 # In any case, I'm not responsible for any damage caused by this script.
0014
0015 # This script will dump a SINGLE MODULE to a python "header" file.
0016 # It will read one module, and give you one output file.
0017 # Any submodules of the imported object (anything with type "module")
# will be ignored by this script; if you want to dump those,
0019 # you will have to manually (or with a script) generate a directory
0020 # structure and re-run this script.
0021 import re
0022 import traceback
0023
0024 import sys
0025 import types
0026 import inspect
0027 import importlib
0028 import builtins
0029
def debugmsg(message):
    """Write *message* followed by a newline to stderr and flush at once.

    Diagnostics go to stderr because the generated code is printed to stdout.
    """
    stream = sys.stderr
    stream.write(message + "\n")
    stream.flush()
0033
def structseq_to_py(seq, name="INSERT_NAME"):
    """Print a python pseudoclass mirroring the fields of a "structseq" object.

    The fields are read from the text between the first "(" and the first ")"
    of ``str(seq)``, which is expected to look like
    ``name(key1=value1, key2=value2)``.  Every ``key=value`` item becomes one
    class attribute.

    seq  -- the structseq instance to describe
    name -- the class name used in the emitted "class" line
    """
    text = str(seq)
    text = '('.join(text.split('(')[1:])
    text = ')'.join(text.split(')')[:1])
    print("class {0}:".format(name))
    emitted = False
    for item in text.split(','):
        # Split at the first "=" only, so a value containing "=" stays intact.
        key, separator, value = item.strip().partition('=')
        if not separator:
            # Not a "key=value" item (e.g. the field list was empty): skip it
            # instead of failing on the tuple unpacking.
            continue
        print(indent("{0} = {1}".format(key, value)))
        emitted = True
    if not emitted:
        # keep the emitted class syntactically valid
        print(indent("pass"))
0044
def indent(code, depth=4):
    """Return *code* with each of its lines prefixed by *depth* spaces."""
    padding = " " * depth
    return '\n'.join(padding + line for line in code.split('\n'))
0049
def clearIndent(code):
    """Return *code* with leading and trailing whitespace removed from every line."""
    assert isinstance(code, str)
    stripped_lines = (line.strip() for line in code.split('\n'))
    return '\n'.join(stripped_lines)
0055
def syntaxValid(code):
    """Tell whether *code* compiles in 'exec' mode without a SyntaxError."""
    try:
        compile(code, "<no file>", 'exec')
    except SyntaxError:
        return False
    else:
        return True
0062
def sanitize(expr):
    """Apply a fixed series of textual substitutions to *expr*.

    The substitutions drop or rename characters and keywords (for example
    "*", brackets, blanks, "self", "from", "class", "lambda") and finally put
    "[]" after an "=" that is directly followed by "," or ")".
    """
    assert isinstance(expr, str)
    # Applied strictly in this order; each rule sees the output of the
    # rules before it.
    substitutions = (
        ('*', ''), ('[', ''), (']', ''),
        ('from', '_from'), ('class', '_class'), ('-', '_'),
        ('lambda', '_lambda'), ("raise", "_raise"),
        ('\\', 'ESC'), (' ', ''), ("<", '"'), (">", '"'),
        ("self,", ""), ("self", ""),
        (",,", ","), ('...', 'more_args'), ('+', "plus"),
    )
    cleaned = expr
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    # a dot followed by a digit becomes a single underscore
    cleaned = re.sub(r"\.\d", "_", cleaned)
    cleaned = cleaned.replace("=,", "=[],")
    cleaned = cleaned.replace("=)", "=[])")
    return cleaned
0077
def strict_sanitize(expr):
    """Run sanitize() on *expr* and then remove further special characters.

    An empty result is replaced by "_", one trailing "." is cut off, and a
    result starting with a digit gets a leading "_".
    """
    assert isinstance(expr, str)
    cleaned = sanitize(expr)
    for token in ("=()", '(', ')', '"', "'", " ", ",", "|", "%", '#', '{', '}'):
        cleaned = cleaned.replace(token, "")
    if not cleaned:
        cleaned = "_"
    if cleaned.endswith('.'):
        cleaned = cleaned[:-1]
    # NOTE(review): this comparison cannot succeed at this point, because a
    # lone "." has just been reduced to the empty string above; such input
    # therefore yields "" rather than "None".
    if cleaned == ".":
        return "None"
    if cleaned and cleaned[0].isdigit():
        cleaned = "_" + cleaned
    return cleaned
0093
def isSpace(char):
    """Tell whether *char* is a blank or a tab character."""
    return char in (' ', '\t')
0096
def removeAtCorner(string, char, direction):
    """Strip blanks/tabs and then a run of *char* from one end of *string*.

    string    -- the text to process
    char      -- the single character to strip after the blanks and tabs
    direction -- '>' works on the beginning of the string (scanning towards
                 the right), '<' works on its end (scanning towards the left)

    Returns the remaining text; the empty string if nothing remains.
    """
    assert direction in ['<', '>']
    blanks = (' ', '\t')
    length = len(string)
    if direction == '>':
        start = 0
        while start < length and string[start] in blanks:
            start += 1
        while start < length and string[start] == char:
            start += 1
        # Slice from the first character that is kept.  (Slicing one position
        # earlier would retain the last stripped character, or wrap around to
        # the end of the string when nothing was stripped at all.)
        return string[start:]
    end = length
    while end > 0 and string[end - 1] in blanks:
        end -= 1
    while end > 0 and string[end - 1] == char:
        end -= 1
    return string[:end]
0116
# Maps type names as they tend to be spelled in docstrings to the type name
# that is written into the generated code instead.
likely_substitutions = {
    "integer": "int",
    "string": "str",
    "long": "int",
    "dictionary": "dict",
    "double": "float",
    "scalar": "float",
    "array_like": "ndarray"
}
0126
def do_type_subst(t):
    """Return the entry of likely_substitutions for *t*, or *t* itself if there is none."""
    return likely_substitutions.get(t, t)
0131
def get_indent(string):
    """Count the leading blanks and tabs of the first line of *string*.

    Returns 0 if that line is empty or consists of blanks and tabs only.
    """
    first_line = string.split("\n")[0]
    remainder = first_line.lstrip(" \t")
    if not remainder:
        return 0
    return len(first_line) - len(remainder)
0141
def remove_indent(string):
    """Cut leading blanks and tabs off every line of *string*.

    The indentation of the first line determines how much is removed at most.
    Every kept line is terminated with a newline; lines without any character
    at which the cut can be made (e.g. empty lines) are left out of the
    result.  Anything that is not exactly a str is returned unchanged.
    """
    if type(string) != str:
        return string
    lines = string.split("\n")
    limit = get_indent(lines[0])
    kept = []
    for line in lines:
        for position, character in enumerate(line):
            # NOTE(review): "position > limit" lets up to limit + 1 characters
            # be removed -- confirm whether that is intended.
            if character not in (' ', '\t') or position > limit:
                kept.append(line[position:] + "\n")
                break
    return "".join(kept)
0155
def guess_return_type_from_synopsis(synopsis, root):
    """Guess a return value expression from free text mentioning "return".

    For every case-insensitive occurrence of "return" in *synopsis*, the
    words of the following 60 characters are examined up to the first word
    containing a "." (taken as the end of the sentence).  Words naming a
    container ("list", "set", ...) are remembered and wrapped around the
    first element type found afterwards.

    synopsis -- the text to scan
    root     -- object whose "module" attribute is searched for classes
                named like the words of the text

    Returns a string such as "int()", "list([str()])" or "None".
    """
    container = ""

    def apply_container(value):
        # wrap the value into the container type seen so far, if any
        if len(container) > 0:
            return "{0}([{1}])".format(container, value)
        return value

    def is_class_in(namespace, attribute):
        # true only if namespace.attribute exists and actually is a class
        return isinstance(getattr(namespace, attribute, None), type)

    for item in re.finditer("return", synopsis, re.I):
        scan = synopsis[item.start():item.end()+60]
        words = scan.split()
        if "ndarray" in words or 'array_like' in words or 'array_type' in words:
            # hack to make "complex ndarray" work properly
            return "ndarray()"
        for word in words:
            if word.find('.') != -1 and word != '...':
                break # end of sentence -- stop
            word = word.replace(',', '')
            if not word:
                # the word consisted of commas only; word[-1] below would fail
                continue
            if word in ["none", "None"]:
                return "None"
            if word in ["True", "False", "true", "false", "bool", "boolean"]:
                return apply_container("bool()")
            if word in ["dict", "dictionary"]:
                return "dict()"
            # "represenation" is kept next to the correct spelling so that
            # texts matched before still match
            if word in ["string", "str", "representation", "represenation"]:
                return "str()"
            if word in ["list", "iterable"]:
                container = "list"
                continue
            if word in ["set"]:
                container = "set"
                continue
            if word in ["number", "int", "integer"]:
                return apply_container("int()")
            if word in ["float", "ratio", "fraction"]:
                return apply_container("float()")
            if is_class_in(root.module, word):
                return apply_container(word + "()")
            if word[-1] == "s" and is_class_in(root.module, word[:-1]):
                # plural form, "list of ints"
                return apply_container(word[:-1] + "()")
            if is_class_in(builtins, word):
                return apply_container(word + "()")
    if len(container) > 0:
        return container + "()"
    return "None"
0200
def parse_numpy_like_docstring(docstring, funcname, root, needSelfArg=False):
    """Extract a parameter list and a return value from a numpy-style docstring.

    The docstring is searched for a "Parameters" heading followed by
    "name : type" lines and for a "Returns" heading followed by a
    "name : type" line.  The parameter section is considered to end at the
    next "---" underline after it.  If no parameter section is found, the
    first line of the docstring is parsed as "funcname(arg, ...)" instead.

    docstring   -- the text to parse; anything but a str yields the fallback
    funcname    -- name expected in the first line for the fallback parsing
    root        -- passed through to guess_return_type_from_synopsis
    needSelfArg -- if true, "self" is put in front of the parameters

    Returns a tuple (comma-separated parameter names, return value expression).
    """
    selflist = ["self"] if needSelfArg else []
    if type(docstring) != str:
        return "self" if needSelfArg else "", "None"

    paramListBegin = paramListEnd = False
    returnTypeBegin = False
    # number of line ends still to pass after "Parameters" (heading line and
    # underline) before a "---" counts as the end of the section
    atPartBeginning = 2
    returnType = "None"
    for offset in range(len(docstring)):
        # startswith() with an offset avoids copying the rest of the text
        if docstring.startswith("Parameters", offset):
            paramListBegin = offset
        if paramListBegin is not False and docstring[offset] == "\n" and atPartBeginning != 0:
            atPartBeginning -= 1
        if docstring.startswith("---", offset) and atPartBeginning == 0:
            paramListEnd = offset
            break
        if docstring.startswith("Returns", offset):
            returnTypeBegin = offset
    # skip the heading and its underline
    relevantPart = docstring[paramListBegin:paramListEnd].split("\n")[2:]

    if returnTypeBegin is not False:
        try:
            line = docstring[returnTypeBegin:].split('\n')[2]
            ret = line.split(' : ')[1]
            if ret.find(' or ') != -1:
                # unsure return type
                # (a real list: the result is sliced and indexed below, which
                # a map() iterator does not support)
                returnTypes = [strict_sanitize(item.split(' ')[0]) for item in ret.split(' or ')]
                returnType = ''.join(["{0}() if False else ".format(do_type_subst(t)) for t in returnTypes[:-1]]) \
                             + do_type_subst(str(returnTypes[-1])) + "()"
            else:
                if 'ndarray' in ret.split() or 'array_like' in ret.split() or 'array_type' in ret.split():
                    returnType = "ndarray()"
                else:
                    returnTypeLine = ret.split(' ')[0].split(',')[0]
                    returnType = do_type_subst(strict_sanitize(returnTypeLine)) + "()"
        except IndexError:
            # the section is not in "name : type" form; guess from the text
            returnType = guess_return_type_from_synopsis(docstring[returnTypeBegin:], root)

    if len(relevantPart):
        firstIndent = get_indent(relevantPart[0])
        parameter_name_list = []
        for line in relevantPart:
            # only lines indented like the first one can declare a parameter
            if get_indent(line) != firstIndent:
                continue
            s = line.split(' : ')
            if len(s) != 2:
                continue
            # TODO extract the parameter documentation, and display it in some way... or not
            parameter_name = strict_sanitize(s[0])
            if parameter_name.find('...') != -1:
                parameter_name = 'more'
            parameter_name = parameter_name.replace('`', '')
            parameter_name_list.append(parameter_name)
        return ', '.join(selflist + parameter_name_list), do_type_subst(returnType)

    # no parameter section: try the "funcname(arg1, arg2) -> type" first line
    try:
        firstType = docstring.split("\n")[0].split('.')[-1]
        if firstType.find(funcname) == -1:
            raise IndexError()
        firstType = firstType.split('->')[0]
        firstType = firstType.split('(')[1:]
        firstType = ')'.join('('.join(firstType).split(')')[:-1])
        paramList = firstType.split(',')
        cleanedParamList = [item for item in paramList if item.find('...') == -1]
        return ', '.join(selflist + [strict_sanitize(x) for x in cleanedParamList]), "None"
    except IndexError:
        return "self" if needSelfArg else "", "None"
0281
def parse_synopsis(funcdef, original, root, needSelfArg=False):
    """Parse a function description in the following format:
    module.func(param1, param2, [optional_param1 = default1, [optional_param2 = default2]]) -> return_type
    This tries to be as error-tolerant as possible in order to convert everything into a valid parameter list.

    funcdef     -- the synopsis line to parse
    original    -- the text handed to guess_return_type_from_synopsis when the
                   synopsis itself yields no usable return type
    root        -- passed through to guess_return_type_from_synopsis
    needSelfArg -- if true, "self" is inserted as first parameter unless the
                   first parameter already contains "self" or "cls"

    Returns a tuple (comma-separated parameter list, return value expression)."""
    # first, take the parts before and after the arrow:
    assert isinstance(funcdef, str)
    # "<==>" is treated like the arrow
    funcdef = funcdef.replace("<==>", " -> ")
    s = funcdef.split(' -> ')
    definition = s[0]
    returnType = s[1] if len(s) > 1 else "None"
    # Sometimes, people do fancy stuff in the return type, like "... -> ndarray or None if arg is False"
    # Thus, we only use the first word... well.
    returnType = strict_sanitize(returnType.split(' ')[0])
    if returnType in likely_substitutions:
        returnType = likely_substitutions[returnType]
    if returnType != 'None':
        # turn the type name into a call expression
        returnType += "()"
    if returnType == 'None' or returnType == '_()':
        # nothing usable after the arrow; fall back to guessing from the text
        returnType = guess_return_type_from_synopsis(original, root)
    # Okay, now the fun part: parse the parameter list
    inParamList = False
    brackets = 0
    paramList = ""
    # Collect the characters between the first "(" and its matching ")";
    # the parentheses themselves (also nested ones) are not collected.
    for char in definition:
        if char == '(' and not inParamList:
            inParamList = True
        elif char == '(':
            brackets += 1
        if char == ')' and brackets > 0:
            brackets -= 1
        elif char == ')' and inParamList:
            break
        if inParamList and char not in '()':
            paramList += char
    paramList = paramList.split(',')
    resultingParamList = []
    # once set, every following parameter is given a default value
    atDefault = False
    for param in paramList:
        defaultValue = None
        # extract the name of the param
        param = param.replace(' ', '').replace('\t', '')
        # check for default values
        if removeAtCorner(param, '[', '>') != removeAtCorner(param, ' ', '>') or param.find('=') != -1:
            # default parameter list starts or continues with this parameter
            atDefault = True
        if atDefault:
            defaultValue = "None"
            if param.find('=') != -1:
                # default value was provided; clean trailing "[" and "]" chars
                defaultValue = removeAtCorner(removeAtCorner(param.split('=')[1], ']', '<'), '[', '<')
                param = param.split('=')[0]
            if len(str(defaultValue)) == 0 or str(defaultValue).isspace():
                # just write anything, otherwise it's syntactically invalid
                defaultValue = "None"
        if removeAtCorner(param, '[', '<') != removeAtCorner(param, ' ', '<'):
            # default parameter list starts or continues after this parameter
            atDefault = True
        param = strict_sanitize(param)
        if param == '':
            continue
        if defaultValue:
            resultingParamList.append("{0}={1}".format(param, sanitize(defaultValue)))
        else:
            resultingParamList.append(param)
    if needSelfArg:
        # we're in a class, make sure there's a "self"
        if len(resultingParamList) == 0 or ( [resultingParamList[0].find(x) for x in ["self", "cls"]] == [-1, -1] ):
            resultingParamList.insert(0, "self")
    return ', '.join(resultingParamList), returnType
0351
0352
class ModuleDumper:
    """Dumps all members of one module and keeps the shared output state.

    The member dumpers receive this object as their "root": they print
    through emit() and change the indentation with increaseIndent() and
    decreaseIndent().
    """
    def __init__(self, module, startIndent=0, special_hints=None):
        """module        -- the imported module object to dump
        startIndent   -- indentation depth (in spaces) to start with
        special_hints -- optional dict mapping a function name to a dict of
                         hints; a "returns" hint overrides the guessed return
                         value of that function"""
        self.module = module
        self.code = str()
        self.indentDepth = startIndent
        # A fresh dict per instance: a dict created in the signature would be
        # shared by every ModuleDumper constructed without hints.
        self.special_hints = {} if special_hints is None else special_hints

    def increaseIndent(self):
        """Indent subsequently emitted code by four more spaces."""
        self.indentDepth += 4

    def decreaseIndent(self):
        """Indent subsequently emitted code by four spaces less, never below zero."""
        self.indentDepth = max(0, self.indentDepth - 4)

    def emit(self, code):
        """Print *code* to stdout at the current indentation depth."""
        print(indent(code, self.indentDepth))

    def dump(self):
        """Dump every member of the module reported by inspect.getmembers()."""
        debugmsg("Processing module {0}".format(self.module.__name__))
        for member, value in inspect.getmembers(self.module):
            dumper = dumperForObject(value, member, self)
            dumper.dump()
0376
class ScalarDumper:
    """Emits an assignment for a member that is neither callable nor a class."""
    def __init__(self, name, value, root):
        self.root = root
        self.name = name
        self.value = value

    def dump(self):
        """Emit "<name> = <type name>()", or "<name> = None" for a None value.

        Members that are modules are not emitted at all."""
        if self.value is None:
            rendered = "None"
        else:
            rendered = type(self.value).__name__ + "()"
        if rendered == 'module()':
            # numpy fix
            return
        self.root.emit("{0} = {1}".format(self.name, rendered))
0389
def pick_better_return_value(v1, v2):
    """Choose between two guessed return value expressions.

    "None" is what the parsers return when they found nothing, so *v1* is
    only given up in favour of *v2* if *v1* is "None".
    """
    if v1 == "None":
        return v2
    return v1
0394
def pick_better_arglist(s1, s2):
    """Return the one of two comma-separated argument lists with more arguments.

    The non-empty items are counted rather than the commas, so that a list
    with a single argument wins over an empty one.  On a tie, *s2* is
    returned.
    """
    def argument_count(arglist):
        return sum(1 for argument in arglist.split(',') if argument.strip())

    if argument_count(s1) > argument_count(s2):
        return s1
    return s2
0400
# Default values that are written into the generated code as they are.
goodValues = [True, False, None]
# Types whose values are written into the generated code via str().
# This must be a real sequence: a map() object is an iterator on python 3
# and would be used up by the first membership tests.
goodTypes = [int, float]
0403
class FunctionDumper:
    """Emits a stub "def" for one function or other callable.

    The argument list is taken from inspect.getfullargspec() if the callable
    can be inspected and is guessed from the docstring otherwise.  The
    return expression is guessed from the docstring, unless the root's
    special_hints contain a "returns" entry for the function name.
    """
    def __init__(self, function, root):
        self.function = function
        self.root = root
        assert isinstance(self.root, ModuleDumper)

    def _inspected_arglist(self):
        """Return the argument list built from the callable's real signature.

        Raises TypeError if inspect cannot handle the callable."""
        arguments = inspect.getfullargspec(self.function)
        # "defaults" is None rather than an empty tuple if there are none
        defaults = arguments.defaults or ()
        # the defaults belong to the last len(defaults) positional arguments
        firstWithDefault = len(arguments.args) - len(defaults)
        arglist = list()
        for index, argument in enumerate(arguments.args):
            if index < firstWithDefault:
                # no default value -> normal argument
                arglist.append(argument)
                continue
            rawDefaultValue = defaults[index - firstWithDefault]
            if type(rawDefaultValue) == type(object):
                defaultValue = strict_sanitize(str(rawDefaultValue)) + "()"
            elif rawDefaultValue is None or type(rawDefaultValue) in (bool, int, float):
                # Written verbatim.  Identity and exact type are checked so
                # that the test is repeatable for every argument and never
                # invokes a custom __eq__ of the default value.
                defaultValue = str(rawDefaultValue)
            else:
                defaultValue = '"{0}"'.format(str(rawDefaultValue).replace("\n", " "))
            if len(defaultValue) == 0 or defaultValue.isspace():
                defaultValue = "None"
            arglist.append("{0}={1}".format(argument, defaultValue))
        if self.root.indentDepth > 0:
            # we're in a class, make sure there's a "self"
            if len(arglist) == 0 or ( arglist[0].find("self") == -1 and arglist[0].find("cls") == -1 ):
                arglist.insert(0, "self")
        return ', '.join(arglist)

    def dump(self):
        """Emit the stub: "def" line, the docstring and a "return" statement.

        Nothing is emitted for callables without a __name__, for names
        starting with "__", and for docstrings marked as "Not implemented
        (virtual attribute)"."""
        inClass = self.root.indentDepth > 0
        try:
            arglist = self._inspected_arglist()
        except TypeError:
            # not a python function, can't inspect it. try guessing argspec from docstring
            arglist = None

        # Results used for whichever docstring parser fails, so that one
        # failure no longer discards the result of the other parser.
        fallbackArglist = "self" if inClass else ""
        synArglist, returnValue = fallbackArglist, "None"
        synArglist2, returnValue2 = fallbackArglist, "None"
        docstring = str()
        try:
            docstring = self.function.__doc__.split('\n')[0] if self.function.__doc__ else str()
            try:
                synArglist, returnValue = parse_synopsis(docstring, str(self.function.__doc__), self.root,
                                                         inClass)
            except Exception as e:
                debugmsg(format(e))
            try:
                synArglist2, returnValue2 = parse_numpy_like_docstring(str(self.function.__doc__),
                                                                       self.function.__name__, self.root,
                                                                       inClass)
            except Exception as e:
                debugmsg(format(e))
            synArglist = pick_better_arglist(synArglist, synArglist2)
            returnValue = pick_better_return_value(returnValue, returnValue2)
        except Exception as e:
            debugmsg("  Warning: Function argument extraction failed: {0}".format(e))
            debugmsg("   * Traceback follows, but the error was ignored since it is not fatal.")
            traceback.print_exc(file=sys.stderr)
            synArglist = fallbackArglist
            returnValue = "None"
        if docstring.find("Not implemented (virtual attribute)") != -1:
            # numpy hack
            return
        if arglist is None:
            arglist = synArglist
        try:
            funcname = self.function.__name__
        except Exception:
            # callable without a usable name; there is nothing to emit
            return
        if funcname in self.root.special_hints:
            hints = self.root.special_hints[funcname]
            if "returns" in hints:
                returnValue = hints["returns"]
        # the slice keeps an empty name from raising IndexError
        if funcname[:1].isdigit():
            funcname = '_' + funcname
        if funcname.startswith('__'):
            return
        self.root.emit("def {0}({1}):".format(strict_sanitize(funcname), arglist))
        self.root.increaseIndent()
        self.root.emit('"""{0}"""'.format(str(self.function.__doc__).replace('"""', '___')))
        self.root.emit("return {0}".format(returnValue))
        self.root.decreaseIndent()
0482
class ClassDumper:
    """Emits a "class" statement followed by the dumped members of the class."""
    def __init__(self, klass, root):
        self.klass = klass
        self.root = root
        assert isinstance(self.root, ModuleDumper)

    def dump(self):
        """Emit the class header and, one level deeper, each of its members.

        Members whose type is exactly "type" (i.e. classes) are skipped."""
        className = self.klass.__name__
        debugmsg("Generating documentation for class {0}".format(className))
        self.root.emit("class {0}:".format(className))
        self.root.increaseIndent()
        for memberName, memberValue in inspect.getmembers(self.klass):
            if type(memberValue) != type:
                dumperForObject(memberValue, memberName, self.root).dump()
        self.root.decreaseIndent()
0499
# Maps the exact type of an object to the dumper class responsible for it.
dumpers = {
    types.FunctionType: FunctionDumper,
    types.BuiltinFunctionType: FunctionDumper,
    types.BuiltinMethodType: FunctionDumper,
    type: ClassDumper
}
try:
    dumpers[types.ClassType] = ClassDumper # python 2
except AttributeError:
    # the types module has no ClassType here; nothing to register
    pass
0510
def dumperForObject(object, memberName, root):
    """Return the dumper instance to use for one member.

    object     -- the member's value
    memberName -- the member's name; only needed for scalar members
    root       -- the ModuleDumper the new dumper emits through

    The exact type of the object is looked up in "dumpers".  Objects of an
    unregistered type are treated as functions if they have a __call__
    attribute and as scalars otherwise.
    """
    try:
        dumperClass = dumpers[type(object)]
    except (KeyError, TypeError):
        # unregistered (or unhashable) type: decide by callability below
        dumperClass = None
    if dumperClass is not None:
        # constructed outside the "try", so errors raised by the dumper's
        # constructor are not mistaken for a failed lookup
        return dumperClass(object, root)
    if hasattr(object, "__call__"):
        return FunctionDumper(object, root)
    return ScalarDumper(memberName, object, root)
0518
if __name__ == '__main__':
    # Command line: any number of module search paths, followed by the name
    # of the module to dump.
    if len(sys.argv) < 2:
        # sys.argv[-1] would be the script itself here and never raises
        # IndexError, so the missing argument is checked for explicitly.
        debugmsg("Usage: introspect.py <python_module_name>")
        sys.exit(1)
    # every argument between the script name and the module name is a search
    # path, including the one directly in front of the module name
    for searchPath in sys.argv[1:-1]:
        sys.path.insert(1, searchPath)
    dumper = ModuleDumper(importlib.import_module(sys.argv[-1]))
    dumper.dump()
    debugmsg("All done -- looks good so far.")