devtools/scripts/profileOfficeFileLoading.py

0001 #! /usr/bin/python -Qwarnall
0002
0003 # This script profiles loading of documents in Calligra.
0004 # It is called like this:
0005 #  profileOfficeFileLoading.py $dir $outputfile
0006 # The script outputs an csv file that contains the times various functions took
0007 # for each file in directory $dir. The directory $dir is scanned recursively.
0008 #
0009 # Copyright 2010 Jos van den Oever <jos@vandenoever.info>
0010
0011 import sys, os, tempfile, time, signal, subprocess, re, lxml.etree, zipfile
0012
0013 applications = {
0014   'calligrawords': ['odt', 'doc', 'docx'],
0015   'calligrastage': ['odp', 'ppt', 'pptx'],
0016   'calligrasheets': ['ods', 'xls', 'xlsx']
0017 }
0018 extensions = {
0019   'odt': 'odt',
0020   'doc': 'odt',
0021   'docx': 'odt',
0022   'odp': 'odp',
0023   'ppt': 'odp',
0024   'pptx': 'odp',
0025   'ods': 'ods',
0026   'xls': 'ods',
0027   'xlsx': 'ods'
0028 }
0029
0030 # limit how many backtraces are recordes, since it takes a lot of time
0031 maxbacktraces = 50
0032
0033 def scanDirectory(rootdir, extensions):
0034     if os.path.isfile(rootdir):
0035         return [rootdir]
0036
0037     filext = map(lambda e: "." + e, extensions)
0038     list = []
0039     for root, directories, files in os.walk(rootdir):
0040         for file in files:
0041             if file.startswith('.'):
0042                 continue
0043             if any(map(lambda e: file.endswith(e), filext)):
0044                 list.append(os.path.join(root, file))
0045     return frozenset(list)
0046
0047 def getExtensions(list):
0048     extensions = []
0049     for file in list:
0050         (path, ext) = os.path.splitext(file)
0051         extensions.append(ext)
0052     return frozenset(extensions)
0053
0054 class object:
0055     def __init__(self):
0056         pass
0057
0058 # functions for logging, in this case to TeamCity
0059 class logger:
0060     def __init__(self):
0061         self.suitename = None
0062     def escape(self, text):
0063         def escape(m):
0064             if m.group(0) == '\n':
0065                 return '|n'
0066             if m.group(0) == '\r':
0067                 return '|r'
0068             return '|'+m.group(0)
0069         return re.sub('[\'\n\r\|\]]', escape, text)
0070     def startTestSuite(self, name):
0071         self.suitename = name
0072         print "##teamcity[testSuiteStarted name='" + self.suitename \
0073             + "']"
0074         sys.stdout.flush()
0075     def endTestSuite(self):
0076         if not self.suitename: return
0077         print "##teamcity[testSuiteFinished name='" + self.suitename \
0078             + "']"
0079         sys.stdout.flush()
0080         self.suitename = None
0081     def startTest(self, name):
0082         if not self.suitename: return
0083         self.testname = name
0084         print "##teamcity[testStarted name='" + self.testname + "']"
0085         sys.stdout.flush()
0086     # fail the current test
0087     def failTest(self, backtrace):
0088         if not self.suitename or not self.testname: return
0089         bt = ''
0090         for l in backtrace:
0091             bt = bt + self.escape(l)
0092         print "##teamcity[testFailed name='" + self.testname \
0093             + "' details='" + bt + "']"
0094         sys.stdout.flush()
0095     # end test, pass duration as integer representing the milliseconds
0096     def endTest(self, duration):
0097         if not self.suitename or not self.testname: return
0098         print "##teamcity[testFinished name='" + self.testname \
0099             + "' duration='" + str(duration) + "']"
0100         sys.stdout.flush()
0101         self.testname = None
0102
0103 def containsRealError(err):
0104     # some errors reported by libxml2 are false positives, we filter them
0105     # out
0106     if str(err).find("ERROR:RELAXNGV:RELAXNG_ERR_CONTENTVALID: Element styles failed to validate content") != -1:
0107         return None
0108     if str(err).find("ERROR:RELAXNGV:RELAXNG_ERR_CONTENTVALID: Element automatic-styles failed to validate content") != -1:
0109         return None
0110     return err
0111
0112 class odfvalidator:
0113     def __init__(self):
0114         path = sys.path[0]
0115         self.relaxNGValidator = lxml.etree.RelaxNG( \
0116                 lxml.etree.parse(open(os.path.join(path, \
0117                 'OpenDocument-v1.2-cs01-schema-calligra.rng'),
0118                 'r')))
0119         self.relaxNGManifextValidator = lxml.etree.RelaxNG( \
0120                 lxml.etree.parse(open(os.path.join(path, \
0121                 'OpenDocument-v1.2-cs01-manifest-schema.rng'), \
0122                 'r')))
0123     # returns error string on error, None otherwise
0124     def validate(self, odtpath):
0125         try:
0126             zip = zipfile.ZipFile(odtpath, 'r')
0127         except zipfile.BadZipfile as e:
0128             return str(e)
0129         err = self.validateFile(zip, 'content.xml',
0130                 self.relaxNGValidator)
0131         if (containsRealError(err)):
0132             return err
0133         err = self.validateFile(zip, 'styles.xml',
0134                 self.relaxNGValidator)
0135         if (containsRealError(err)):
0136             return err
0137         err = self.validateFile(zip, 'META-INF/manifest.xml',
0138                 self.relaxNGManifextValidator)
0139         if (containsRealError(err)):
0140             return err
0141         return None
0142
0143     def validateFile(self, zip, file, validator):
0144         try:
0145             data = zip.read(file)
0146             xml = lxml.etree.XML(data);
0147         except lxml.etree.XMLSyntaxError as e:
0148             return e
0149         except KeyError as e:
0150             return e
0151         if len(data) > 1000000:
0152             # if the xml file is larger than 1M, the validator may hang
0153             return
0154         if not validator.validate(xml):
0155             return validator.error_log.last_error
0156
0157 def getExecutablePath(exe):
0158     exepath = None
0159     env = os.environ
0160     for p in env['PATH'].split(':'):
0161         exepath = os.path.join(p, exe)
0162         if os.path.exists(exepath):
0163             break
0164     return exepath
0165
0166 def runCommand(exepath, arguments, captureStdOut):
0167     env = os.environ
0168     stdout = None
0169     if captureStdOut:
0170         (fileno, tmpfilename) = tempfile.mkstemp()
0171         stdout = os.fdopen(fileno, 'w')
0172
0173     args = 'ulimit -v 1000000 -t 60 -c 0;' + exepath
0174     for s in arguments:
0175         args += ' "' + s + '"'
0176
0177     process = subprocess.Popen(['/bin/bash', '-c', args], env=env,
0178         close_fds=True, stdout=stdout, stderr=None)
0179     s = os.wait4(process.pid, os.WNOHANG)
0180     waited = 0
0181     waitstep = 0.1
0182     maxwaittime = 65
0183     while  s[0] == 0 and s[1] == 0 and waited < maxwaittime:
0184         # wait a bit
0185         time.sleep(waitstep)
0186         waited += waitstep
0187         s = os.wait4(process.pid, os.WNOHANG)
0188     if waited >= maxwaittime:
0189         # if the process did not end nicely, kill it
0190         try:
0191             os.killpg(process.pid, signal.SIGKILL)
0192         except:
0193             pass
0194         try:
0195             s = os.wait4(process.pid, 0)
0196         except:
0197             pass
0198     r = object()
0199     r.utime = s[2].ru_utime
0200     r.stime = s[2].ru_stime
0201     r.returnValue = s[1]
0202     r.stdout = []
0203     if captureStdOut:
0204         stdout.close()
0205         stdout = open(tmpfilename, 'r')
0206         r.stdout = stdout.readlines()
0207         stdout.close()
0208         os.remove(tmpfilename)
0209     return r
0210
0211 def profile(dir, file, logger, validator):
0212     global maxbacktraces
0213     logger.startTest(file)
0214     file = os.path.join(dir, file)
0215     (path, ext) = os.path.splitext(file)
0216     ext = ext[1:]
0217     exe = None
0218     for f in applications.keys():
0219         if ext in applications[f]:
0220             exe = f
0221     exepath = getExecutablePath(exe)
0222
0223     # profile
0224     (fileno, tmpfilename) = tempfile.mkstemp()
0225     roundtripfilename = None
0226     args = []
0227     # in case of ODF file, do a roundtrip
0228     m = re.match('.*(\.od.)$', file)
0229     (roundtripfd, roundtripfilename) = tempfile.mkstemp("." + extensions[ext])
0230     if m:
0231         isOdfFile = True
0232     else:
0233         isOdfFile = False
0234     if isOdfFile:
0235         args += ["--roundtrip-filename", roundtripfilename]
0236     args += ["--benchmark-loading", "--profile-filename", tmpfilename,
0237         "--nocrashhandler", file]
0238     r = runCommand(exepath, args, False)
0239     outfile = os.fdopen(fileno, 'r')
0240     r.lines = outfile.readlines()
0241     outfile.close()
0242     os.close(roundtripfd)
0243     r.backtrace = None
0244     if r.returnValue != 0:
0245         if maxbacktraces > 0:
0246                     maxbacktraces -= 1
0247             # generate a backtrace
0248             args = ["--batch", "--eval-command=run",
0249                 "--eval-command=bt", "--args"] + [exepath] + args
0250             exepath = getExecutablePath("gdb")
0251             debugresult = runCommand(exepath, args, True)
0252             r.backtrace = debugresult.stdout
0253             for l in r.backtrace:
0254                 print l.rstrip()
0255                 sys.stdout.flush()
0256             logger.failTest(r.backtrace)
0257         else:
0258             logger.failTest("Crash, no backtrace: limit reached.")
0259     else:
0260         if not isOdfFile:
0261             # convert ms office file to odf
0262             exepath = getExecutablePath("calligraconverter")
0263             args = [file, roundtripfilename]
0264             cr = runCommand(exepath, args, False)
0265
0266         err = validator.validate(roundtripfilename);
0267         if err != None:
0268             logger.failTest(str(err))
0269
0270     # remove the roundtripfile and the temporary file
0271     os.remove(tmpfilename)
0272     os.remove(roundtripfilename)
0273
0274     logger.endTest(int((r.utime + r.stime)*1000))
0275     return r
0276
0277 def getGlobalMinMax(times):
0278     keys = times.min.keys()
0279     min = times.min[keys[0]]
0280     max = times.max[keys[0]]
0281     for k in keys:
0282         if times.min[k] <= min:
0283             min = times.min[k]
0284         if times.max[k] >= max:
0285             max = times.max[k]
0286     return (min, max)
0287
0288 def getGlobalTask(times):
0289     keys = times.min.keys()
0290     (min, max) = getGlobalMinMax(times)
0291     name = None
0292     for k in keys:
0293         if times.min[k] == min and times.max[k] == max:
0294             name = k
0295     return name
0296
0297 def getFurthestEnd(start, times, excludedkey):
0298     end = start
0299     for k in times.min.keys():
0300         if times.min[k] <= start and times.max[k] > end and k != excludedkey:
0301             end = times.max[k]
0302     if end == start:
0303         end = None
0304     return end
0305
0306 def getClosestStart(start, times, excludedkey):
0307     s = 10000000
0308     for k in times.min.keys():
0309         if times.min[k] > start and times.min[k] < s and k != excludedkey:
0310             s = times.min[k]
0311     return s
0312
0313 # if there is one task that starts before all other tasks and ends after all
0314 # other tasks, then count the time that there is no other task running as a sub-
0315 # task of this global task
0316 # if there is no global task, then count the time between the first and last
0317 # timepoint where there is no task running
0318 def getUnaccountedTime(lines, times):
0319     if len(times.min.keys()) == 0:
0320         return 0
0321
0322     globaltask = getGlobalTask(times)
0323     (min, max) = getGlobalMinMax(times)
0324     unaccounted = 0
0325     currentmax = min
0326     currentmin = min
0327     while currentmax != max:
0328         s = getFurthestEnd(currentmax, times, globaltask)
0329         if s == None:
0330             s = getClosestStart(currentmax, times, globaltask)
0331             if s > max:
0332                 s = max
0333             unaccounted += s - currentmax
0334         currentmax = s
0335     return unaccounted
0336
0337 def summarize(lines):
0338     r = object()
0339     r.min = {}
0340     r.max = {}
0341     for l in lines:
0342         (fieldname, time, percentdone) = l.split('\t')
0343         time = int(time)
0344         if not fieldname in r.min or r.min[fieldname] > time:
0345             r.min[fieldname] = time
0346         if not fieldname in r.max or r.max[fieldname] < time:
0347             r.max[fieldname] = time
0348     return r
0349
0350 def profileAll(dir, loggername):
0351     exts = []
0352     for v in applications.values():
0353         for e in v:
0354             exts.append(e);
0355     officefiles = scanDirectory(dir, exts);
0356     usedExts = getExtensions(officefiles)
0357     results = {}
0358     log = logger()
0359     validator = odfvalidator()
0360     for ext in usedExts:
0361         if loggername:
0362             log.startTestSuite(loggername + '-' + ext[1:])
0363         for f in officefiles:
0364             (path, pathext) = os.path.splitext(f)
0365             if pathext == ext:
0366                 relf = os.path.relpath(f, dir)
0367                 result = profile(dir, relf, log, validator)
0368                 results[f] = result
0369         log.endTestSuite()
0370     return results
0371
0372 def addMapEntry(map, start, end):
0373     if start in map:
0374         a = map[start]
0375         if end in a:
0376             a[end] = a[end] + 1
0377         else:
0378             a[end] = 1
0379     else:
0380         a = {}
0381         a[end] = 1
0382     map[start] = a
0383     return map
0384
0385 def createStackTraceGraph(results):
0386     exepath = getExecutablePath('dot')
0387     if not exepath:
0388         return
0389     edges = {}
0390     nodes = {}
0391     n = 0
0392     for file in results.keys():
0393         r = results[file]
0394         if not r.backtrace:
0395             continue
0396         name = os.path.basename(file)
0397         nodes[name] = file
0398         for l in r.backtrace:
0399             l = l.rstrip()
0400             n += 1
0401             m = re.search('/calligra/.*/([^/]+:\d+)$', l)
0402             if m != None:
0403                 key = m.group(1)
0404                 nodes[key] = l
0405                 edges = addMapEntry(edges, key, name)
0406                 name = key
0407
0408     (fileno, tmpfilename) = tempfile.mkstemp()
0409     out = os.fdopen(fileno, 'w')
0410     out.write('digraph {')
0411     svn = 'http://websvn.kde.org/trunk'
0412     for a in nodes:
0413         m = re.search('(/calligra/.*):(\d+)$', nodes[a])
0414         n = '"' + a + '" [URL = "'
0415         if m:
0416             out.write(n + svn + m.group(1) + '?view=markup#l'
0417                 + m.group(2) + '"];')
0418         else:
0419             m = re.search('(/calligratests/.*)', nodes[a])
0420             if m:
0421                 out.write(n + svn + '/tests' + m.group(1)
0422                     + '"];')
0423     for a in edges.keys():
0424         for b in edges[a].keys():
0425             out.write('"' + a + '" -> "' + b + '" [penwidth='
0426                 + str(edges[a][b]) + '];')
0427     out.write('}')
0428     os.fdatasync(fileno)
0429     out.close()
0430
0431     args = ["-Tsvg", "-ostacktraces.svg", tmpfilename]
0432     r = runCommand(exepath, args, False)
0433     os.remove(tmpfilename)
0434
0435 if __name__ == '__main__':
0436     dir = sys.argv[1]
0437     output = sys.argv[2]
0438     loggername = None
0439     if len(sys.argv) > 3:
0440         loggername = sys.argv[3]
0441     results = profileAll(dir, loggername)
0442
0443     fields = {}
0444     for r in results.keys():
0445         results[r].times = summarize(results[r].lines)
0446         results[r].unaccounted = getUnaccountedTime(results[r].lines,
0447             results[r].times)
0448         for f in results[r].times.min:
0449             if not f in fields:
0450                 fields[f] = object()
0451                 fields[f].count = 0
0452                 fields[f].totalduration = 0
0453                 fields[f].totaldurationsquared = 0
0454                 fields[f].min = 10000000
0455             min = results[r].times.min[f]
0456             duration = results[r].times.max[f] - min
0457             fields[f].totalduration += duration
0458             fields[f].totaldurationsquared += duration*duration
0459             fields[f].count += 1
0460             if fields[f].min > min:
0461                 fields[f].min = min
0462     fieldnames = fields.keys()
0463     def sort(keya, keyb):
0464         return cmp(fields[keya].min, fields[keyb].min)
0465     fieldnames.sort(sort)
0466
0467     # collect all fieldnames
0468     out = open(output, 'wb')
0469     # write header
0470     out.write('filename\text\tsize\tr\tutime\tstime\tunaccounted')
0471     for f in fieldnames:
0472         out.write('\t' + f)
0473     out.write('\n')
0474     # write average
0475     out.write('average\t\t\t\t\t\t')
0476     for f in fieldnames:
0477         f = fields[f]
0478         out.write('\t')
0479         if f.count > 0:
0480             out.write(str(f.totalduration/f.count))
0481     out.write('\n')
0482     # write for each analyzed file
0483     for file in results.keys():
0484         r = results[file]
0485         (path, ext) = os.path.splitext(file)
0486         stat = os.stat(file)
0487         out.write(file + '\t' + ext[1:] + '\t' + str(stat.st_size) + '\t' + str(r.returnValue))
0488         out.write('\t' + str(r.utime) + '\t' + str(r.stime))
0489         out.write('\t' + str(r.unaccounted))
0490         for f in fieldnames:
0491             out.write('\t')
0492             if f in r.times.min:
0493                 out.write(str(r.times.max[f]-r.times.min[f]))
0494         out.write('\n')
0495     out.close()
0496
0497     createStackTraceGraph(results)