File indexing completed on 2024-05-12 16:28:27
0001 #! /usr/bin/python -Qwarnall 0002 0003 # This script profiles loading of documents in Calligra. 0004 # It is called like this: 0005 # profileOfficeFileLoading.py $dir $outputfile 0006 # The script outputs an csv file that contains the times various functions took 0007 # for each file in directory $dir. The directory $dir is scanned recursively. 0008 # 0009 # Copyright 2010 Jos van den Oever <jos@vandenoever.info> 0010 0011 import sys, os, tempfile, time, signal, subprocess, re, lxml.etree, zipfile 0012 0013 applications = { 0014 'calligrawords': ['odt', 'doc', 'docx'], 0015 'calligrastage': ['odp', 'ppt', 'pptx'], 0016 'calligrasheets': ['ods', 'xls', 'xlsx'] 0017 } 0018 extensions = { 0019 'odt': 'odt', 0020 'doc': 'odt', 0021 'docx': 'odt', 0022 'odp': 'odp', 0023 'ppt': 'odp', 0024 'pptx': 'odp', 0025 'ods': 'ods', 0026 'xls': 'ods', 0027 'xlsx': 'ods' 0028 } 0029 0030 # limit how many backtraces are recordes, since it takes a lot of time 0031 maxbacktraces = 50 0032 0033 def scanDirectory(rootdir, extensions): 0034 if os.path.isfile(rootdir): 0035 return [rootdir] 0036 0037 filext = map(lambda e: "." + e, extensions) 0038 list = [] 0039 for root, directories, files in os.walk(rootdir): 0040 for file in files: 0041 if file.startswith('.'): 0042 continue 0043 if any(map(lambda e: file.endswith(e), filext)): 0044 list.append(os.path.join(root, file)) 0045 return frozenset(list) 0046 0047 def getExtensions(list): 0048 extensions = [] 0049 for file in list: 0050 (path, ext) = os.path.splitext(file) 0051 extensions.append(ext) 0052 return frozenset(extensions) 0053 0054 class object: 0055 def __init__(self): 0056 pass 0057 0058 # functions for logging, in this case to TeamCity 0059 class logger: 0060 def __init__(self): 0061 self.suitename = None 0062 def escape(self, text): 0063 def escape(m): 0064 if m.group(0) == '\n': 0065 return '|n' 0066 if m.group(0) == '\r': 0067 return '|r' 0068 return '|'+m.group(0) 0069 return re.sub('[\'\n\r\|\]]', escape, text) 0070 def startTestSuite(self, name): 0071 self.suitename = name 0072 print "##teamcity[testSuiteStarted name='" + self.suitename \ 0073 + "']" 0074 sys.stdout.flush() 0075 def endTestSuite(self): 0076 if not self.suitename: return 0077 print "##teamcity[testSuiteFinished name='" + self.suitename \ 0078 + "']" 0079 sys.stdout.flush() 0080 self.suitename = None 0081 def startTest(self, name): 0082 if not self.suitename: return 0083 self.testname = name 0084 print "##teamcity[testStarted name='" + self.testname + "']" 0085 sys.stdout.flush() 0086 # fail the current test 0087 def failTest(self, backtrace): 0088 if not self.suitename or not self.testname: return 0089 bt = '' 0090 for l in backtrace: 0091 bt = bt + self.escape(l) 0092 print "##teamcity[testFailed name='" + self.testname \ 0093 + "' details='" + bt + "']" 0094 sys.stdout.flush() 0095 # end test, pass duration as integer representing the milliseconds 0096 def endTest(self, duration): 0097 if not self.suitename or not self.testname: return 0098 print "##teamcity[testFinished name='" + self.testname \ 0099 + "' duration='" + str(duration) + "']" 0100 sys.stdout.flush() 0101 self.testname = None 0102 0103 def containsRealError(err): 0104 # some errors reported by libxml2 are false positives, we filter them 0105 # out 0106 if str(err).find("ERROR:RELAXNGV:RELAXNG_ERR_CONTENTVALID: Element styles failed to validate content") != -1: 0107 return None 0108 if str(err).find("ERROR:RELAXNGV:RELAXNG_ERR_CONTENTVALID: Element automatic-styles failed to validate content") != -1: 0109 return None 0110 return err 0111 0112 class odfvalidator: 0113 def __init__(self): 0114 path = sys.path[0] 0115 self.relaxNGValidator = lxml.etree.RelaxNG( \ 0116 lxml.etree.parse(open(os.path.join(path, \ 0117 'OpenDocument-v1.2-cs01-schema-calligra.rng'), 0118 'r'))) 0119 self.relaxNGManifextValidator = lxml.etree.RelaxNG( \ 0120 lxml.etree.parse(open(os.path.join(path, \ 0121 'OpenDocument-v1.2-cs01-manifest-schema.rng'), \ 0122 'r'))) 0123 # returns error string on error, None otherwise 0124 def validate(self, odtpath): 0125 try: 0126 zip = zipfile.ZipFile(odtpath, 'r') 0127 except zipfile.BadZipfile as e: 0128 return str(e) 0129 err = self.validateFile(zip, 'content.xml', 0130 self.relaxNGValidator) 0131 if (containsRealError(err)): 0132 return err 0133 err = self.validateFile(zip, 'styles.xml', 0134 self.relaxNGValidator) 0135 if (containsRealError(err)): 0136 return err 0137 err = self.validateFile(zip, 'META-INF/manifest.xml', 0138 self.relaxNGManifextValidator) 0139 if (containsRealError(err)): 0140 return err 0141 return None 0142 0143 def validateFile(self, zip, file, validator): 0144 try: 0145 data = zip.read(file) 0146 xml = lxml.etree.XML(data); 0147 except lxml.etree.XMLSyntaxError as e: 0148 return e 0149 except KeyError as e: 0150 return e 0151 if len(data) > 1000000: 0152 # if the xml file is larger than 1M, the validator may hang 0153 return 0154 if not validator.validate(xml): 0155 return validator.error_log.last_error 0156 0157 def getExecutablePath(exe): 0158 exepath = None 0159 env = os.environ 0160 for p in env['PATH'].split(':'): 0161 exepath = os.path.join(p, exe) 0162 if os.path.exists(exepath): 0163 break 0164 return exepath 0165 0166 def runCommand(exepath, arguments, captureStdOut): 0167 env = os.environ 0168 stdout = None 0169 if captureStdOut: 0170 (fileno, tmpfilename) = tempfile.mkstemp() 0171 stdout = os.fdopen(fileno, 'w') 0172 0173 args = 'ulimit -v 1000000 -t 60 -c 0;' + exepath 0174 for s in arguments: 0175 args += ' "' + s + '"' 0176 0177 process = subprocess.Popen(['/bin/bash', '-c', args], env=env, 0178 close_fds=True, stdout=stdout, stderr=None) 0179 s = os.wait4(process.pid, os.WNOHANG) 0180 waited = 0 0181 waitstep = 0.1 0182 maxwaittime = 65 0183 while s[0] == 0 and s[1] == 0 and waited < maxwaittime: 0184 # wait a bit 0185 time.sleep(waitstep) 0186 waited += waitstep 0187 s = os.wait4(process.pid, os.WNOHANG) 0188 if waited >= maxwaittime: 0189 # if the process did not end nicely, kill it 0190 try: 0191 os.killpg(process.pid, signal.SIGKILL) 0192 except: 0193 pass 0194 try: 0195 s = os.wait4(process.pid, 0) 0196 except: 0197 pass 0198 r = object() 0199 r.utime = s[2].ru_utime 0200 r.stime = s[2].ru_stime 0201 r.returnValue = s[1] 0202 r.stdout = [] 0203 if captureStdOut: 0204 stdout.close() 0205 stdout = open(tmpfilename, 'r') 0206 r.stdout = stdout.readlines() 0207 stdout.close() 0208 os.remove(tmpfilename) 0209 return r 0210 0211 def profile(dir, file, logger, validator): 0212 global maxbacktraces 0213 logger.startTest(file) 0214 file = os.path.join(dir, file) 0215 (path, ext) = os.path.splitext(file) 0216 ext = ext[1:] 0217 exe = None 0218 for f in applications.keys(): 0219 if ext in applications[f]: 0220 exe = f 0221 exepath = getExecutablePath(exe) 0222 0223 # profile 0224 (fileno, tmpfilename) = tempfile.mkstemp() 0225 roundtripfilename = None 0226 args = [] 0227 # in case of ODF file, do a roundtrip 0228 m = re.match('.*(\.od.)$', file) 0229 (roundtripfd, roundtripfilename) = tempfile.mkstemp("." + extensions[ext]) 0230 if m: 0231 isOdfFile = True 0232 else: 0233 isOdfFile = False 0234 if isOdfFile: 0235 args += ["--roundtrip-filename", roundtripfilename] 0236 args += ["--benchmark-loading", "--profile-filename", tmpfilename, 0237 "--nocrashhandler", file] 0238 r = runCommand(exepath, args, False) 0239 outfile = os.fdopen(fileno, 'r') 0240 r.lines = outfile.readlines() 0241 outfile.close() 0242 os.close(roundtripfd) 0243 r.backtrace = None 0244 if r.returnValue != 0: 0245 if maxbacktraces > 0: 0246 maxbacktraces -= 1 0247 # generate a backtrace 0248 args = ["--batch", "--eval-command=run", 0249 "--eval-command=bt", "--args"] + [exepath] + args 0250 exepath = getExecutablePath("gdb") 0251 debugresult = runCommand(exepath, args, True) 0252 r.backtrace = debugresult.stdout 0253 for l in r.backtrace: 0254 print l.rstrip() 0255 sys.stdout.flush() 0256 logger.failTest(r.backtrace) 0257 else: 0258 logger.failTest("Crash, no backtrace: limit reached.") 0259 else: 0260 if not isOdfFile: 0261 # convert ms office file to odf 0262 exepath = getExecutablePath("calligraconverter") 0263 args = [file, roundtripfilename] 0264 cr = runCommand(exepath, args, False) 0265 0266 err = validator.validate(roundtripfilename); 0267 if err != None: 0268 logger.failTest(str(err)) 0269 0270 # remove the roundtripfile and the temporary file 0271 os.remove(tmpfilename) 0272 os.remove(roundtripfilename) 0273 0274 logger.endTest(int((r.utime + r.stime)*1000)) 0275 return r 0276 0277 def getGlobalMinMax(times): 0278 keys = times.min.keys() 0279 min = times.min[keys[0]] 0280 max = times.max[keys[0]] 0281 for k in keys: 0282 if times.min[k] <= min: 0283 min = times.min[k] 0284 if times.max[k] >= max: 0285 max = times.max[k] 0286 return (min, max) 0287 0288 def getGlobalTask(times): 0289 keys = times.min.keys() 0290 (min, max) = getGlobalMinMax(times) 0291 name = None 0292 for k in keys: 0293 if times.min[k] == min and times.max[k] == max: 0294 name = k 0295 return name 0296 0297 def getFurthestEnd(start, times, excludedkey): 0298 end = start 0299 for k in times.min.keys(): 0300 if times.min[k] <= start and times.max[k] > end and k != excludedkey: 0301 end = times.max[k] 0302 if end == start: 0303 end = None 0304 return end 0305 0306 def getClosestStart(start, times, excludedkey): 0307 s = 10000000 0308 for k in times.min.keys(): 0309 if times.min[k] > start and times.min[k] < s and k != excludedkey: 0310 s = times.min[k] 0311 return s 0312 0313 # if there is one task that starts before all other tasks and ends after all 0314 # other tasks, then count the time that there is no other task running as a sub- 0315 # task of this global task 0316 # if there is no global task, then count the time between the first and last 0317 # timepoint where there is no task running 0318 def getUnaccountedTime(lines, times): 0319 if len(times.min.keys()) == 0: 0320 return 0 0321 0322 globaltask = getGlobalTask(times) 0323 (min, max) = getGlobalMinMax(times) 0324 unaccounted = 0 0325 currentmax = min 0326 currentmin = min 0327 while currentmax != max: 0328 s = getFurthestEnd(currentmax, times, globaltask) 0329 if s == None: 0330 s = getClosestStart(currentmax, times, globaltask) 0331 if s > max: 0332 s = max 0333 unaccounted += s - currentmax 0334 currentmax = s 0335 return unaccounted 0336 0337 def summarize(lines): 0338 r = object() 0339 r.min = {} 0340 r.max = {} 0341 for l in lines: 0342 (fieldname, time, percentdone) = l.split('\t') 0343 time = int(time) 0344 if not fieldname in r.min or r.min[fieldname] > time: 0345 r.min[fieldname] = time 0346 if not fieldname in r.max or r.max[fieldname] < time: 0347 r.max[fieldname] = time 0348 return r 0349 0350 def profileAll(dir, loggername): 0351 exts = [] 0352 for v in applications.values(): 0353 for e in v: 0354 exts.append(e); 0355 officefiles = scanDirectory(dir, exts); 0356 usedExts = getExtensions(officefiles) 0357 results = {} 0358 log = logger() 0359 validator = odfvalidator() 0360 for ext in usedExts: 0361 if loggername: 0362 log.startTestSuite(loggername + '-' + ext[1:]) 0363 for f in officefiles: 0364 (path, pathext) = os.path.splitext(f) 0365 if pathext == ext: 0366 relf = os.path.relpath(f, dir) 0367 result = profile(dir, relf, log, validator) 0368 results[f] = result 0369 log.endTestSuite() 0370 return results 0371 0372 def addMapEntry(map, start, end): 0373 if start in map: 0374 a = map[start] 0375 if end in a: 0376 a[end] = a[end] + 1 0377 else: 0378 a[end] = 1 0379 else: 0380 a = {} 0381 a[end] = 1 0382 map[start] = a 0383 return map 0384 0385 def createStackTraceGraph(results): 0386 exepath = getExecutablePath('dot') 0387 if not exepath: 0388 return 0389 edges = {} 0390 nodes = {} 0391 n = 0 0392 for file in results.keys(): 0393 r = results[file] 0394 if not r.backtrace: 0395 continue 0396 name = os.path.basename(file) 0397 nodes[name] = file 0398 for l in r.backtrace: 0399 l = l.rstrip() 0400 n += 1 0401 m = re.search('/calligra/.*/([^/]+:\d+)$', l) 0402 if m != None: 0403 key = m.group(1) 0404 nodes[key] = l 0405 edges = addMapEntry(edges, key, name) 0406 name = key 0407 0408 (fileno, tmpfilename) = tempfile.mkstemp() 0409 out = os.fdopen(fileno, 'w') 0410 out.write('digraph {') 0411 svn = 'http://websvn.kde.org/trunk' 0412 for a in nodes: 0413 m = re.search('(/calligra/.*):(\d+)$', nodes[a]) 0414 n = '"' + a + '" [URL = "' 0415 if m: 0416 out.write(n + svn + m.group(1) + '?view=markup#l' 0417 + m.group(2) + '"];') 0418 else: 0419 m = re.search('(/calligratests/.*)', nodes[a]) 0420 if m: 0421 out.write(n + svn + '/tests' + m.group(1) 0422 + '"];') 0423 for a in edges.keys(): 0424 for b in edges[a].keys(): 0425 out.write('"' + a + '" -> "' + b + '" [penwidth=' 0426 + str(edges[a][b]) + '];') 0427 out.write('}') 0428 os.fdatasync(fileno) 0429 out.close() 0430 0431 args = ["-Tsvg", "-ostacktraces.svg", tmpfilename] 0432 r = runCommand(exepath, args, False) 0433 os.remove(tmpfilename) 0434 0435 if __name__ == '__main__': 0436 dir = sys.argv[1] 0437 output = sys.argv[2] 0438 loggername = None 0439 if len(sys.argv) > 3: 0440 loggername = sys.argv[3] 0441 results = profileAll(dir, loggername) 0442 0443 fields = {} 0444 for r in results.keys(): 0445 results[r].times = summarize(results[r].lines) 0446 results[r].unaccounted = getUnaccountedTime(results[r].lines, 0447 results[r].times) 0448 for f in results[r].times.min: 0449 if not f in fields: 0450 fields[f] = object() 0451 fields[f].count = 0 0452 fields[f].totalduration = 0 0453 fields[f].totaldurationsquared = 0 0454 fields[f].min = 10000000 0455 min = results[r].times.min[f] 0456 duration = results[r].times.max[f] - min 0457 fields[f].totalduration += duration 0458 fields[f].totaldurationsquared += duration*duration 0459 fields[f].count += 1 0460 if fields[f].min > min: 0461 fields[f].min = min 0462 fieldnames = fields.keys() 0463 def sort(keya, keyb): 0464 return cmp(fields[keya].min, fields[keyb].min) 0465 fieldnames.sort(sort) 0466 0467 # collect all fieldnames 0468 out = open(output, 'wb') 0469 # write header 0470 out.write('filename\text\tsize\tr\tutime\tstime\tunaccounted') 0471 for f in fieldnames: 0472 out.write('\t' + f) 0473 out.write('\n') 0474 # write average 0475 out.write('average\t\t\t\t\t\t') 0476 for f in fieldnames: 0477 f = fields[f] 0478 out.write('\t') 0479 if f.count > 0: 0480 out.write(str(f.totalduration/f.count)) 0481 out.write('\n') 0482 # write for each analyzed file 0483 for file in results.keys(): 0484 r = results[file] 0485 (path, ext) = os.path.splitext(file) 0486 stat = os.stat(file) 0487 out.write(file + '\t' + ext[1:] + '\t' + str(stat.st_size) + '\t' + str(r.returnValue)) 0488 out.write('\t' + str(r.utime) + '\t' + str(r.stime)) 0489 out.write('\t' + str(r.unaccounted)) 0490 for f in fieldnames: 0491 out.write('\t') 0492 if f in r.times.min: 0493 out.write(str(r.times.max[f]-r.times.min[f])) 0494 out.write('\n') 0495 out.close() 0496 0497 createStackTraceGraph(results)