File indexing completed on 2024-05-12 16:28:27
#! /usr/bin/python -Qwarnall
#
# -*- coding: latin-1 -*-
#
# This script is run by ./validateODF.py
# Author: unknown
# Maintainer: Hanna Skott
#
# This file contains a script that converts documents to ODF format using calligraconverter
# it also validates the content of the resulting ODF file against RelaxNG XML using jing.
#
# EXAMPLE of use: $ ./convertAndValidateODF.py validate_or_not file_directory filename
# validate_or_not values are: yes, no
# NOTE! Jing jar has to be in the same directory as this script for this to work, and it must be named jing.jar
#
# lxml.etree is imported inside the validation helpers that need it, so a
# conversion-only run (validate_or_not == "no") does not require lxml.

import sys
import os
import os.path
import tempfile
import subprocess
import zipfile
import urllib
import hashlib
import shlex
import shutil
import re
import getopt


def convertAndValidateFilesInDir(dir):
    """Convert and validate every file found below the directory `dir`.

    Walks `dir` recursively and hands each file's full path to
    singleFileConvertAndValidate() with validation switched on.  The
    per-file status codes are not collected; nothing is returned.
    """
    print("dir is "+dir)
    for root, dirs, files in os.walk(dir):
        for name in files:
            # os.walk yields bare file names, so the containing directory
            # has to be joined back on before the file can be opened.
            singleFileConvertAndValidate(os.path.join(root, name), True)


def _removeFile(path):
    """Delete `path`, ignoring the error if it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        pass


def _pickValidator():
    """Return a jing based validator if jing.jar is present, else the lxml based one."""
    jingjar = newGetJing()
    if jingjar:
        return jingodfvalidator(jingjar)
    return odfvalidator()


def singleFileConvertAndValidate(filename, validate):
    """Convert one file to ODF and optionally validate the result.

    filename -- path of the document to convert
    validate -- when true, the converted file is validated against RelaxNG

    The converted file is written next to the source as
    "<filename>.<odt|odp|ods>" and is always removed again before returning.

    Returns 0 on success.  Returns 1 when the extension is not one of the
    convertible ones, when the converter could not be started, when no
    converted file was produced, or when validation reported an error.
    """
    filepath = filename
    src_extension = getExtByMime(filepath)
    dst_extension = getConvertExtension(src_extension)
    if dst_extension is None:
        print("file "+filepath+" is not of a file format that can be converted to ODF")
        return 1

    # Create filename for where conversion is stored
    convertedfile = filepath + "." + dst_extension

    if "." + dst_extension == src_extension:
        # Source is already in the target format: round-trip it through
        # the matching application instead of calligraconverter.
        applicationname = getApplicationName(dst_extension)
        args = [applicationname, "--roundtrip-filename", convertedfile, filepath]
    else:
        args = ["calligraconverter", "--batch", filepath, convertedfile]

    print(args)
    try:
        with open(os.devnull, 'w') as fnull:
            subprocess.call(args, stdout=fnull, stderr=fnull)
    except OSError as e:
        # raised when the converter executable cannot be started
        print("could not run "+args[0]+": "+str(e))
        return 1

    if not os.path.exists(convertedfile):
        print("converted file "+convertedfile+" does not EXIST - Conversion failed!")
        return 1

    try:
        if not validate:
            return 0
        # validate() returns an error string on failure and None on success
        err = _pickValidator().validate(convertedfile)
        if err:
            print("file "+convertedfile+" did not validate against RelaxNG - validation failed!")
            print(str(err))
            return 1
        return 0
    finally:
        _removeFile(convertedfile)


#def getConversionResultingMime(file):
#    file_extension = file.split(".")[-1]
#    if file_extension:
#        if file_extension == ".doc" or file_extension == ".docx" or file_extension == ".txt":
#            textMime = "odt"
#            return textMime
#        if file_extension == ".ppt" or file_extension == ".pptx":
#            pptMime = "odp"
#            return pptMime
#        if file_extension == ".csv" or file_extension == ".xls" or file_extension == ".xlsx":
#            spreadsheetMime = "ods"
#            return spreadsheetMime
#    else:
#        return None


def getExtByMime(filename):
    """Return the extension of `filename` including the leading dot.

    Despite the name no MIME detection is done; this is os.path.splitext.
    The result is "" when the name has no extension.
    """
    (path, pathext) = os.path.splitext(filename)
    return pathext


# Source extension (with leading dot) -> ODF extension (without dot).
_CONVERT_EXTENSIONS = {
    ".odt": "odt", ".docx": "odt", ".doc": "odt", ".txt": "odt",
    ".odp": "odp", ".ppt": "odp", ".pptx": "odp",
    ".ods": "ods", ".xls": "ods", ".xlsx": "ods",
}

# ODF extension (without dot) -> application used for round-tripping.
_APPLICATION_NAMES = {
    "odt": "calligrawords",
    "odp": "calligrastage",
    "ods": "calligrasheets",
}


def getConvertExtension(extension):
    """Return "odt", "odp" or "ods" for a known source extension, else None."""
    return _CONVERT_EXTENSIONS.get(extension)


def getApplicationName(extension):
    """Return the application name for an ODF extension, else None."""
    return _APPLICATION_NAMES.get(extension)


def getODFVersion(zip):
    """Return the office:version attribute of content.xml in the open archive `zip`.

    Returns None when the root element has no such attribute.
    """
    import lxml.etree
    stream = zip.open("content.xml", "r")
    try:
        content = lxml.etree.parse(stream)
    finally:
        stream.close()
    return content.getroot().get(
        "{urn:oasis:names:tc:opendocument:xmlns:office:1.0}version")


def newGetJing():
    """Return the path of jing.jar in the script directory, or None if it is not there."""
    currentdir = os.curdir
    jingjar = os.path.join(currentdir, "jing.jar")
    path = os.path.join(sys.path[0], jingjar)
    if os.path.isfile(path):
        return path
    return None


def getJing():
    """Return the path of jing-20091111/bin/jing.jar, downloading it if missing.

    The downloaded jar is checked against a fixed SHA-1 hash; on mismatch
    the jar is deleted and None is returned.
    """
    jingjar = "jing-20091111/bin/jing.jar"
    # jingjar = "jing-20081028/bin/jing.jar"
    path = os.path.join(sys.path[0], jingjar)
    if os.path.isfile(path):
        return path
    print("Downloading jing.jar")
    z = "jing-20091111.zip"

    # VALID 2011-03-29
    # NOTE(review): this URL addresses a download *detail* page rather than
    # the archive itself - confirm it still serves the zip.
    urllib.urlretrieve("http://code.google.com/p/jing-trang/downloads/detail?name=jing-20091111.zip", z)

    # urllib.urlretrieve("http://jing-trang.googlecode.com/files/" + z, z);
    try:
        archive = zipfile.ZipFile(z, "r")
        try:
            archive.extract(jingjar, sys.path[0])
        finally:
            archive.close()
    finally:
        os.unlink(z)

    with open(path, "rb") as f:
        digest = hashlib.sha1(f.read()).hexdigest()

    # hash for jing 2011-03-29
    if digest != "2e8eacf399249d226ad4f6ca1d6907ff69430118":
        # if hash != "60197956be7f8f2e29e1941ca42273abe7315293":
        print("Wrong hash code: wrong file.")
        os.unlink(path)
        return None
    return path


# ODF version -> [document schema, manifest schema]; file names are
# resolved relative to the script directory (sys.path[0]).
schemas = {
    "1.0": ["OpenDocument-schema-v1.0-os.rng",
            "OpenDocument-manifest-schema-v1.0-os.rng"],
    "1.1": ["OpenDocument-schema-v1.1.rng",
            "OpenDocument-manifest-schema-v1.1.rng"],
    "1.2": ["OpenDocument-v1.2-cs01-schema-calligra.rng",
            "OpenDocument-v1.2-cs01-manifest-schema.rng"]
}

# Archive members that get validated, paired with the index into a
# schemas[...] entry of the schema each one is checked against.
_ODF_PARTS = (
    ("content.xml", 0),
    ("styles.xml", 0),
    ("META-INF/manifest.xml", 1),
)


class jingodfvalidator:
    """Validates ODF files by running jing (java -jar jing.jar) on their XML parts."""

    def __init__(self, jingjar):
        self.jingjar = jingjar

    def validate(self, odfpath):
        """Return an error string for the first invalid part of `odfpath`, None if all pass."""
        archive = zipfile.ZipFile(odfpath, 'r')
        try:
            odfversion = getODFVersion(archive)
            if odfversion not in schemas:
                return "Document has no version number"
            for member, index in _ODF_PARTS:
                err = self.validateFile(archive, member,
                                        schemas[odfversion][index])
                if err:
                    return err
            return None
        finally:
            archive.close()

    def validateFile(self, zip, filepath, schema):
        """Validate member `filepath` of the open archive `zip` against `schema`.

        The member is copied to a temporary file because jing is given a
        file name on its command line.  Returns an error string when the
        member is missing or jing exits non-zero, None otherwise.
        """
        schema = os.path.join(sys.path[0], schema)
        try:
            data = zip.read(filepath)
        except KeyError as e:
            # member not present in the archive
            return str(e)
        suffix = "_" + filepath.replace("/", "_")
        tmp = tempfile.NamedTemporaryFile(suffix=suffix)
        try:
            tmp.write(data)
            tmp.flush()
            args = ["java", "-jar", self.jingjar, "-i", schema, tmp.name]
            r = subprocess.call(args)
        finally:
            tmp.close()
        if r:
            return filepath + " is not valid."
        return None


def createValidator(name):
    """Load the RelaxNG schema `name` from the script directory as an lxml validator."""
    import lxml.etree
    with open(os.path.join(sys.path[0], name), "rb") as f:
        xml = lxml.etree.parse(f)
    return lxml.etree.RelaxNG(xml)


class odfvalidator:
    """Validates ODF files in-process with lxml's RelaxNG support."""

    def __init__(self):
        # one [document validator, manifest validator] pair per ODF version
        self.validators = {}
        for key in schemas:
            self.validators[key] = [
                createValidator(schemas[key][0]),
                createValidator(schemas[key][1])
            ]

    # returns error string on error, None otherwise
    def validate(self, odfpath):
        """Return an error string for the first invalid part of `odfpath`, None if all pass."""
        archive = zipfile.ZipFile(odfpath, 'r')
        try:
            odfversion = getODFVersion(archive)
            if odfversion not in schemas:
                return "Document has no version number"
            for member, index in _ODF_PARTS:
                err = self.validateFile(archive, member,
                                        self.validators[odfversion][index])
                if err:
                    return err
            return None
        finally:
            archive.close()

    def validateFile(self, zip, file, validator):
        """Validate member `file` of the open archive `zip` with `validator`.

        Returns an error string when the member is missing, is not well
        formed XML, or fails validation; None otherwise.
        """
        import lxml.etree
        try:
            xml = lxml.etree.XML(zip.read(file))
        except lxml.etree.XMLSyntaxError as e:
            return file + ':' + str(e)
        except KeyError as e:
            return str(e)
        if not validator.validate(xml):
            return file + ':' + str(validator.error_log.last_error)
        return None


def main(argv):
    """Command line entry point; returns the process exit status.

    argv -- [script, validate_or_not, file_directory, filename]

    Returns 1 on wrong usage, 0 when the file to convert does not exist,
    otherwise the status of singleFileConvertAndValidate().
    """
    if len(argv) < 4 or not (argv[1] and argv[2] and argv[3]):
        print("Wrong use of script: parameters missing:")
        return 1

    validate_or_not, directory, name = argv[1:4]
    if not os.path.exists(directory):
        print("Wrong use of script: missing path to directory in which file resides")
        return 1

    #NOTE! sys.argv[1] is hardcoded to "no" in CMakeList.txt under /calligra/tools/scripts/
    if validate_or_not not in ("yes", "no"):
        print("Wrong use of script: validation neither yes or no")
        return 1

    filepath = os.path.abspath(os.path.join(directory, name))
    if validate_or_not == "no":
        print(filepath)
    if not os.path.exists(filepath):
        # NOTE(review): a missing input file has always ended with status 0;
        # kept as is - confirm the caller relies on it.
        return 0
    return singleFileConvertAndValidate(filepath, validate_or_not == "yes")


#
if __name__ == '__main__':
    sys.exit(main(sys.argv))