File indexing completed on 2024-05-12 16:28:27

0001 #! /usr/bin/python -Qwarnall
0002 #
0003 # -*- coding: latin-1 -*-
0004 #
0005 # This script is run by ./validateODF.py
0006 # Author: unknown
0007 # Maintainer: Hanna Skott
0008 #
0009 # This file contains a script that converts documents to ODF format using calligraconverter
0010 # it also validates the content of the resulting ODF file against RelaxNG XML using jing.
0011 #
0012 # EXAMPLE of use: $ ./convertAndValidateODF.py validate_or_not file_directory filename
0013 # validate_or_not values are: yes, no
0014 # NOTE! Jing jar has to be in the same directory as this script for this to work, and it must be named jing.jar
0015 #  
0016 
0017 import sys, os, os.path, tempfile, subprocess, lxml.etree, zipfile, urllib, hashlib, shlex, shutil, re, getopt
0018 
# This function walks a directory tree and runs singleFileConvertAndValidate on every file found in it.
def convertAndValidateFilesInDir(dir, validate=True):
    """Convert (and optionally validate) every file below a directory.

    Walks ``dir`` recursively with os.walk and hands each regular file to
    singleFileConvertAndValidate.

    dir      -- root directory of the tree to process.
    validate -- forwarded to singleFileConvertAndValidate; defaults to True,
                which matches what the previous code effectively did (it
                passed the non-empty directory name as the flag).

    Returns None; per-file results are only reported on stdout.
    """
    print("dir is " + dir)
    for root, dirs, files in os.walk(dir):
        for name in files:
            # os.walk yields bare file names, so the containing directory
            # must be joined back on before the file can be opened.
            singleFileConvertAndValidate(os.path.join(root, name), validate)
0026 
0027 #This function converts a file using calligraconverter and then tries to validate the resulting ODF file against RelaxNG using jing
def singleFileConvertAndValidate(filename, validate):
    """Convert one document to ODF and optionally validate the result.

    filename -- path of the source document.
    validate -- when true, the converted file is validated against the
                RelaxNG schemas (with jing if jing.jar is found, otherwise
                with the lxml based odfvalidator).

    The converted file is written next to the source as
    ``<filename>.<odf extension>`` and is always removed again before
    returning.

    Returns 0 on success and 1 when the format is unsupported, the
    conversion produced no output, or validation failed.
    """
    filepath = filename

    src_extension = getExtByMime(filepath)
    dst_extension = None
    if src_extension is not None:
        dst_extension = getConvertExtension(src_extension)
    if dst_extension is None:
        # getConvertExtension yields None for every extension it does not
        # know; bail out instead of failing on '"." + None' below.
        print("file "+filepath+" is not of a file format that can be converted to ODF")
        return 1

    # Name of the file the conversion result is stored in
    convertedfile = filepath + "." + dst_extension

    if "." + dst_extension == src_extension:
        # Source is already ODF: round-trip it through the matching
        # Calligra application instead of calligraconverter.
        applicationname = getApplicationName(dst_extension)
        args = [applicationname, "--roundtrip-filename", convertedfile, filepath]
    else:
        args = ["calligraconverter", "--batch", filepath, convertedfile]

    print(args)
    fnull = open(os.devnull, 'w')
    try:
        # The exit status is ignored on purpose; success is judged by
        # whether the output file exists afterwards.
        subprocess.call(args, stdout=fnull, stderr=fnull)
    finally:
        fnull.close()

    if not os.path.exists(convertedfile):
        print("converted file "+convertedfile+" does not EXIST - Conversion failed!")
        return 1

    try:
        if not validate:
            return 0

        # Prefer jing; fall back to the lxml validator when jing.jar is
        # not available (same selection the script entry point uses).
        jingjar = newGetJing()
        if jingjar:
            validator = jingodfvalidator(jingjar)
        else:
            validator = odfvalidator()

        # validate() returns an error string on failure and None on success.
        error = validator.validate(convertedfile)
        if error:
            print("file "+convertedfile+" did not validate against RelaxNG - validation failed!")
            print(error)
            return 1
        return 0
    finally:
        # Equivalent of "rm -f": remove the temporary conversion result and
        # ignore the case where it is already gone.
        try:
            os.remove(convertedfile)
        except OSError:
            pass
0080 
0081 
0082     
0083 #def getConversionResultingMime(file):
0084 #        file_extension = file.split(".")[-1]
0085 #        if file_extension:
0086 #                if file_extension == ".doc" or file_extension == ".docx" or file_extension == ".txt":
0087 #           textMime = "odt"
0088 #           return textMime
0089 #                if file_extension == ".ppt" or file_extension == ".pptx":
0090 #           pptMime = "odp"
0091 #           return pptMime
0092 #                if file_extension == ".csv" or file_extension == ".xls" or file_extension == ".xlsx":
0093 #           spreadsheetMime = "ods"
0094 #           return spreadsheetMime
0095 #       else:
0096 #           return None
0097 
# This function returns the extension of the given filename (including the leading dot); despite its name, no MIME detection is done.
def getExtByMime(filename):
    """Return the extension of ``filename`` including its leading dot.

    The result is whatever os.path.splitext reports as the extension, so it
    is the empty string when the name has none.
    """
    return os.path.splitext(filename)[1]
0102 
def getConvertExtension(extension):
    """Map a source extension (with leading dot) to its ODF target extension.

    Returns "odt", "odp" or "ods" (without a dot), or None when the
    extension is not one of the known ones.
    """
    conversions = (
        ("odt", (".odt", ".docx", ".doc", ".txt")),
        ("odp", (".odp", ".ppt", ".pptx")),
        ("ods", (".ods", ".xls", ".xlsx")),
    )
    for target, sources in conversions:
        if extension in sources:
            return target
    return None
0110 
0111 
def getApplicationName(extension):
    """Return the Calligra application name for an ODF extension.

    ``extension`` is given without a leading dot ("odt", "odp" or "ods").
    Returns None for any other value.
    """
    applications = (
        ("odt", "calligrawords"),
        ("odp", "calligrastage"),
        ("ods", "calligrasheets"),
    )
    for known, application in applications:
        if extension == known:
            return application
    return None
0119 
0120 
def getODFVersion(zip):
    """Return the office:version attribute of content.xml in an ODF archive.

    zip -- an open zipfile.ZipFile for the ODF package.

    Returns the value of the ``version`` attribute (in the ODF office
    namespace) on the root element of content.xml; callers treat a value
    that is not a key of ``schemas`` as "no version number".
    """
    member = zip.open("content.xml", "r")
    try:
        content = lxml.etree.parse(member)
    finally:
        # Release the archive member handle once parsing is done.
        member.close()
    return content.getroot().get(
        "{urn:oasis:names:tc:opendocument:xmlns:office:1.0}version")
0125 
def newGetJing():
    """Locate jing.jar in the directory named by sys.path[0].

    Returns the path to jing.jar when that file exists, otherwise None.
    """
    candidate = os.path.join(sys.path[0], os.curdir, "jing.jar")
    return candidate if os.path.isfile(candidate) else None
0132         
0133 
def getJing():
    """Return the path to a bundled jing.jar, downloading it if necessary.

    Looks for jing-20091111/bin/jing.jar below sys.path[0]. When it is not
    there, the jing zip archive is downloaded into the current directory,
    jing.jar is extracted below sys.path[0] and its SHA-1 digest is checked.

    Returns the path to jing.jar, or None when the digest does not match
    (the extracted file is deleted in that case).
    """
    jingjar = "jing-20091111/bin/jing.jar"
    # jingjar = "jing-20081028/bin/jing.jar"
    path = os.path.join(sys.path[0], jingjar)
    if os.path.isfile(path):
        return path
    print("Downloading jing.jar")
    z = "jing-20091111.zip"

    # VALID 2011-03-29
    # NOTE(review): this URL is a Google Code "downloads/detail" page, not a
    # direct file link - confirm it still serves the archive.
    # urllib.urlretrieve is the Python 2 API, matching the rest of the file.
    urllib.urlretrieve("http://code.google.com/p/jing-trang/downloads/detail?name=jing-20091111.zip", z)

    # urllib.urlretrieve("http://jing-trang.googlecode.com/files/" + z, z);
    try:
        archive = zipfile.ZipFile(z, "r")
        try:
            archive.extract(jingjar, sys.path[0])
        finally:
            # Close before unlinking so the archive is not deleted while open.
            archive.close()
    finally:
        # Remove the download even when it cannot be opened or extracted.
        os.unlink(z)

    with open(path, "rb") as f:
        digest = hashlib.sha1(f.read()).hexdigest()

    # hash for jing 2011-03-29
    if digest != "2e8eacf399249d226ad4f6ca1d6907ff69430118":
    # if digest != "60197956be7f8f2e29e1941ca42273abe7315293":
        print("Wrong hash code: wrong file.")
        os.unlink(path)
        return None
    return path
0163 
# RelaxNG schema file names per ODF version. For each version the first
# entry is used for content.xml and styles.xml, the second for
# META-INF/manifest.xml; the names are resolved relative to sys.path[0].
schemas = dict([
    ("1.0", ["OpenDocument-schema-v1.0-os.rng",
             "OpenDocument-manifest-schema-v1.0-os.rng"]),
    ("1.1", ["OpenDocument-schema-v1.1.rng",
             "OpenDocument-manifest-schema-v1.1.rng"]),
    ("1.2", ["OpenDocument-v1.2-cs01-schema-calligra.rng",
             "OpenDocument-v1.2-cs01-manifest-schema.rng"]),
])
0172 
class jingodfvalidator:
    """Validate the XML parts of an ODF package by running jing via java."""

    def __init__(self, jingjar):
        # Path to jing.jar, passed to "java -jar" for every validated file.
        self.jingjar = jingjar

    def validate(self, odfpath):
        """Validate content.xml, styles.xml and META-INF/manifest.xml.

        odfpath -- path of the ODF (zip) file to check.

        Returns an error string for the first problem found, or None when
        all three files validate.
        """
        archive = zipfile.ZipFile(odfpath, 'r')
        try:
            odfversion = getODFVersion(archive)
            if odfversion not in schemas:
                return "Document has no version number"
            documentschema = schemas[odfversion][0]
            manifestschema = schemas[odfversion][1]
            checks = (
                ('content.xml', documentschema),
                ('styles.xml', documentschema),
                ('META-INF/manifest.xml', manifestschema),
            )
            for member, schema in checks:
                err = self.validateFile(archive, member, schema)
                if err:
                    return err
            return None
        finally:
            # Always release the archive, also on early returns and errors.
            archive.close()

    def validateFile(self, zip, filepath, schema):
        """Validate one archive member against one schema file.

        zip      -- open zipfile.ZipFile containing the member.
        filepath -- name of the member inside the archive.
        schema   -- schema file name, resolved relative to sys.path[0].

        Returns an error string when the member is missing or jing exits
        with a non-zero status, otherwise None.
        """
        try:
            data = zip.read(filepath)
        except KeyError:
            # ZipFile.read raises KeyError for a name not in the archive;
            # report it like any other validation failure.
            return filepath + " is missing from the archive."
        schema = os.path.join(sys.path[0], schema)
        suffix = "_" + filepath.replace("/", "_")
        # jing needs a real file, so the member is spilled to a temp file
        # that is deleted again when it is closed.
        tmp = tempfile.NamedTemporaryFile(suffix=suffix)
        try:
            tmp.write(data)
            tmp.flush()
            args = ["java", "-jar", self.jingjar, "-i", schema, tmp.name]
            r = subprocess.call(args)
        finally:
            tmp.close()
        if r:
            return filepath + " is not valid."
        return None
0207 
def createValidator(name):
    """Build an lxml RelaxNG validator from a schema file.

    name -- schema file name, resolved relative to sys.path[0].

    Returns an lxml.etree.RelaxNG object for that schema.
    """
    schemapath = os.path.join(sys.path[0], name)
    # Close the schema file as soon as it has been parsed.
    with open(schemapath, "rb") as schemafile:
        xml = lxml.etree.parse(schemafile)
    return lxml.etree.RelaxNG(xml)
0211 
class odfvalidator:
    """Validate the XML parts of an ODF package with lxml's RelaxNG support."""

    def __init__(self):
        # One [document validator, manifest validator] pair per ODF version
        # listed in ``schemas``.
        self.validators = {}
        for key in schemas:
            self.validators[key] = [
                createValidator(schemas[key][0]),
                createValidator(schemas[key][1])
            ]

    def validate(self, odfpath):
        """Validate content.xml, styles.xml and META-INF/manifest.xml.

        odfpath -- path of the ODF (zip) file to check.

        Returns an error string on error, None otherwise.
        """
        archive = zipfile.ZipFile(odfpath, 'r')
        try:
            odfversion = getODFVersion(archive)
            if odfversion not in self.validators:
                return "Document has no version number"
            documentvalidator = self.validators[odfversion][0]
            manifestvalidator = self.validators[odfversion][1]
            checks = (
                ('content.xml', documentvalidator),
                ('styles.xml', documentvalidator),
                ('META-INF/manifest.xml', manifestvalidator),
            )
            for member, validator in checks:
                err = self.validateFile(archive, member, validator)
                if err:
                    return err
            return None
        finally:
            # Always release the archive, also on early returns and errors.
            archive.close()

    def validateFile(self, zip, file, validator):
        """Validate one archive member with the given RelaxNG validator.

        zip       -- open zipfile.ZipFile containing the member.
        file      -- name of the member inside the archive.
        validator -- object with validate(xml) and error_log, as returned
                     by createValidator.

        Returns "<file>:<reason>" when the member is missing, is not
        well-formed XML or does not validate; otherwise None.
        """
        try:
            data = zip.read(file)
        except KeyError as e:
            # Missing member: return a string like the other error paths
            # rather than the exception object itself.
            return file + ':' + str(e)
        try:
            xml = lxml.etree.XML(data)
        except lxml.etree.XMLSyntaxError as e:
            return file + ':' + str(e)
        if not validator.validate(xml):
            return file + ':' + str(validator.error_log.last_error)
        return None
0250 
0251 #
def _main(argv):
    """Run the script for ``argv`` and return the process exit status.

    Expected arguments: validate_or_not ("yes" or "no"), the directory the
    file resides in, and the file name.

    Returns 1 for wrong usage, a missing directory or a missing input file;
    otherwise the result of singleFileConvertAndValidate.
    """
    # Check the argument count first so that a short command line yields
    # the usage message instead of an IndexError.
    if len(argv) < 4 or not (argv[1] and argv[2] and argv[3]):
        print("Wrong use of script: parameters missing:")
        return 1

    if not os.path.exists(argv[2]):
        print("Wrong use of script: missing path to directory in which file resides")
        return 1

    #NOTE! sys.argv[1] is hardcoded to "no" in CMakeList.txt under /calligra/tools/scripts/
    if argv[1] == "yes":
        validate = True
    elif argv[1] == "no":
        validate = False
    else:
        print("Wrong use of script: validation neither yes or no")
        return 1

    filepath = os.path.abspath(os.path.join(argv[2], argv[3]))
    if not validate:
        # The path has only ever been echoed in "no" mode; keep that output.
        print(filepath)

    if not os.path.exists(filepath):
        # Previously this case fell through and exited with status 0.
        print("file "+filepath+" does not exist")
        return 1

    # The validator is chosen inside singleFileConvertAndValidate, so none
    # is constructed here.
    return singleFileConvertAndValidate(filepath, validate)


if __name__ == '__main__':
    sys.exit(_main(sys.argv))
0281         print "Wrong use of script: parameters missing:" 
0282         sys.exit(1)