File indexing completed on 2024-05-12 16:28:27

0001 #!/usr/bin/env python
0002 
0003 import sys, os, tempfile, subprocess, lxml.etree, zipfile, urllib, hashlib
0004 
def getODFVersion(zip):
    """Return the office:version attribute of the package's content.xml.

    zip is an open zipfile.ZipFile.  (The parameter name shadows the builtin
    but is kept because it is part of the function's signature.)

    Returns the attribute value of the root element, or None when the root
    element has no office:version attribute.  A KeyError raised by
    ZipFile.open (no content.xml in the archive) or a parse error raised by
    lxml propagates to the caller.
    """
    member = zip.open("content.xml", "r")
    try:
        content = lxml.etree.parse(member)
    finally:
        # Close the archive member stream instead of leaking it.
        member.close()
    return content.getroot().get(
        "{urn:oasis:names:tc:opendocument:xmlns:office:1.0}version")
0009 
def getJing():
    """Return the path to jing.jar, downloading it on first use.

    The jar is expected at jing-20091111/bin/jing.jar below sys.path[0].
    When it is not there, the jing release zip is downloaded into the current
    directory, the jar is extracted from it and its SHA-1 digest is compared
    with the pinned value.

    Returns the path to the jar, or None when the jar could not be obtained
    or its digest does not match (a jar with a wrong digest is deleted).
    """
    jingjar = "jing-20091111/bin/jing.jar"
    path = os.path.join(sys.path[0], jingjar)
    if os.path.isfile(path):
        return path
    print("Downloading jing.jar")
    z = "jing-20091111.zip"
    try:
        urllib.urlretrieve("http://jing-trang.googlecode.com/files/" + z, z)
        archive = zipfile.ZipFile(z, "r")
        try:
            archive.extract(jingjar, sys.path[0])
        finally:
            # Close before unlinking: an open archive cannot be removed on
            # every platform.
            archive.close()
    except (IOError, zipfile.BadZipfile, KeyError) as e:
        # Download failed, the payload is not a zip archive, or the archive
        # does not contain the jar.  Report it and return None, just like
        # the digest mismatch below, instead of aborting the script.
        print("Could not obtain jing.jar: " + str(e))
        return None
    finally:
        # Never leave the downloaded archive behind, even on failure.
        if os.path.exists(z):
            os.unlink(z)
    with open(path, "rb") as f:
        digest = hashlib.sha1(f.read()).hexdigest()
    if digest != "daa0cf7b1679264f8e68171f7f253255794773f7":
        print("Wrong hash code: wrong file.")
        os.unlink(path)
        return None
    return path
0031 
# RelaxNG schema file names per ODF version.  Each entry is
# [document schema, manifest schema]; the file names are joined with
# sys.path[0] where they are used.
schemas = {
    "1.0": [
        "OpenDocument-schema-v1.0-os.rng",
        "OpenDocument-manifest-schema-v1.0-os.rng",
    ],
    "1.1": [
        "OpenDocument-schema-v1.1.rng",
        "OpenDocument-manifest-schema-v1.1.rng",
    ],
    "1.2": [
        "OpenDocument-v1.2-cs01-schema-calligra.rng",
        "OpenDocument-v1.2-cs01-manifest-schema.rng",
    ],
}
0040 
def getScriptPath():
    """Return the directory part of the resolved path of sys.argv[0].

    sys.argv[0] is passed through os.path.realpath before the directory
    component is taken.
    """
    script = os.path.realpath(sys.argv[0])
    return os.path.dirname(script)
0043 
class jingodfvalidator:
    """Validate ODF documents by running the jing RelaxNG validator in java.

    Zip packages have their XML members validated one by one; a file that is
    not a zip archive is validated as a flat XML document.  Validation always
    runs on a temporary copy of the XML, because foreign elements and
    attributes are stripped from that copy before jing is started.
    """

    def __init__(self, jingjar):
        """Store the path to jing.jar and load removeForeign.xsl.

        The stylesheet is looked up in the directory returned by
        getScriptPath().
        """
        self.jingjar = jingjar
        self.xmlparser = lxml.etree.XMLParser()
        xsltpath = os.path.join(getScriptPath(), "removeForeign.xsl")
        self.removeForeignXSLT = self.loadXSLT(xsltpath)

    def validate(self, odfpath):
        """Validate the document at odfpath.

        Returns an error string for the first problem found, None otherwise.
        """
        try:
            package = zipfile.ZipFile(odfpath, 'r')
        except zipfile.BadZipfile:
            # Not a zip package: validate as flat XML and pass the result on
            # (it used to be discarded, hiding every flat XML failure).
            return self.validateFlatXML(odfpath)
        try:
            odfversion = getODFVersion(package)
            if odfversion not in schemas:
                return "Document has no version number"
            docschema = schemas[odfversion][0]
            manifestschema = schemas[odfversion][1]
            members = (
                ('content.xml', docschema),
                ('styles.xml', docschema),
                ('META-INF/manifest.xml', manifestschema),
                ('meta.xml', docschema),
                ('settings.xml', docschema),
            )
            for member, schema in members:
                err = self.validateFile(package, member, schema)
                if err:
                    return err
            return None
        finally:
            package.close()

    def validateFlatXML(self, filepath):
        """Validate a flat XML document against the ODF 1.2 document schema.

        The document is copied to a temporary file first, so that stripping
        foreign elements never modifies the file at filepath.

        Returns an error string when the document is not valid, None otherwise.
        """
        with open(filepath, "rb") as source:
            data = source.read()
        suffix = "_" + os.path.basename(filepath)
        return self._validateData(data, filepath, suffix, schemas["1.2"][0])

    def validateFile(self, zip, filepath, schema):
        """Validate member filepath of the open ZipFile zip against schema.

        schema is a schema file name relative to sys.path[0].

        Returns an error string when the member is missing or not valid,
        None otherwise.
        """
        try:
            data = zip.read(filepath)
        except KeyError as e:
            # The archive has no such member; report it instead of crashing.
            return str(e)
        suffix = "_" + filepath.replace("/", "_")
        return self._validateData(data, filepath, suffix, schema)

    def _validateData(self, data, label, suffix, schema):
        """Write data to a temporary file and validate that file with jing.

        label names the document in the returned error string; suffix is the
        temporary file's name suffix; schema is relative to sys.path[0].
        """
        schema = os.path.join(sys.path[0], schema)
        tmp = tempfile.NamedTemporaryFile(suffix=suffix)
        try:
            tmp.write(data)
            # Make sure the bytes are on disk before other code opens the
            # file by name.
            tmp.flush()
            status = self.validateXML(schema, tmp.name)
        except lxml.etree.XMLSyntaxError as e:
            # Raised while loading the XML for foreign-element removal.
            return label + ':' + str(e)
        finally:
            tmp.close()
        if status:
            return label + " is not valid."
        return None

    def loadXML(self, filepath):
        """Parse the file at filepath with this validator's XML parser."""
        with open(filepath, 'rb') as xmlfile:
            return lxml.etree.parse(xmlfile, self.xmlparser)

    def loadXSLT(self, filepath):
        """Load the stylesheet at filepath as an lxml XSLT transformer.

        The transformer is created with network reads and file writes
        disabled.
        """
        xsl = self.loadXML(filepath)
        ac = lxml.etree.XSLTAccessControl(read_network=False, write_file=False)
        return lxml.etree.XSLT(xsl, access_control=ac)

    def removeForeign(self, filepath):
        """Apply removeForeign.xsl to the file at filepath, IN PLACE.

        The transformed document overwrites filepath, so only pass paths of
        temporary copies.
        """
        xml = self.loadXML(filepath)
        xml = self.removeForeignXSLT(xml)
        xml.write(filepath)

    # Validate the XML and optionally remove the foreign elements and
    # attributes first. Calligra currently writes ODF 1.2 Extended, which is
    # allowed to contain foreign elements and attributes. If Calligra adds a
    # mode to save plain ODF 1.2, the validator should not remove them when
    # validating.
    def validateXML(self, schema, xmlpath, removeForeign = True):
        """Run jing on xmlpath with the RelaxNG schema file schema.

        When removeForeign is true, xmlpath is first rewritten in place by
        removeForeign().

        Returns the exit status of the java process.
        """
        if removeForeign:
            self.removeForeign(xmlpath)

        args = ["java", "-jar", self.jingjar, "-i", schema, xmlpath]
        return subprocess.call(args)
0123 
def createValidator(name):
    """Return an lxml RelaxNG validator built from the schema file name.

    name is a file name relative to sys.path[0].  Errors from opening or
    parsing the file propagate to the caller.
    """
    with open(os.path.join(sys.path[0], name), "rb") as schemafile:
        # The file handle used to be left open; parse inside the with block.
        xml = lxml.etree.parse(schemafile)
    return lxml.etree.RelaxNG(xml)
0127 
class odfvalidator:
    """Validate ODF zip packages with lxml's RelaxNG support.

    content.xml and styles.xml are checked against the document schema of
    the package's ODF version, META-INF/manifest.xml against the manifest
    schema.
    """

    def __init__(self):
        """Create a [document, manifest] validator pair per ODF version."""
        self.validators = {}
        for version, names in schemas.items():
            self.validators[version] = [
                createValidator(names[0]),
                createValidator(names[1])
            ]

    # returns error string on error, None otherwise
    def validate(self, odfpath):
        """Validate the ODF package at odfpath.

        Returns an error string for the first problem found, None otherwise.
        """
        try:
            package = zipfile.ZipFile(odfpath, 'r')
        except zipfile.BadZipfile as e:
            # Honour the "error string on error" contract for non-zip input.
            return odfpath + ':' + str(e)
        try:
            odfversion = getODFVersion(package)
            if odfversion not in schemas:
                return "Document has no version number"
            docvalidator = self.validators[odfversion][0]
            manifestvalidator = self.validators[odfversion][1]
            members = (
                ('content.xml', docvalidator),
                ('styles.xml', docvalidator),
                ('META-INF/manifest.xml', manifestvalidator),
            )
            for member, validator in members:
                err = self.validateFile(package, member, validator)
                if err:
                    return err
            return None
        finally:
            package.close()

    def validateFile(self, zip, file, validator):
        """Validate member file of the open ZipFile zip with validator.

        Returns an error string when the member is missing, is not
        well-formed XML or does not validate; None otherwise.
        """
        try:
            xml = lxml.etree.XML(zip.read(file))
        except lxml.etree.XMLSyntaxError as e:
            return file + ':' + str(e)
        except KeyError as e:
            # zip.read found no such member.  Return the message as a string
            # rather than the exception object.
            return str(e)
        if not validator.validate(xml):
            return file + ':' + str(validator.error_log.last_error)
        return None
0166 
if __name__ == '__main__':
    # Use the jing based validator when getJing() yields a jar path,
    # otherwise use the lxml based one.
    jingjar = getJing()
    validator = jingodfvalidator(jingjar) if jingjar else odfvalidator()
    for odfpath in sys.argv[1:]:
        # Arguments that do not name an existing file are skipped silently.
        if not os.path.isfile(odfpath):
            continue
        problem = validator.validate(odfpath)
        if problem:
            print(str(problem))