File indexing completed on 2025-02-02 04:58:02
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# kate: replace-tabs on; indent-width 4;
# ***************************************************************************
#    copyright            : (C) 2006-2010 by Mathias Monnerville
#    email                : tellico@monnerville.com
# ***************************************************************************
#
# ***************************************************************************
# *                                                                         *
# *   This program is free software; you can redistribute it and/or modify  *
# *   it under the terms of version 2 of the GNU General Public License as  *
# *   published by the Free Software Foundation;                            *
# *                                                                         *
# ***************************************************************************
#
# Version 0.7.3: 2010-12-07 (Reported by Romain Henriet)
# * Fixed some regexp issues
# * Better handling of image parsing/fetching errors
#
# Version 0.7.2.1: 2010-07-27 (Reported by Romain Henriet)
# * Updated title match to allow searching without diacritical marks
#
# Version 0.7.2: 2010-05-27 (Reported by Romain Henriet)
# * Fixed bug preventing searches with accent marks
# * Added post-processing cleanup action to replace raw HTML entities with
#   their ISO Latin-1 replacement text
#
# Version 0.7.1: 2010-04-26 (Thanks to Romain Henriet <romain-devel@laposte.net>)
# * Fixed greedy regexp for genre. Fixed nationality output. Add studio.
#
# Version 0.7: 2009-11-12
# * Allocine has a brand new website. All regexps were broken.
#
# Version 0.6: 2009-03-04 (Thanks to R. Fischer and Henry-Nicolas Tourneur)
# * Fixed parsing issues (various RegExp issues due to allocine's HTML changes)
#
# Version 0.5: 2009-01-21 (Changes contributed by R. Fischer <fischer.tellico@free.fr>)
# * Added complete distribution of actors and roles, Genres, Nationalities, producers, composer and scenarist
# * Fixed the plot field that returned a wrong answer when no plot is available
# * Fixed a bug related to parameters encoding
#
# Version 0.4:
# * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres
#   could not be retrieved. Fixed bad http request error due to some changes in HTML code.
#
# Version 0.3:
# * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed.
#
# Version 0.2:
# * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore.
#
# Version 0.1:
# * Initial release.

"""
Tellico data source script for allocine.fr.

Searches allocine.fr for a movie title given on the command line and prints
every matching movie as a Tellico XML collection on stdout.
"""

import sys, os, re, hashlib, random, types
import urllib, time, base64
import xml.dom.minidom
import locale
try:
    import htmlentitydefs as htmlents
except ImportError:
    try:
        from html.entities import entitydefs as htmlents
    except ImportError:
        print('Python 2.5+ required')
        raise

try:
    # For Python 3.0 and later
    from html.entities import name2codepoint
except ImportError:
    # Fall back to Python 2's htmlentitydefs
    from htmlentitydefs import name2codepoint

try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen

try:
    # For Python 3.0 and later
    from urllib.parse import quote
except ImportError:
    # Fall back to Python 2's urllib
    from urllib import quote

XML_HEADER = """<?xml version="1.0" encoding="UTF-8"?>"""
DOCTYPE = """<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">"""

VERSION = "0.7.3"

# --- Python 2 / Python 3 compatibility helpers -----------------------------

_PY2 = sys.version_info[0] == 2
# Unicode text type: ``unicode`` on Python 2, ``str`` on Python 3.
_TEXT_TYPE = type(u'')
# Code point -> one-character unicode string.
_unichr = unichr if _PY2 else chr
# base64.encodestring was removed in Python 3.9; encodebytes is its successor.
_b64encode = getattr(base64, 'encodebytes', None) or base64.encodestring

# Matches a named HTML entity such as "&eacute;".
_ENTITY_RE = re.compile(r'&([A-Za-z][A-Za-z0-9]*);')

# Names of the movie-page fields whose value is a list of strings.
_LIST_FIELDS = ('dirs', 'nat', 'genres', 'studio')


def _replaceEntity(match):
    """
    re.sub callback: returns the character for a known named HTML entity,
    or the original text when the entity name is unknown.
    """
    codepoint = name2codepoint.get(match.group(1))
    if codepoint is None:
        return match.group(0)
    return _unichr(codepoint)


def _responseCharset(response):
    """
    Returns the charset announced in the Content-Type header of an urlopen()
    response, or 'utf-8' when none is announced.
    """
    info = response.info()
    charset = None
    if hasattr(info, 'get_content_charset'):
        # Python 3: email.message.Message API
        charset = info.get_content_charset()
    elif hasattr(info, 'getparam'):
        # Python 2: mimetools.Message API
        charset = info.getparam('charset')
    return charset or 'utf-8'


def _writeOutput(text):
    """
    Writes text followed by a newline to stdout, encoded as UTF-8 whenever
    stdout accepts bytes (XML_HEADER declares the document as UTF-8).
    """
    text += u'\n'
    binary = getattr(sys.stdout, 'buffer', None)
    if binary is not None:
        # Python 3 stream: bypass the locale-dependent text layer
        sys.stdout.flush()
        binary.write(text.encode('utf-8'))
        binary.flush()
    elif _PY2:
        sys.stdout.write(text.encode('utf-8'))
    else:
        # Text-only replacement stream (e.g. io.StringIO): cannot take bytes
        sys.stdout.write(text)


def genMD5():
    """
    Returns the hexadecimal MD5 digest of a random number. Used as a unique
    identifier for fetched images.
    """
    seed = str(random.random())
    # hashlib needs bytes on Python 3; the repr of a float is pure ASCII
    return hashlib.md5(seed.encode('ascii')).hexdigest()


class BasicTellicoDOM:
    """
    Minimal builder for a Tellico XML document holding one movie collection.
    """

    def __init__(self):
        self.__doc = xml.dom.minidom.Document()
        self.__root = self.__doc.createElement('tellico')
        self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
        self.__root.setAttribute('syntaxVersion', '9')

        self.__collection = self.__doc.createElement('collection')
        self.__collection.setAttribute('title', 'My Movies')
        self.__collection.setAttribute('type', '3')

        self.__fields = self.__doc.createElement('fields')
        # Add all default (standard) fields
        self.__dfltField = self.__doc.createElement('field')
        self.__dfltField.setAttribute('name', '_default')

        # Add a custom 'Collection' field
        self.__customField = self.__doc.createElement('field')
        self.__customField.setAttribute('name', 'titre-original')
        self.__customField.setAttribute('title', 'Original Title')
        self.__customField.setAttribute('flags', '0')
        # "General" in French; escaped so the source encoding does not matter
        self.__customField.setAttribute('category', u'G\xe9n\xe9ral')
        self.__customField.setAttribute('format', '1')
        self.__customField.setAttribute('type', '1')
        self.__customField.setAttribute('i18n', 'yes')

        self.__fields.appendChild(self.__dfltField)
        self.__fields.appendChild(self.__customField)
        self.__collection.appendChild(self.__fields)

        self.__images = self.__doc.createElement('images')

        self.__root.appendChild(self.__collection)
        self.__doc.appendChild(self.__root)

        # Current movie id
        self.__currentId = 0

    def __textElement(self, tag, text):
        """
        Returns a new <tag> element containing a single text node.
        """
        node = self.__doc.createElement(tag)
        node.appendChild(self.__doc.createTextNode(text))
        return node

    def __listElement(self, parentTag, childTag, values):
        """
        Returns a new <parentTag> element with one <childTag> text element
        per item of values.
        """
        parent = self.__doc.createElement(parentTag)
        for value in values:
            parent.appendChild(self.__textElement(childTag, value))
        return parent

    def addEntry(self, movieData):
        """
        Add a movie entry
        @param movieData dict with the keys 'title', 'otitle', 'year', 'time',
               'allocine' and 'plot' (strings), 'genres', 'studio', 'nat',
               'dirs', 'prods', 'scens' and 'comps' (lists of strings),
               'actors' (flat list alternating actor name and role) and
               'image' (None or an (image id, base64 text) pair)
        """
        d = movieData
        entryNode = self.__doc.createElement('entry')
        entryNode.setAttribute('id', str(self.__currentId))

        # Actors and roles are stored flat: [actor, role, actor, role, ...]
        castsNode = self.__doc.createElement('casts')
        actors = d['actors']
        for actor, role in zip(actors[0::2], actors[1::2]):
            castNode = self.__doc.createElement('cast')
            castNode.appendChild(self.__textElement('column', actor))
            castNode.appendChild(self.__textElement('column', role))
            castsNode.appendChild(castNode)

        # Child order is kept identical to the historical output
        children = (
            self.__textElement('title', d['title']),
            self.__textElement('titre-original', d['otitle']),
            self.__textElement('year', d['year']),
            self.__listElement('genres', 'genre', d['genres']),
            self.__listElement('studios', 'studio', d['studio']),
            self.__listElement('nationalitys', 'nationality', d['nat']),
            castsNode,
            self.__listElement('directors', 'director', d['dirs']),
            self.__textElement('running-time', d['time']),
            self.__textElement(u'allocin\xe9-link', d['allocine']),
            self.__textElement('plot', d['plot']),
            self.__listElement('producers', 'producer', d['prods']),
            self.__listElement('composers', 'composer', d['comps']),
            self.__listElement('writers', 'writer', d['scens']),
        )
        for child in children:
            entryNode.appendChild(child)

        image = d.get('image')
        if image:
            imageId, imageData = image[0], image[1]
            imageNode = self.__textElement('image', imageData)
            imageNode.setAttribute('format', 'JPEG')
            imageNode.setAttribute('id', imageId)
            imageNode.setAttribute('width', '120')
            imageNode.setAttribute('height', '160')

            entryNode.appendChild(self.__textElement('cover', imageId))
            self.__images.appendChild(imageNode)

        self.__collection.appendChild(entryNode)
        self.__currentId += 1

    def printXML(self):
        """
        Outputs XML content to stdout
        """
        self.__collection.appendChild(self.__images)
        _writeOutput(u'\n'.join((XML_HEADER, DOCTYPE, self.__root.toxml())))


class AlloCineParser:
    """
    Scrapes allocine.fr search, movie and casting pages with regular
    expressions and feeds the results into a BasicTellicoDOM.
    """

    def __init__(self):
        self.__baseURL = 'http://www.allocine.fr'
        self.__basePath = '/film/fichefilm_gen_cfilm'
        self.__castPath = '/film/casting_gen_cfilm'
        self.__searchURL = 'http://www.allocine.fr/recherche/?q=%s'
        self.__movieURL = self.__baseURL + self.__basePath
        self.__castURL = self.__baseURL + self.__castPath

        # Define some regexps
        self.__regExps = {
            'title': '<div id="title.*?<span.*?>(?P<title>.+?)</span>',
            'dirs': """alis.*?par.*?<a.*?><span.*?>(?P<step1>.+?)</span></a>""",
            'nat': 'Nationalit.*?</span>(?P<nat>.+?)</td',
            'genres': '<span class="lighten">.*?Genre.*?</span>(?P<step1>.+?)</td',
            'studio': 'Distributeur</div>(?P<step1>.+?)</td',
            'time': 'Dur.*?e *?:*?.*?(?P<hours>[0-9])h *(?P<mins>[0-9]*).*?Ann',
            'year': 'Ann.*?e de production.*?<span.*?>(?P<year>[0-9]{4})</span>',
            'otitle': 'Titre original *?:*?.*?<td>(?P<otitle>.+?)</td>',
            'plot': '<p itemprop="description">(?P<plot>.*?)</p>',
            'image': '<div class="poster">.*?<img src=\'(?P<image>http://.+?)\'.?',
        }

        # Raw strings: same patterns as before, without invalid "\/" escapes
        self.__castRegExps = {
            # Previous pattern, kept for reference:
            # 'roleactor': '<li.*?itemprop="actors".*?>.*?<span itemprop="name">(.*?)</span>.*?<p>.*?R.*?le : (?P<role>.*?)</p>.*?</li>',
            'roleactor': r'<li.*?\/personne\/.*?">(.*?)</span>.*?<p.*?R.*?le : (?P<role>.*?)</p>.*?</li',
            'prods': r'<td>[\r\n\t]*Producteur[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
            'scens': r'<td>[\r\n\t]*Sc.*?nariste[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
            'comps': r'<td>[\r\n\t]*Compositeur[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
        }

        self.__domTree = BasicTellicoDOM()

    def run(self, title):
        """
        Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree
        to stdout (in tellico format) so that tellico can use it.
        @param title Movie title to search for (byte string or unicode text)
        """
        # the script needs the search string to be encoded in utf-8
        if isinstance(title, bytes):
            # Python 2 command-line arguments are byte strings
            try:
                # first try system encoding
                encoding = getattr(sys.stdin, 'encoding', None) or sys.getdefaultencoding()
                title = title.decode(encoding)
            except UnicodeDecodeError:
                # on failure, fallback to 'latin-1'
                title = title.decode('latin-1')

        # now encode for urllib
        title = title.encode('utf-8')
        self.__getMovie(title)
        # Print results to stdout
        self.__domTree.printXML()

    def __getHTMLContent(self, url):
        """
        Fetch HTML data from url and store it, decoded to text, in self.__data
        """
        response = urlopen(url)
        try:
            raw = response.read()
            charset = _responseCharset(response)
        finally:
            response.close()

        try:
            self.__data = raw.decode(charset, 'replace')
        except LookupError:
            # Server announced a charset Python does not know
            self.__data = raw.decode('utf-8', 'replace')

    def __fetchMovieLinks(self, title):
        """
        Retrieve all links related to movie from the search page in self.__data
        @param title Movie title (currently unused: every result is kept)
        @return list of (page, name) pairs, or None when nothing was found
        """
        pattern = r"""<td.*?class=['"]totalwidth['"]>.*?<a *href=['"]%s=(?P<page>.*?\.html?)['"] *?>(?P<title>.*?)</a>""" % self.__basePath
        matchList = []
        for page, rawName in re.findall(pattern, self.__data, re.S | re.I):
            name = re.sub(r'([\r\n]+|<b>|</b>)', '', rawName)
            name = re.sub(r'<.*?>', '', name)
            name = re.sub(r'^ *', '', name)
            if name:
                matchList.append((page, name))

        return matchList or None

    def __parseMovieFields(self, html):
        """
        Extracts every text field of self.__regExps (all but 'image') from a
        movie page.
        @param html Movie page as text
        @return dict of field name -> value; unmatched fields are '' (or []
                for the list-valued fields)
        """
        data = {}
        for name, regexp in self.__regExps.items():
            if name == 'image':
                # Handled separately by __fetchImage
                continue

            match = re.search(regexp, html, re.S | re.I)
            if not match:
                data[name] = [] if name in _LIST_FIELDS else ''
            elif name == 'dirs':
                names = re.sub('</?a.*?>', '', match.group('step1')).split(',')
                data[name] = [n.strip() for n in names]
            elif name == 'nat':
                found = re.findall(r'<span class=".*?">(.*?)</span>', match.group('nat'), re.DOTALL)
                data[name] = [n.strip().capitalize() for n in found]
            elif name == 'genres':
                found = re.findall(r'<span itemprop="genre">(.*?)</span>', match.group('step1'), re.DOTALL)
                data[name] = [n.strip().capitalize() for n in found]
            elif name == 'studio':
                found = re.findall(r'<span itemprop="productionCompany">(.*?)</span>', match.group('step1'))
                data[name] = [n.strip() for n in found]
            elif name == 'time':
                # Running time is stored as a total number of minutes
                hours, mins = match.group('hours'), match.group('mins')
                data[name] = str(int(hours) * 60 + (int(mins) if mins else 0))
            elif name == 'otitle':
                otitle = re.sub(r'([\r\n]+|<em>|</em>)', '', match.group('otitle'))
                data[name] = otitle.strip()
            elif name == 'plot':
                # Cleans up any HTML entities
                data[name] = self.__cleanUp(match.group('plot').strip())
            else:
                # 'title' and 'year': the group is named after the field
                data[name] = match.group(name).strip()

        return data

    def __fetchImage(self, html):
        """
        Downloads the poster referenced by a movie page.
        @param html Movie page as text
        @return (image id, base64 text) pair, or None when the page has no
                poster or the download fails
        """
        found = re.findall(self.__regExps['image'], html, re.S | re.I)
        if not found:
            return None

        try:
            response = urlopen(found[0].strip())
            try:
                raw = response.read()
            finally:
                response.close()
        except Exception:
            # Deliberate best-effort: a missing poster must not lose the entry
            return None

        # Base64 encoding (as text, so it can go into an XML text node)
        return (genMD5() + '.jpeg', _b64encode(raw).decode('ascii'))

    def __parseCast(self, html, data):
        """
        Fills the 'actors', 'prods', 'scens' and 'comps' entries of data from
        a casting page.
        @param html Casting page as text
        @param data dict to update; 'actors' is a flat list alternating actor
               name and role
        """
        for kind in ('actors', 'prods', 'scens', 'comps'):
            data[kind] = []

        # Only look at the page from the actors section onwards
        section = re.search(r'Acteurs et actrices.*$', html, re.S | re.I)
        if not section:
            return
        subset = section.group(0)

        # Actors
        for actor, role in re.findall(self.__castRegExps['roleactor'], subset, re.S | re.I):
            data['actors'].append(re.sub(r'([\r\n\t]+)', '', actor))
            data['actors'].append(re.sub(r'([\r\n\t]+)', '', role))

        # Producers, Scenarists, Composers
        for kind in ('prods', 'scens', 'comps'):
            found = re.findall(self.__castRegExps[kind], subset, re.S | re.I)
            data[kind] = [re.sub(r'([\r\n\t]+)', '', k).strip() for k in found]

    def __fetchMovieInfo(self, url, url2):
        """
        Looks for movie information
        @param url URL of the movie page
        @param url2 URL of the casting page
        @return dict ready for BasicTellicoDOM.addEntry, except for the
                'allocine' key which the caller adds
        """
        self.__getHTMLContent(url)
        data = self.__parseMovieFields(self.__data)

        # Image check
        data['image'] = self.__fetchImage(self.__data)

        # Now looks for casting information
        self.__getHTMLContent(url2)
        self.__parseCast(self.__data, data)

        return data

    def __cleanUp(self, data):
        """
        Cleans up the string(s), replacing named HTML entities with the
        characters they stand for. Unknown entities are left untouched.
        @param data string or list of strings
        @return cleaned text, or a new list of cleaned texts; any other type
                is returned unchanged
        """
        if isinstance(data, list):
            return [self.__cleanUp(item) for item in data]
        if isinstance(data, bytes):
            data = data.decode('utf-8', 'replace')
        if isinstance(data, _TEXT_TYPE):
            # Single pass, so "&amp;lt;" becomes "&lt;" and not "<"
            return _ENTITY_RE.sub(_replaceEntity, data)
        return data

    def __getMovie(self, title):
        """
        Searches for title and adds every movie found to the DOM tree
        @param title Movie title, utf-8 encoded
        """
        if not len(title): return

        self.__title = title
        self.__getHTMLContent(self.__searchURL % quote(self.__title))

        # Get all links
        links = self.__fetchMovieLinks(title)
        if not links:
            return None

        # Now retrieve info
        for entry in links:
            movieURL = "%s=%s" % (self.__movieURL, entry[0])
            data = self.__fetchMovieInfo(url = movieURL, url2 = "%s=%s" % (self.__castURL, entry[0]))
            # Add allocine link (custom field)
            data['allocine'] = movieURL
            self.__domTree.addEntry(data)


def showUsage():
    """
    Prints the command-line usage and exits with status 1
    """
    print("Usage: %s movietitle" % sys.argv[0])
    sys.exit(1)

def main():
    """
    Script entry point: searches for the title given as first argument
    """
    if len(sys.argv) < 2:
        showUsage()

    parser = AlloCineParser()
    parser.run(sys.argv[1])

if __name__ == '__main__':
    main()