fetch/scripts/fr.allocine.py

0001 #!/usr/bin/env python
0002 # -*- coding: iso-8859-1 -*-
0003 # kate: replace-tabs off;
0004 # ***************************************************************************
0005 #    copyright            : (C) 2006-2010 by Mathias Monnerville
0006 #    email                : tellico@monnerville.com
0007 # ***************************************************************************
0008 #
0009 # ***************************************************************************
0010 # *                                                                         *
0011 # *   This program is free software; you can redistribute it and/or modify  *
0012 # *   it under the terms of version 2 of the GNU General Public License as  *
0013 # *   published by the Free Software Foundation;                            *
0014 # *                                                                         *
0015 # ***************************************************************************
0016 #
0017 # Version 0.7.3: 2010-12-07 (Reported by Romain Henriet)
0018 # * Fixed some regexp issues
0019 # * Better handling of image parsing/fetching errors
0020 #
0021 # Version 0.7.2.1: 2010-07-27 (Reported by Romain Henriet)
0022 # * Updated title match to allow searching without diacritical marks
0023 #
0024 # Version 0.7.2: 2010-05-27 (Reported by Romain Henriet)
0025 # * Fixed bug preventing searches with accent marks
0026 # * Added post-processing cleanup action to replace raw HTML entities with
0027 #   their ISO Latin-1 replacement text
0028 #
0029 # Version 0.7.1: 2010-04-26 (Thanks to Romain Henriet <romain-devel@laposte.net>)
0030 # * Fixed greedy regexp for genre.  Fixed nationality output. Add studio.
0031 #
0032 # Version 0.7: 2009-11-12
0033 # * Allocine has a brand new website. All regexps were broken.
0034 #
0035 # Version 0.6: 2009-03-04 (Thanks to R. Fischer and Henry-Nicolas Tourneur)
0036 # * Fixed parsing issues (various RegExp issues due to allocine's HTML changes)
0037 #
0038 # Version 0.5: 2009-01-21 (Changes contributed by R. Fischer <fischer.tellico@free.fr>)
0039 # * Added complete distribution of actors and roles, Genres, Nationalities, producers, composer and scenarist
0040 # * Fixed the plot field that returned a wrong answer when no plot is available
0041 # * Fixed a bug related to parameters encoding
0042 #
0043 # Version 0.4:
0044 # * Fixed parsing errors: some fields in allocine's HTML pages have changed recently. Multiple actors and genres
0045 # could not be retrieved. Fixed bad http request error due to some changes in HTML code.
0046 #
0047 # Version 0.3:
0048 # * Fixed parsing: some fields in allocine's HTML pages have changed. Movie's image could not be fetched anymore. Fixed.
0049 #
0050 # Version 0.2:
0051 # * Fixed parsing: allocine's HTML pages have changed. Movie's image could not be fetched anymore.
0052 #
0053 # Version 0.1:
0054 # * Initial release.
0055
0056 import sys, os, re, hashlib, random, types
0057 import urllib, time, base64
0058 import xml.dom.minidom
0059 import locale
0060 try:
0061     import htmlentitydefs as htmlents
0062 except ImportError:
0063     try:
0064         from html.entities import entitydefs as htmlents
0065     except ImportError:
0066         print('Python 2.5+ required')
0067         raise
0068
0069 try:
0070     # For Python 3.0 and later
0071     from urllib.request import urlopen
0072 except ImportError:
0073     # Fall back to Python 2's urllib2
0074     from urllib2 import urlopen
0075
# Prolog lines printed ahead of the serialized <tellico> element.
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'
DOCTYPE = '<!DOCTYPE tellico PUBLIC "-//Robby Stephenson/DTD Tellico V9.0//EN" "http://periapsis.org/tellico/dtd/v9/tellico.dtd">'

# Release number of this script (matches the latest changelog entry above).
VERSION = '0.7.3'
0080
def genMD5():
    """
    Return the hexadecimal MD5 digest (32 characters) of a random number.

    The digest is only used as a unique-looking identifier; it is not meant
    to be cryptographically meaningful.
    """
    seed = str(random.random())
    # hashlib only accepts bytes on Python 3; encoding an ASCII-only str is
    # harmless on Python 2, so this works on both.
    return hashlib.md5(seed.encode('ascii')).hexdigest()
0084
class BasicTellicoDOM:
    """
    Builds the Tellico XML document (syntax version 9) holding the movies.

    Movies are appended with addEntry(); printXML() writes the finished
    document to stdout.
    """

    # Byte-string and text types of the running interpreter
    # (str/unicode on Python 2, bytes/str on Python 3).
    _BYTES = type(u''.encode('ascii'))
    _TEXT = type(u'')

    def __init__(self):
        """
        Create the document skeleton: root, collection, field definitions
        and the (initially detached) images container.
        """
        self.__doc = xml.dom.minidom.Document()
        self.__root = self.__doc.createElement('tellico')
        self.__root.setAttribute('xmlns', 'http://periapsis.org/tellico/')
        self.__root.setAttribute('syntaxVersion', '9')

        self.__collection = self.__doc.createElement('collection')
        self.__collection.setAttribute('title', 'My Movies')
        self.__collection.setAttribute('type', '3')

        self.__fields = self.__doc.createElement('fields')
        # Add all default (standard) fields
        self.__dfltField = self.__doc.createElement('field')
        self.__dfltField.setAttribute('name', '_default')

        # Add a custom 'Original Title' field
        self.__customField = self.__doc.createElement('field')
        self.__customField.setAttribute('name', 'titre-original')
        self.__customField.setAttribute('title', 'Original Title')
        self.__customField.setAttribute('flags', '0')
        # Written with escapes ("General" with two e-acute) so the value does
        # not depend on the encoding the source file is saved in.
        self.__customField.setAttribute('category', u'G\xe9n\xe9ral')
        self.__customField.setAttribute('format', '1')
        self.__customField.setAttribute('type', '1')
        self.__customField.setAttribute('i18n', 'yes')

        self.__fields.appendChild(self.__dfltField)
        self.__fields.appendChild(self.__customField)
        self.__collection.appendChild(self.__fields)

        # Attached to the collection only when the document is printed, so
        # that it ends up after all the entries.
        self.__images = self.__doc.createElement('images')

        self.__root.appendChild(self.__collection)
        self.__doc.appendChild(self.__root)

        # Id given to the next entry
        self.__currentId = 0

    @classmethod
    def _toText(cls, value):
        """
        Coerce value to a text string accepted by minidom text nodes.

        None becomes the empty string; byte strings are decoded as UTF-8,
        falling back to Latin-1 (which never fails); anything else is
        converted with the text type constructor.
        """
        if value is None:
            return u''
        if isinstance(value, cls._TEXT):
            return value
        if isinstance(value, cls._BYTES):
            try:
                return value.decode('utf-8')
            except UnicodeDecodeError:
                return value.decode('latin-1')
        return cls._TEXT(value)

    @classmethod
    def _asList(cls, values):
        """
        Normalize a multi-valued field: empty/missing gives [], a bare string
        is treated as a single item, any other iterable is copied to a list.
        """
        if not values:
            return []
        if isinstance(values, (cls._TEXT, cls._BYTES)):
            return [values]
        return list(values)

    def __textElement(self, tag, value):
        """Return a new <tag> element containing value as its only text node."""
        node = self.__doc.createElement(tag)
        node.appendChild(self.__doc.createTextNode(self._toText(value)))
        return node

    def __listElement(self, groupTag, itemTag, values):
        """Return a <groupTag> element with one <itemTag> child per value."""
        group = self.__doc.createElement(groupTag)
        for value in self._asList(values):
            group.appendChild(self.__textElement(itemTag, value))
        return group

    def addEntry(self, movieData):
        """
        Add a movie entry

        @param movieData dict describing the movie. Recognized keys:
               'title', 'otitle', 'year', 'time', 'allocine', 'plot' (strings);
               'genres', 'studio', 'nat', 'dirs', 'prods', 'scens', 'comps'
               (lists of strings); 'actors' (flat list alternating actor name
               and role); 'image' (None or a (image id, base64 data) pair).
               Missing keys are treated as empty values.
        """
        d = movieData
        entryNode = self.__doc.createElement('entry')
        entryNode.setAttribute('id', str(self.__currentId))

        # The cast is a two-column table: actor name, role.
        castsNode = self.__doc.createElement('casts')
        actors = self._asList(d.get('actors'))
        for i in range(0, len(actors), 2):
            # A trailing actor without a role gets an empty role instead of
            # raising IndexError.
            role = actors[i + 1] if i + 1 < len(actors) else ''
            castNode = self.__doc.createElement('cast')
            castNode.appendChild(self.__textElement('column', actors[i]))
            castNode.appendChild(self.__textElement('column', role))
            castsNode.appendChild(castNode)

        # Children are listed in the order they are written to the entry.
        children = [
            self.__textElement('title', d.get('title')),
            self.__textElement('titre-original', d.get('otitle')),
            self.__textElement('year', d.get('year')),
            self.__listElement('genres', 'genre', d.get('genres')),
            self.__listElement('studios', 'studio', d.get('studio')),
            self.__listElement('nationalitys', 'nationality', d.get('nat')),
            castsNode,
            self.__listElement('directors', 'director', d.get('dirs')),
            self.__textElement('running-time', d.get('time')),
            # "allocine-link" with an e-acute, escaped for the same reason as
            # the category attribute in __init__.
            self.__textElement(u'allocin\xe9-link', d.get('allocine')),
            self.__textElement('plot', d.get('plot')),
            self.__listElement('producers', 'producer', d.get('prods')),
            self.__listElement('composers', 'composer', d.get('comps')),
            self.__listElement('writers', 'writer', d.get('scens')),
        ]
        for child in children:
            entryNode.appendChild(child)

        image = d.get('image')
        if image:
            imageId, imageData = image[0], image[1]
            imageNode = self.__doc.createElement('image')
            imageNode.setAttribute('format', 'JPEG')
            imageNode.setAttribute('id', self._toText(imageId))
            imageNode.setAttribute('width', '120')
            imageNode.setAttribute('height', '160')
            imageNode.appendChild(self.__doc.createTextNode(self._toText(imageData)))

            # The entry references the image through its id.
            entryNode.appendChild(self.__textElement('cover', imageId))
            self.__images.appendChild(imageNode)

        self.__collection.appendChild(entryNode)
        self.__currentId += 1

    def printXML(self):
        """
        Outputs XML content to stdout
        """
        self.__collection.appendChild(self.__images)
        body = self.__root.toxml()
        if not isinstance(body, str):
            # Python 2 only: toxml() returned a unicode object; encode it
            # explicitly so printing does not depend on the stdout encoding.
            body = body.encode('utf-8')
        print(XML_HEADER)
        print(DOCTYPE)
        print(body)
0238
0239
class AlloCineParser:
    """
    Searches allocine.fr for a movie title, scrapes the page of every search
    result with regular expressions and feeds the results to a
    BasicTellicoDOM, which is finally printed to stdout.

    All page text is handled as the interpreter's native str type: raw bytes
    on Python 2 (as before), decoded text on Python 3.
    """

    __baseURL   = 'http://www.allocine.fr'
    __basePath  = '/film/fichefilm_gen_cfilm'
    __castPath  = '/film/casting_gen_cfilm'
    __searchURL = 'http://www.allocine.fr/recherche/?q=%s'
    __movieURL  = __baseURL + __basePath
    __castURL   = __baseURL + __castPath

    # Regexps applied to the movie page (all searched with re.S | re.I)
    __regExps = {
        'title'     : r'<div id="title.*?<span.*?>(?P<title>.+?)</span>',
        'dirs'      : r'alis.*?par.*?<a.*?><span.*?>(?P<step1>.+?)</span></a>',
        'nat'       : r'Nationalit.*?</span>(?P<nat>.+?)</td',
        'genres'    : r'<span class="lighten">.*?Genre.*?</span>(?P<step1>.+?)</td',
        'studio'    : r'Distributeur</div>(?P<step1>.+?)</td',
        'time'      : r'Dur.*?e *?:*?.*?(?P<hours>[0-9])h *(?P<mins>[0-9]*).*?Ann',
        'year'      : r'Ann.*?e de production.*?<span.*?>(?P<year>[0-9]{4})</span>',
        'otitle'    : r'Titre original *?:*?.*?<td>(?P<otitle>.+?)</td>',
        'plot'      : r'<p itemprop="description">(?P<plot>.*?)</p>',
        'image'     : r"""<div class="poster">.*?<img src='(?P<image>http://.+?)'.?""",
    }

    # Fields of __regExps whose value is a list of strings
    __listFields = ('dirs', 'nat', 'genres', 'studio')

    # Regexps applied to the casting page (all searched with re.S | re.I)
    __castRegExps = {
        'roleactor' : r'<li.*?\/personne\/.*?">(.*?)</span>.*?<p.*?R.*?le : (?P<role>.*?)</p>.*?</li',
        'prods'     : r'<td>[\r\n\t]*Producteur[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
        'scens'     : r'<td>[\r\n\t]*Sc.*?nariste[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
        'comps'     : r'<td>[\r\n\t]*Compositeur[\r\n\t]*</td>.*?<span.*?>(.*?)</span>',
    }

    def __init__(self):
        """
        Create the parser with an empty page buffer and a fresh output document.
        """
        # Content of the last page fetched by __getHTMLContent
        self.__data = ''
        self.__domTree = BasicTellicoDOM()

    def run(self, title):
        """
        Runs the allocine.fr parser: fetch movie related links, then fills and prints the DOM tree
        to stdout (in tellico format) so that tellico can use it.

        @param title Movie title, as received on the command line
        """
        # the search string must be UTF-8 encoded before being URL-quoted
        if not isinstance(title, type(u'')):
            # Python 2 hands over raw bytes: first try system encoding
            encoding = getattr(sys.stdin, 'encoding', None) or sys.getdefaultencoding()
            try:
                title = title.decode(encoding)
            except UnicodeDecodeError:
                # on failure, fallback to 'latin-1'
                title = title.decode('latin-1')

        # now encode for urllib
        title = title.encode('utf-8')
        self.__getMovie(title)
        # Print results to stdout
        self.__domTree.printXML()

    def __getHTMLContent(self, url):
        """
        Fetch HTML data from url and store it in self.__data

        @param url Address of the page to download
        """
        u = urlopen(url)
        try:
            raw = u.read()
            charset = None
            if not isinstance(raw, str):
                # Python 3: ask the response headers for the declared charset
                getCharset = getattr(u.headers, 'get_content_charset', None)
                if getCharset:
                    charset = getCharset()
        finally:
            u.close()

        if not isinstance(raw, str):
            # Python 3 returns bytes, which str regexps cannot search.
            try:
                raw = raw.decode(charset or 'utf-8')
            except (UnicodeDecodeError, LookupError):
                raw = raw.decode('latin-1')
        self.__data = raw

    def __fetchMovieLinks(self, title):
        """
        Retrieve all links related to movie from the search page held in self.__data

        @param title Movie title (currently unused: results are not filtered by title)
        @return list of (page, name) pairs, or None when nothing was found
        """
        pattern = r"""<td.*?class=['"]totalwidth['"]>.*?<a *href=['"]%s=(?P<page>.*?\.html?)['"] *?>(?P<title>.*?)</a>""" % re.escape(self.__basePath)
        matchList = []
        for page, rawName in re.findall(pattern, self.__data, re.S | re.I):
            # Drop line breaks and markup, then leading spaces
            name = re.sub(r'([\r\n]+|<b>|</b>)', '', rawName)
            name = re.sub(r'<.*?>', '', name)
            name = name.lstrip(' ')
            if name:
                matchList.append((page, name))

        if not matchList:
            return None
        return matchList

    def __parseMovieInfo(self, html):
        """
        Extract the fields described by __regExps (except the image) from a movie page.

        @param html Content of the movie page
        @return dict with one key per field; unmatched fields are '' (or []
                for the list fields 'dirs', 'nat', 'genres' and 'studio')
        """
        data = {}
        for name, regexp in self.__regExps.items():
            if name == 'image':
                # Needs a download; handled by __fetchImage
                continue

            match = re.search(regexp, html, re.S | re.I)
            if not match:
                if name in self.__listFields:
                    data[name] = []
                else:
                    data[name] = ''
                continue

            if name == 'dirs':
                names = re.sub('</?a.*?>', '', match.group('step1')).split(',')
                value = [n.strip() for n in names]
            elif name == 'nat':
                found = re.findall(r'<span class=".*?">(.*?)</span>', match.group('nat'), re.DOTALL)
                value = [n.strip().capitalize() for n in found]
            elif name == 'genres':
                found = re.findall(r'<span itemprop="genre">(.*?)</span>', match.group('step1'), re.DOTALL)
                value = [n.strip().capitalize() for n in found]
            elif name == 'studio':
                found = re.findall(r'<span itemprop="productionCompany">(.*?)</span>', match.group('step1'))
                value = [n.strip() for n in found]
            elif name == 'time':
                # Running time is reported in minutes; the minutes part may be absent
                hours, mins = match.group('hours'), match.group('mins')
                value = str(int(hours) * 60 + int(mins or 0))
            elif name == 'otitle':
                value = re.sub(r'([\r\n]+|<em>|</em>)', '', match.group('otitle')).strip()
            else:
                # 'title', 'year' and 'plot' capture a group named like the field
                value = match.group(name).strip()

            # Cleans up any HTML entities
            data[name] = self.__cleanUp(value)
        return data

    def __fetchImage(self, html):
        """
        Download the poster referenced by a movie page.

        @param html Content of the movie page
        @return (image id, base64 encoded data) pair, or None when the page
                has no poster or the download fails
        """
        found = re.search(self.__regExps['image'], html, re.S | re.I)
        if not found:
            return None

        # Best effort: a broken poster must not abort the whole lookup.
        try:
            imObj = urlopen(found.group('image').strip())
            try:
                img = imObj.read()
            finally:
                imObj.close()

            # base64.encodestring was removed in Python 3.9
            encode = getattr(base64, 'encodebytes', None) or base64.encodestring
            encoded = encode(img)
            if not isinstance(encoded, str):
                encoded = encoded.decode('ascii')
            return (genMD5() + '.jpeg', encoded)
        except Exception:
            return None

    def __parseCasting(self, html):
        """
        Extract actors/roles, producers, scenarists and composers from a casting page.

        @param html Content of the casting page
        @return dict with keys 'actors' (flat list alternating actor and
                role), 'prods', 'scens' and 'comps'
        """
        data = {'actors': [], 'prods': [], 'scens': [], 'comps': []}

        # Only the part of the page starting at the actors heading is scanned
        subset = re.search(r'Acteurs et actrices.*$', html, re.S | re.I)
        if not subset:
            return data
        subset = subset.group(0)

        # Actors
        for actor, role in re.findall(self.__castRegExps['roleactor'], subset, re.S | re.I):
            data['actors'].append(re.sub(r'([\r\n\t]+)', '', actor))
            data['actors'].append(re.sub(r'([\r\n\t]+)', '', role))

        # Producers, Scenarists, Composers
        for kind in ('prods', 'scens', 'comps'):
            found = re.findall(self.__castRegExps[kind], subset, re.S | re.I)
            data[kind] = [re.sub(r'([\r\n\t]+)', '', k).strip() for k in found]

        return data

    def __fetchMovieInfo(self, url, url2):
        """
        Looks for movie information

        @param url Address of the movie page
        @param url2 Address of the casting page
        @return dict of movie fields (see __parseMovieInfo and __parseCasting)
                plus 'image' (see __fetchImage)
        """
        self.__getHTMLContent(url)
        data = self.__parseMovieInfo(self.__data)
        data['image'] = self.__fetchImage(self.__data)

        # Now looks for casting information
        self.__getHTMLContent(url2)
        data.update(self.__parseCasting(self.__data))
        return data

    def __cleanUp(self, data):
        """
        Cleans up the string(s), replacing named HTML entities with the
        replacement text of the standard library entity table. Each entity is
        decoded once, in a single pass, so '&amp;lt;' becomes '&lt;'.
        Unknown entities are left untouched.

        @param data string or list of strings
        @return cleaned copy of data; other types are returned unchanged
        """
        # 'htmlents' is either the htmlentitydefs module or the entity dict
        # itself, depending on which import at the top of the file succeeded.
        table = getattr(htmlents, 'entitydefs', htmlents)

        def decode(text):
            return re.sub(r'&(\w+);', lambda m: table.get(m.group(1), m.group(0)), text)

        if isinstance(data, list):
            return [decode(s) for s in data]
        if isinstance(data, (type(''), type(u''))):
            return decode(data)
        return data

    def __getMovie(self, title):
        """
        Search for title and add one entry per search result to the DOM tree.

        @param title UTF-8 encoded movie title
        """
        if not title:
            return None

        # urllib.quote moved to urllib.parse in Python 3
        try:
            from urllib.parse import quote
        except ImportError:
            from urllib import quote

        self.__title = title
        self.__getHTMLContent(self.__searchURL % quote(self.__title))

        # Get all links
        links = self.__fetchMovieLinks(title)
        if not links:
            return None

        # Now retrieve info
        for page, _name in links:
            movieURL = "%s=%s" % (self.__movieURL, page)
            data = self.__fetchMovieInfo(url=movieURL, url2="%s=%s" % (self.__castURL, page))
            # Add allocine link (custom field)
            data['allocine'] = movieURL
            self.__domTree.addEntry(data)
        return None
0461
0462
def showUsage():
    """
    Print the command-line synopsis and exit with status 1.

    The message goes to stderr: stdout carries the XML result that Tellico
    parses, so it must not receive diagnostic text.
    """
    sys.stderr.write("Usage: %s movietitle\n" % sys.argv[0])
    sys.exit(1)
0466
def main():
    """
    Command-line entry point: look up the movie title given as first
    argument, or show the usage message when no argument was passed.
    """
    args = sys.argv[1:]
    if not args:
        showUsage()

    AlloCineParser().run(args[0])
0473
# Run the fetcher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()