-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIMDB_retriever.py
455 lines (374 loc) · 18.7 KB
/
IMDB_retriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
import json
import logging
import lxml
from google.appengine.api import memcache
from models import Movie, Artist, TasteMovie, TasteArtist, TasteGenre
from utilities import get, RetrieverError, BASE_URL_MYAPIFILMS, GENRES, clear_url
def _split_names(names):
    """
    Split a comma-separated string of names into a list of clean names.
    :param names: comma-separated names, or None
    :type names: string
    :return: the non-empty names, stripped of surrounding whitespace
    :rtype: list of string
    """
    if not names:
        return []
    return [name.strip() for name in names.split(',') if name.strip()]


def _filmtv_id_from_url(movie_url):
    """
    Extract the movie id from a FilmTV movie page URL.
    :param movie_url: URL of the FilmTV page of the movie
    :type movie_url: string
    :return: the URL segment between offset 26 and the next "/"
    :rtype: string
    :raise ValueError: if movie_url is empty or None
    """
    if not movie_url:
        raise ValueError('movie_url is required to build a movie from FilmTV data')
    # Presumably 26 == len('http://www.filmtv.it/film/') -- TODO confirm against the caller.
    start = 26
    end = movie_url.find('/', start)
    if end == -1:
        # No further slash: the id runs up to the end of the URL.
        end = len(movie_url)
    return movie_url[start:end]


def _is_same_director(movie_director, imdb_directors):
    """
    Tell whether the FilmTV director matches the first IMDB director.
    :param movie_director: director name coming from FilmTV, or None
    :type movie_director: string
    :param imdb_directors: 'directors' entry of the IMDB result
    :type imdb_directors: list of dict
    :return: True when one name is contained in the other
    :rtype: bool
    """
    if movie_director is None or not imdb_directors:
        # Nothing to compare: treat the IMDB result as a different movie.
        return False
    imdb_name = imdb_directors[0]['name'].encode('utf-8')
    return movie_director in imdb_name or imdb_name in movie_director


def _movie_from_imdb(imdb_data, movie_title, movie_original_title):
    """
    Build a Movie (and its artists) from a MYAPIFILMS result.
    :param imdb_data: one movie entry of the MYAPIFILMS answer
    :type imdb_data: dict
    :param movie_title: title of the film
    :type movie_title: string
    :param movie_original_title: original title of the film
    :type movie_original_title: string
    :return: the movie, not stored yet
    :rtype: Movie
    """
    movie = Movie(id=imdb_data['idIMDB'],
                  plot=imdb_data['plot'],
                  # Same guard as retrieve_movie_from_id: no poster means None.
                  poster=clear_url(imdb_data['urlPoster']) if imdb_data.get('urlPoster') else None,
                  rated=imdb_data['rated'],
                  simple_plot=imdb_data['simplePlot'],
                  genres=imdb_data['genres'])
    try:
        movie.trailer = imdb_data['trailer']['videoURL']
    except (KeyError, TypeError):
        # No trailer entry, or the entry is not a mapping.
        movie.trailer = None
    movie.title = movie_title
    movie.original_title = movie_original_title
    run_times = imdb_data['runtime']
    movie.run_times = run_times[0] if run_times else None
    year = imdb_data['year']
    if len(year) > 4:
        # Keep only the last four characters of a longer year string.
        year = year[-4:]
    movie.year = year
    retrieve_artists(movie, imdb_data['actors'], imdb_data['directors'], imdb_data['writers'])
    return movie


def _movie_from_filmtv(movie_url, movie_original_title, movie_title, movie_year, movie_genre,
                       movie_cast, movie_director):
    """
    Build a Movie from the FilmTV data only, looking up every artist by name.
    :return: the movie, not stored yet
    :rtype: Movie
    :raise ValueError: if movie_url is empty or None
    :raise RetrieverError: if an artist lookup fails (see search_artist_from_name)
    """
    movie = Movie(id=_filmtv_id_from_url(movie_url),
                  year=movie_year,
                  original_title=movie_original_title,
                  title=movie_title,
                  genres=[movie_genre] if movie_genre is not None else [])
    for actor_name in _split_names(movie_cast):
        search_artist_from_name(actor_name, movie)
    for director_name in _split_names(movie_director):
        # A non-None third argument makes the artist a director of the movie.
        search_artist_from_name(director_name, movie, director_name)
    return movie


def _add_filmtv_plot(movie, movie_url):
    """
    Set movie.plot_it from the FilmTV page; the plot is left untouched when the page has none.
    :param movie: movie to update
    :type movie: Movie
    :param movie_url: URL of the FilmTV page of the movie
    :type movie_url: string
    """
    import lxml.html  # a bare `import lxml` does not load the html submodule
    logging.info('Url FilmTV: %s', movie_url)
    html_page_plot = get(movie_url).encode('utf-8')
    tree = lxml.html.fromstring(html_page_plot)
    try:
        movie.plot_it = tree.xpath('//article[@class="scheda-desc"]/p/text()')[0]
    except IndexError:
        logging.error('Impossible to retrieve info from FilmTV')


def retrieve_movie_from_title(movie_original_title, movie_director, movie_cast, movie_title=None, movie_url=None,
                              movie_year=None, movie_genre=None):
    """
    Retrieve movie info from IMDB by movie title.

    The IMDB result is used only when its first director matches movie_director;
    otherwise the movie is built from the FilmTV data passed by the caller.
    :param movie_original_title: original title of the film to retrieve info
    :type movie_original_title: string
    :param movie_director: director of the film to retrieve info (comma-separated if more than one)
    :type movie_director: string
    :param movie_cast: comma-separated names of the actors
    :type movie_cast: string
    :param movie_title: title of the film to retrieve info
    :type movie_title: string
    :param movie_url: URL of the FilmTV page of the film
    :type movie_url: string
    :param movie_year: year of the film, sent as seasonYear ("0" when None)
    :type movie_year: string
    :param movie_genre: genre of the film to retrieve info
    :type movie_genre: string
    :return: Movie's key
    :rtype: ndb.Key
    :raise RetrieverError: if an artist lookup fails while building the movie from FilmTV data
    :raise ValueError: if the FilmTV data must be used and movie_url is missing
    """
    logging.info('Retrieving %s', movie_original_title)
    # The other MYAPIFILMS calls in this module send seasonYear=0 when no year applies.
    season_year = '0' if movie_year is None else str(movie_year)
    # NOTE(review): the API token is hard-coded in the URL; consider moving it to configuration.
    # NOTE(review): the title is not URL-encoded here -- confirm whether `get` takes care of it.
    url = BASE_URL_MYAPIFILMS + 'imdb?title=' + movie_original_title + '&format=JSON&aka=0&business=0&seasons=0&seasonYear=' + season_year + '&technical=0&filter=M&exactFilter=0&limit=1&lang=en-us&actors=S&biography=0&trailer=1&uniqueName=0&filmography=0&bornDied=0&starSign=0&actorActress=0&actorTrivia=0&movieTrivia=0&awards=0&token=307cccfe-d20b-4b69-b976-d6a024538864'
    logging.info('Url My API Films: %s', url)
    json_data = json.loads(get(url).encode('utf-8'))

    imdb_data = None
    if not isinstance(json_data, list) or not json_data:
        # If it is not a (non-empty) list there is a problem
        logging.info('Movie not found in IMDB.')
    elif _is_same_director(movie_director, json_data[0].get('directors')):
        imdb_data = json_data[0]
    else:
        logging.info("FilmTV movie is not the same with retrieved movie in IMDB!")

    if imdb_data is not None:
        movie = _movie_from_imdb(imdb_data, movie_title, movie_original_title)
    else:
        movie = _movie_from_filmtv(movie_url, movie_original_title, movie_title, movie_year,
                                   movie_genre, movie_cast, movie_director)

    if movie_url:
        _add_filmtv_plot(movie, movie_url)

    key = movie.put()
    logging.info('Retrieved %s', movie_original_title)
    return key
def retrieve_movie_from_id(movie_id):
    """
    Retrieve movie info from IMDB by movie id.
    :param movie_id: IMDB id of the film to retrieve info
    :type movie_id: string
    :return: Movie's key
    :rtype: ndb.Key
    :raise RetrieverError: if the MYAPIFILMS answer does not describe a movie
    """
    logging.info('Retrieving %s', movie_id)
    # NOTE(review): the API token is hard-coded in the URL; consider moving it to configuration.
    url = BASE_URL_MYAPIFILMS + 'imdb?idIMDB=' + movie_id + '&format=JSON&aka=1&business=0&seasons=0&seasonYear=0&technical=0&filter=N&exactFilter=0&limit=1&lang=en-us&actors=S&biography=0&trailer=1&uniqueName=0&filmography=0&bornDied=0&starSign=0&actorActress=0&actorTrivia=0&movieTrivia=0&awards=0&token=307cccfe-d20b-4b69-b976-d6a024538864'
    json_data = json.loads(get(url).encode('utf-8'))

    if not isinstance(json_data, dict) or 'idIMDB' not in json_data:
        # Same error reporting as search_artist_from_name: forward the
        # 'code' and 'message' of the answer when they are available.
        error = json_data if isinstance(json_data, dict) else {}
        raise RetrieverError(error.get('code'), error.get('message'))

    movie = Movie(id=json_data['idIMDB'],
                  plot=json_data['plot'],
                  poster=clear_url(json_data['urlPoster']) if ('urlPoster' in json_data and json_data['urlPoster'] != "") else None,
                  rated=json_data['rated'],
                  simple_plot=json_data['simplePlot'],
                  genres=json_data['genres'])

    try:
        movie.trailer = json_data['trailer']['videoURL']
    except (KeyError, TypeError):
        # No trailer entry, or the entry is not a mapping.
        movie.trailer = None

    movie.original_title = json_data['title']

    # The localized title is the one of the Italian "also known as" entry;
    # when several entries are Italian the last one wins.
    for aka in json_data.get('akas') or []:
        if aka.get('country') == 'Italy':
            movie.title = aka['title']

    run_times = json_data['runtime']
    movie.run_times = run_times[0] if run_times else None

    year = json_data['year']
    if len(year) > 4:
        # Keep only the last four characters of a longer year string.
        year = year[-4:]
    movie.year = year

    key = movie.put()

    retrieve_artists(movie, json_data['actors'], json_data['directors'], json_data['writers'])

    logging.info('Retrieved %s', movie_id)
    return key
def retrieve_artists(movie, actors_list, directors_list, writers_list):
    """
    Retrieve all artist in a movie from actors, directors and writers lists.

    Every artist is stored and then attached to the movie. A list that is
    None is treated as empty.
    :param movie: Movie object in order to add actors, directors and writers
    :type movie: Movie
    :param actors_list: list of actors (keys 'actorId', 'actorName', optional 'urlPhoto')
    :type actors_list: list of dict
    :param directors_list: list of directors (keys 'nameId', 'name')
    :type directors_list: list of dict
    :param writers_list: list of writers (keys 'nameId', 'name')
    :type writers_list: list of dict
    """
    for actor_data in actors_list or []:
        # Same convention as retrieve_artist_from_id: no photo means None.
        photo = clear_url(actor_data['urlPhoto']) if actor_data.get('urlPhoto') else None
        actor = Artist(id=actor_data['actorId'],
                       name=actor_data['actorName'],
                       photo=photo)
        actor.put()
        movie.add_actor(actor)

    for director_data in directors_list or []:
        director = Artist(id=director_data['nameId'],
                          name=director_data['name'])
        director.put()
        movie.add_director(director)

    for writer_data in writers_list or []:
        writer = Artist(id=writer_data['nameId'],
                        name=writer_data['name'])
        writer.put()
        movie.add_writer(writer)
def search_artist_from_name(artist_name, movie=None, director_name=None):
    """
    Look up an artist on IMDB by name, store it and optionally attach it to a movie.
    :param artist_name: name of the artist to search for
    :type artist_name: string
    :param movie: movie the artist is attached to, if any
    :type movie: Movie
    :param director_name: when not None the artist is attached as a director, otherwise as an actor
    :type director_name: string
    :return: Artist's key
    :rtype: ndb.Key
    :raise RetrieverError: if the MYAPIFILMS answer is not a list
    """
    query_url = BASE_URL_MYAPIFILMS + 'imdb?name=' + artist_name + '&format=JSON&filmography=0&limit=1&lang=en-us&exactFilter=0&bornDied=0&starSign=0&uniqueName=0&actorActress=0&actorTrivia=0&token=307cccfe-d20b-4b69-b976-d6a024538864'
    response = json.loads(get(query_url).encode('utf-8'))

    # A non-list answer carries the error 'code' and 'message'.
    if not isinstance(response, list):
        raise RetrieverError(response['code'], response['message'])

    try:
        photo_url = clear_url(response[0]['urlPhoto'])
    except Exception:
        logging.info("Photo not found")
        # NOTE(review): this is the string "None", not the None object.
        photo_url = "None"

    best_match = response[0]
    found = Artist(id=best_match['idIMDB'],
                   name=best_match['name'],
                   photo=photo_url)

    if movie is not None:
        if director_name is None:
            movie.add_actor(found)
        else:
            movie.add_director(found)

    return found.put()
def retrieve_artist_from_id(artist_id):
    """
    Retrieve artist info from IMDB by id of artist.
    :param artist_id: id of the artist to retrieve info
    :type artist_id: string
    :return: Artist's key
    :rtype: ndb.Key
    :raise RetrieverError: if the MYAPIFILMS answer does not describe an artist
    """
    logging.info('Retrieving %s', artist_id)
    # NOTE(review): the API token is hard-coded in the URL; consider moving it to configuration.
    url = BASE_URL_MYAPIFILMS + 'imdb?idName=' + artist_id + '&format=JSON&filmography=0&lang=en-us&bornDied=0&starSign=0&uniqueName=0&actorActress=0&actorTrivia=0&actorPhotos=N&actorVideos=N&salary=0&spouses=0&tradeMark=0&personalQuotes=0&token=307cccfe-d20b-4b69-b976-d6a024538864'
    json_data = json.loads(get(url).encode('utf-8'))

    if not isinstance(json_data, dict) or 'idIMDB' not in json_data:
        # Same error reporting as search_artist_from_name: forward the
        # 'code' and 'message' of the answer when they are available.
        error = json_data if isinstance(json_data, dict) else {}
        raise RetrieverError(error.get('code'), error.get('message'))

    artist = Artist(id=json_data["idIMDB"],
                    name=json_data["name"],
                    photo=clear_url(json_data["urlPhoto"]) if ('urlPhoto' in json_data and json_data['urlPhoto'] != "") else None)
    return artist.put()
def retrieve_suggest_list(user, query):
    """
    Build the suggestions (movies, artists and genres) for a partial query.

    The raw answer of the IMDB suggest service is cached in memcache under the
    lower-cased query. An empty query yields empty lists without any lookup.
    :param user: user asking for suggestions, used to read the stored tastes
    :type user: User
    :param query: text typed by the user
    :type query: string
    :return: dict with keys "query", "movies", "artists" and "genres"
    :rtype: dict
    """
    if not query:
        # Nothing to suggest; this also avoids indexing query[0] on an empty string.
        return {"query": query, "movies": [], "artists": [], "genres": []}

    cache_key = query.lower()
    json_page = memcache.get(cache_key)
    if json_page is None:
        url = 'http://sg.media-imdb.com/suggests/' + query[0] + '/' + query + '.json'
        json_page = get(url).encode('utf-8')
        # Store under the very same key used for the lookup above.
        memcache.add(cache_key, json_page)

    value = 'imdb$' + query
    user_id = user.key.id()
    movies = []
    artists = []
    genres = []

    try:
        # The payload is wrapped: drop len('imdb$' + query) + 1 leading characters and
        # the last one (presumably a JSONP call of the form imdb$<query>(...) -- TODO confirm).
        json_data = json.loads(json_page[len(value) + 1:-1])
        # An answer without the 'd' key simply carries no suggestion.
        for elem in json_data.get('d', []):
            if elem['id'].startswith('http://'):
                logging.info("Error the element is a link: %s", elem['id'])
                continue

            if 'q' not in elem:
                # Elements without a 'q' entry are reported as artists.
                try:
                    idIMDB = elem['id']
                    taste_artist = TasteArtist.get_by_id(idIMDB + user_id)
                    artists.append({"name": elem['l'].encode('utf-8') if elem['l'] is not None else None,
                                    "idIMDB": elem['id'],
                                    "photo": clear_url(elem['i'][0]) if 'i' in elem else 'null',
                                    "tasted": 1 if (taste_artist is not None and taste_artist.added) else 0})
                except KeyError as err:
                    logging.error("Error in the JSON: %s", err)
            elif elem['q'] == "feature":
                # Only elements tagged "feature" are reported as movies.
                try:
                    idIMDB = elem['id']
                    taste_movie = TasteMovie.get_by_id(idIMDB + user_id)  # Get taste
                    movies.append({"originalTitle": elem['l'].encode('utf-8') if elem['l'] is not None else None,
                                   "title": None,
                                   "idIMDB": idIMDB,
                                   "poster": clear_url(elem['i'][0]) if 'i' in elem else 'null',
                                   "year": str(elem['y']),
                                   "tasted": 1 if (taste_movie is not None and taste_movie.added) else 0})
                except KeyError as err:
                    logging.error("Error in the JSON: %s", err)
    except ValueError as err:
        # Best effort: keep whatever was collected so far.
        logging.warning("Unable to read the suggestions for %s: %s", query, err)

    for genre in GENRES:
        if genre.lower().startswith(cache_key):
            # Single datastore lookup per matching genre.
            taste_genre = TasteGenre.get_by_id(genre + user_id)
            genres.append({"name": genre,
                           "tasted": 1 if (taste_genre is not None and taste_genre.added) else 0})

    return {"query": query, "movies": movies, "artists": artists, "genres": genres}
def retrieve_search_result_list(user, query):
    """
    Search movies, artists and genres matching a query.

    The raw MYAPIFILMS answers are cached in memcache under "movies" and
    "artists" followed by the lower-cased query.
    :param user: user running the search, used to read the stored tastes
    :type user: User
    :param query: text to search for
    :type query: string
    :return: dict with keys "query", "movies", "artists" and "genres"
    :rtype: dict
    """
    lowered_query = query.lower()
    user_id = user.key.id()

    movies_key = "movies" + lowered_query
    movies_page = memcache.get(movies_key)
    if movies_page is None:
        movies_page = get(BASE_URL_MYAPIFILMS + 'imdb?title=' + query + '&format=JSON&aka=0&business=0&seasons=0&seasonYear=0&technical=0&filter=M&exactFilter=0&limit=5&lang=it-it&actors=N&biography=0&trailer=0&uniqueName=0&filmography=0&bornDied=0&starSign=0&actorActress=0&actorTrivia=0&movieTrivia=0&awards=0&moviePhotos=N&movieVideos=N&token=307cccfe-d20b-4b69-b976-d6a024538864').encode('utf-8')
        memcache.add(movies_key, movies_page)

    artists_key = "artists" + lowered_query
    artists_page = memcache.get(artists_key)
    if artists_page is None:
        artists_page = get(BASE_URL_MYAPIFILMS + 'imdb?name=' + query + '&format=JSON&filmography=0&limit=5&lang=it-it&exactFilter=0&bornDied=0&starSign=0&uniqueName=0&actorActress=0&actorTrivia=0&actorPhotos=N&actorVideos=N&salary=0&spouses=0&tradeMark=0&personalQuotes=0&token=307cccfe-d20b-4b69-b976-d6a024538864').encode('utf-8')
        # Store under the very same key used for the lookup above.
        memcache.add(artists_key, artists_page)

    movies = []
    artists = []
    genres = []

    try:
        json_data = json.loads(movies_page)
        logging.debug('Movies search result: %s', json_data)
        if isinstance(json_data, list):  # If it is not a list there is a problem
            for elem in json_data:
                idIMDB = elem['idIMDB']
                taste_movie = TasteMovie.get_by_id(idIMDB + user_id)  # Get taste
                # Title and original title fall back on each other when empty.
                movies.append({"title": elem['title'].encode('utf-8') if elem['title'] != "" else elem['originalTitle'].encode('utf-8'),
                               "originalTitle": elem['originalTitle'].encode('utf-8') if elem['originalTitle'] != "" else elem['title'].encode('utf-8'),
                               "idIMDB": idIMDB,
                               "poster": clear_url(elem['urlPoster']) if ('urlPoster' in elem and elem['urlPoster'] != "") else None,
                               "year": str(elem['year']),
                               "tasted": 1 if (taste_movie is not None and taste_movie.added) else 0})
    except ValueError as err:
        # Best effort: keep whatever was collected so far.
        logging.warning("Unable to read the movies found for %s: %s", query, err)

    try:
        json_data = json.loads(artists_page)
        logging.debug('Artists search result: %s', json_data)
        if isinstance(json_data, list):  # If it is not a list there is a problem
            for elem in json_data:
                idIMDB = elem['idIMDB']
                taste_artist = TasteArtist.get_by_id(idIMDB + user_id)
                artists.append({"name": elem['name'].encode('utf-8') if elem['name'] != "" else None,
                                "idIMDB": idIMDB,
                                "photo": clear_url(elem['urlPhoto']) if ('urlPhoto' in elem and elem['urlPhoto'] != "") else None,
                                "tasted": 1 if (taste_artist is not None and taste_artist.added) else 0})
    except ValueError as err:
        # Best effort: keep whatever was collected so far.
        logging.warning("Unable to read the artists found for %s: %s", query, err)

    for genre in GENRES:
        # Case-insensitive prefix match, as in retrieve_suggest_list.
        if genre.lower().startswith(lowered_query):
            # Single datastore lookup per matching genre.
            taste_genre = TasteGenre.get_by_id(genre + user_id)
            genres.append({"name": genre,
                           "tasted": 1 if (taste_genre is not None and taste_genre.added) else 0})

    return {"query": query, "movies": movies, "artists": artists, "genres": genres}
'''
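# TODO: unfinished draft, kept for reference. The `language` parameter is not used yet and `key` is returned without ever being assigned (the movie is never stored).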
def retrieve_movie_title_from_id(movie_id, language):
"""
Retrieve movie info from IMDB by movie id.
:param movie_id: original title of the film to retrieve info
:type movie_id: string
:return: Movie's key
:rtype: ndb.Key
:raise RetrieverError: if there is an error from MYAPIFILMS
"""
logging.info('Retrieving %s', movie_id)
url = BASE_URL_MYAPIFILMS + 'imdb?idIMDB=' + movie_id + '&format=JSON&aka=1&business=0&seasons=0&seasonYear=0&technical=0&filter=N&exactFilter=0&limit=1&lang=en-us&actors=S&biography=0&trailer=0&uniqueName=0&filmography=0&bornDied=0&starSign=0&actorActress=0&actorTrivia=0&movieTrivia=0&awards=0'
json_page = get(url).encode('utf-8')
json_data = json.loads(json_page)
movie = Movie(id=json_data['idIMDB'],
plot=json_data['plot'],
poster=clear_url(json_data['urlPoster']) if ('urlPoster' in json_data and json_data['urlPoster'] != "") else None,
rated=json_data['rated'],
simple_plot=json_data['simplePlot'],
genres=json_data['genres'])
movie.original_title = json_data['title']
akas = json_data['akas']
for aka in akas:
if aka['country'] == 'Italy':
movie.title = aka['title']
return key
'''