From a7e103470c74fedf227967ffff4e96d792635b58 Mon Sep 17 00:00:00 2001 From: mfloto <60036186+mfloto@users.noreply.github.com> Date: Mon, 4 Dec 2023 03:30:18 +0100 Subject: [PATCH 1/4] feat: Add bulk insert for movies and their popularity + genres --- insert.py | 59 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/insert.py b/insert.py index 5e18b6a..c8d0aa2 100644 --- a/insert.py +++ b/insert.py @@ -1,4 +1,5 @@ import datetime +from typing import List, NamedTuple, Tuple import psycopg2 from themoviedb.schemas import Movie, Person, Genre, Company @@ -6,30 +7,46 @@ from themoviedb.schemas.languages import Language url = f"postgresql://postgres:postgres@49.13.1.33:5333/movie_db" + +class MoviePopularity(NamedTuple): + movie_id: int + popularity: float + vote_average: float + + +class MovieChange(NamedTuple): + movie_id: int + datapoint: str + count: int + + +class MovieGenre(NamedTuple): + movie_id: int + genre_id: int + + conn = psycopg2.connect(url) cursor = conn.cursor() -def insert_movie(movie: Movie): +def insert_movie(movies: List[Movie]): sql = """INSERT INTO movies (id, title, original_title, imdb_id, overview, tagline, release_date, runtime, budget, revenue, adult, video, backdrop_path, poster_path, homepage, status, original_language) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO NOTHING""" - movie_data = ( - movie.id, movie.title, movie.original_title, movie.imdb_id, movie.overview, movie.tagline, - movie.release_date, movie.runtime, movie.budget, movie.revenue, movie.adult, movie.video, - movie.backdrop_path, movie.poster_path, movie.homepage, movie.status, movie.original_language - ) + movies_data = [(movie.id, movie.title, movie.original_title, movie.imdb_id, movie.overview, movie.tagline, + movie.release_date, movie.runtime, movie.budget, movie.revenue, movie.adult, movie.video, + movie.backdrop_path, movie.poster_path, movie.homepage, movie.status, movie.original_language) + for movie in movies] - cursor.execute(sql, movie_data) + cursor.executemany(sql, movies_data) conn.commit() - insert_movie_popularity(movie.id, movie.popularity, movie.vote_average) - for genre in movie.genres: - insert_genre(genre) - insert_movie_genres(movie.id, genre.id) + insert_movie_popularity([MoviePopularity(movie.id, movie.popularity, movie.vote_average) for movie in movies]) + insert_genre([genre for movie in movies for genre in movie.genres]) + insert_movie_genres([(movie.id, genre.id) for movie in movies for genre in movie.genres]) for company in movie.production_companies: insert_production_company(company) @@ -44,15 +61,17 @@ def insert_movie(movie: Movie): insert_movie_spoken_languages(movie.id, language.iso_639_1) -def insert_movie_popularity(movie_id: int, popularity: float, vote_average: float): +def insert_movie_popularity(popularity_data: List[MoviePopularity]): sql = """INSERT INTO movies_popularity (movie_id, popularity, vote_average, date) VALUES (%s, %s, %s, %s) ON CONFLICT (movie_id, date) DO NOTHING""" - movie_popularity_data = (movie_id, popularity, vote_average, datetime.date.today()) + movie_popularity_data = [ + (popularity.movie_id, popularity.popularity, popularity.vote_average, datetime.date.today()) for popularity in + popularity_data] - cursor.execute(sql, movie_popularity_data) + cursor.executemany(sql, movie_popularity_data) conn.commit() @@ -68,27 +87,23 @@ def insert_movie_change(movie_id: int, datapoint: str, count: int): conn.commit() -def insert_genre(genre: Genre): +def insert_genre(genres: List[Genre]): sql = """INSERT INTO genres (id, name) VALUES (%s, %s) ON CONFLICT (id) DO NOTHING""" - genre_data = (genre.id, genre.name) - - cursor.execute(sql, genre_data) + cursor.executemany(sql, genres) conn.commit() -def insert_movie_genres(movie_id: int, genre_id: int): +def insert_movie_genres(movie_genres: List[Tuple[int, int]]): sql = """INSERT INTO moviegenres (movie_id, genre_id) VALUES (%s, %s) ON CONFLICT (movie_id, genre_id) DO NOTHING""" - movie_genre_data = (movie_id, genre_id) - - cursor.execute(sql, movie_genre_data) + cursor.execute(sql, movie_genres) conn.commit() From 28dfce43a89dd1186dfcb7fd61cb31e7ec9eff08 Mon Sep 17 00:00:00 2001 From: mfloto <60036186+mfloto@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:58:49 +0100 Subject: [PATCH 2/4] feat: Add bulk insert for countries, companies, languages --- insert.py | 81 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/insert.py b/insert.py index c8d0aa2..0c6b313 100644 --- a/insert.py +++ b/insert.py @@ -6,7 +6,6 @@ from themoviedb.schemas.countries import Country from themoviedb.schemas.languages import Language -url = f"postgresql://postgres:postgres@49.13.1.33:5333/movie_db" class MoviePopularity(NamedTuple): movie_id: int @@ -25,6 +24,21 @@ class MovieGenre(NamedTuple): genre_id: int +class MovieProductionCompany(NamedTuple): + movie_id: int + production_company_id: int + + +class MovieProductionCountry(NamedTuple): + movie_id: int + iso_3166_1: str + + +class MovieSpokenLanguage(NamedTuple): + movie_id: int + iso_639_1: str + + conn = psycopg2.connect(url) cursor = conn.cursor() @@ -44,21 +58,23 @@ def insert_movie(movies: List[Movie]): cursor.executemany(sql, movies_data) conn.commit() + # TODO: Should be faster without list comprehension when iterating over movies once insert_movie_popularity([MoviePopularity(movie.id, movie.popularity, movie.vote_average) for movie in movies]) insert_genre([genre for movie in movies for genre in movie.genres]) - insert_movie_genres([(movie.id, genre.id) for movie in movies for genre in movie.genres]) + insert_movie_genres([MovieGenre(movie.id, genre.id) for movie in movies for genre in movie.genres]) - for company in movie.production_companies: - insert_production_company(company) - insert_movie_production_companies(movie.id, company.id) + insert_production_company([company for movie in movies for company in movie.production_companies]) + insert_movie_production_companies( + [MovieProductionCompany(movie.id, company.id) for movie in movies for company in movie.production_companies]) - for country in movie.production_countries: - insert_country(country) - insert_movie_production_countries(movie.id, country.iso_3166_1) + insert_country([country for movie in movies for country in movie.production_countries]) + insert_movie_production_countries( + [MovieProductionCountry(movie.id, country.iso_3166_1) for movie in movies for country in + movie.production_countries]) - for language in movie.spoken_languages: - insert_spoken_language(language) - insert_movie_spoken_languages(movie.id, language.iso_639_1) + insert_spoken_language([language for movie in movies for language in movie.spoken_languages]) + insert_movie_spoken_languages( + [MovieSpokenLanguage(movie.id, language.iso_639_1) for movie in movies for language in movie.spoken_languages]) def insert_movie_popularity(popularity_data: List[MoviePopularity]): @@ -93,91 +109,88 @@ def insert_genre(genres: List[Genre]): VALUES (%s, %s) ON CONFLICT (id) DO NOTHING""" - cursor.executemany(sql, genres) + cursor.executemany(sql, [(genre.id, genre.name) for genre in genres]) conn.commit() -def insert_movie_genres(movie_genres: List[Tuple[int, int]]): +def insert_movie_genres(movie_genres: List[MovieGenre]): sql = """INSERT INTO moviegenres (movie_id, genre_id) VALUES (%s, %s) ON CONFLICT (movie_id, genre_id) DO NOTHING""" - cursor.execute(sql, movie_genres) + cursor.executemany(sql, movie_genres) conn.commit() -def insert_production_company(company: Company): +def insert_production_company(companies: List[Company]): sql = """INSERT INTO productioncompanies (id, name, logo_path, origin_country) VALUES (%s, %s, %s, %s) ON CONFLICT (id) DO NOTHING""" - production_company_data = (company.id, company.name, company.logo_path, company.origin_country) + production_company_data = [(company.id, company.name, company.logo_path, company.origin_country) for company in + companies] - cursor.execute(sql, production_company_data) + cursor.executemany(sql, production_company_data) conn.commit() -def insert_movie_production_companies(movie_id: int, production_company_id: int): +def insert_movie_production_companies(companies: List[MovieProductionCompany]): sql = """INSERT INTO movieproductioncompanies (movie_id, production_company_id) VALUES (%s, %s) ON CONFLICT (movie_id, production_company_id) DO NOTHING""" - movie_production_company_data = (movie_id, production_company_id) - - cursor.execute(sql, movie_production_company_data) + cursor.executemany(sql, companies) conn.commit() -def insert_country(region: Country): +def insert_country(region: List[Country]): sql = """INSERT INTO productioncountries (iso_3166_1, name) VALUES (%s, %s) ON CONFLICT (iso_3166_1) DO NOTHING""" - region_data = (region.iso_3166_1, region.name) + region_data = [(region.iso_3166_1, region.name) for region in region] - cursor.execute(sql, region_data) + cursor.executemany(sql, region_data) conn.commit() -def insert_movie_production_countries(movie_id: int, country_code: str): +def insert_movie_production_countries(countries: List[MovieProductionCountry]): # country code follows ISO 3166-1 sql = """INSERT INTO movieproductioncountries (movie_id, iso_3166_1) VALUES (%s, %s) ON CONFLICT (movie_id, iso_3166_1) DO NOTHING""" - movie_production_country_data = (movie_id, country_code) - - cursor.execute(sql, movie_production_country_data) + cursor.executemany(sql, countries) conn.commit() -def insert_spoken_language(language: Language): +def insert_spoken_language(languages: List[Language]): sql = """INSERT INTO spokenlanguages (iso_639_1, name, english_name) VALUES (%s, %s, %s) ON CONFLICT (iso_639_1) DO NOTHING""" - language_data = (language.iso_639_1, language.name, language.english_name) + language_data = [(language.iso_639_1, language.name, language.english_name) for language in languages] - cursor.execute(sql, language_data) + cursor.executemany(sql, language_data) conn.commit() -def insert_movie_spoken_languages(movie_id: int, language_code: str): +def insert_movie_spoken_languages(languages: List[MovieSpokenLanguage]): # language code follows ISO 639-1 sql = """INSERT INTO moviespokenlanguages (movie_id, iso_639_1) VALUES (%s, %s) ON CONFLICT (movie_id, iso_639_1) DO NOTHING""" - movie_spoken_language_data = (movie_id, language_code) + movie_spoken_language_data = [(language.movie_id, language.iso_639_1) for language in languages] - cursor.execute(sql, movie_spoken_language_data) + cursor.executemany(sql, movie_spoken_language_data) conn.commit() From 6196f94be9084adfbcd498474726988249499f79 Mon Sep 17 00:00:00 2001 From: mfloto <60036186+mfloto@users.noreply.github.com> Date: Mon, 4 Dec 2023 15:00:03 +0100 Subject: [PATCH 3/4] feat: Use new bulk insert --- script.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/script.py b/script.py index 55bfd21..8c2ca6d 100644 --- a/script.py +++ b/script.py @@ -43,14 +43,7 @@ start_time = time.time() print("Inserting movies into database") -for i, movie in enumerate(all_movies): - try: - insert.insert_movie(movie) - except Exception as e: - print(f"Could not insert movie with id {movie.id}") - print(e) - if i % 25 == 0: - print(f"{i} of {len(all_movies)} inserted into database") +insert.insert_movie(all_movies) end_time = time.time() print(f"Inserting movies took {end_time - start_time} seconds to complete") From 1688e9e30ca40d34798fb5e02f8d4ea75a549622 Mon Sep 17 00:00:00 2001 From: mfloto <60036186+mfloto@users.noreply.github.com> Date: Sat, 9 Dec 2023 21:00:17 +0100 Subject: [PATCH 4/4] feat: Add bulk insert for person and person popularity --- insert.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/insert.py b/insert.py index 0c6b313..96ac042 100644 --- a/insert.py +++ b/insert.py @@ -40,6 +40,11 @@ class MovieSpokenLanguage(NamedTuple): conn = psycopg2.connect(url) +class PersonPopularity(NamedTuple): + person_id: int + popularity: float + + cursor = conn.cursor() @@ -194,28 +199,29 @@ def insert_movie_spoken_languages(languages: List[MovieSpokenLanguage]): conn.commit() -def insert_person(person: Person): +def insert_person(persons: List[Person]): sql = """INSERT INTO people (id, name, gender, known_for_department, profile_path, adult) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO NOTHING""" - person_data = ( - person.id, person.name, person.gender, person.known_for_department, person.profile_path, person.adult) + person_data = [( + person.id, person.name, person.gender, person.known_for_department, person.profile_path, person.adult) for + person in persons] cursor.execute(sql, person_data) conn.commit() -def insert_person_popularity(person_id: int, popularity: float): +def insert_person_popularity(popularity: List[PersonPopularity]): sql = """INSERT INTO people_popularity (person_id, popularity, date) VALUES (%s, %s, %s) ON CONFLICT (person_id, date) DO NOTHING""" - person_popularity_data = (person_id, popularity, datetime.date.today()) + popularity_data = [(person.person_id, person.popularity, datetime.date.today()) for person in popularity] - cursor.execute(sql, person_popularity_data) + cursor.execute(sql, popularity_data) conn.commit()