Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bulk insert #6

Merged
merged 4 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 92 additions & 58 deletions insert.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,98 @@
import datetime
from typing import List, NamedTuple, Tuple

import psycopg2
from themoviedb.schemas import Movie, Person, Genre, Company
from themoviedb.schemas.countries import Country
from themoviedb.schemas.languages import Language

# Connection string handed to psycopg2.connect() for the movie database.
# NOTE(review): credentials are hard-coded, and the "[email protected]" portion
# looks like an e-mail-obfuscation artifact in this copy rather than a real
# "password@host" pair -- confirm the real DSN (ideally loaded from the
# environment) before use.
# The f-prefix was dropped because the literal contains no placeholders.
url = "postgresql://postgres:[email protected]:5333/movie_db"

class MoviePopularity(NamedTuple):
    """Popularity figures for one movie, as consumed by insert_movie_popularity()."""

    # Identifier of the movie the figures belong to.
    movie_id: int
    # Popularity value stored in the movies_popularity table.
    popularity: float
    # Vote average stored alongside the popularity value.
    vote_average: float


class MovieChange(NamedTuple):
    """A (movie_id, datapoint, count) record.

    NOTE(review): the field names match the parameters of insert_movie_change();
    presumably this is its bulk row type -- confirm against that function.
    """

    movie_id: int
    datapoint: str
    count: int


class MovieGenre(NamedTuple):
    """Link row between a movie and a genre; passed as-is to the moviegenres INSERT."""

    movie_id: int
    genre_id: int


class MovieProductionCompany(NamedTuple):
    """Link row between a movie and a production company (movieproductioncompanies table)."""

    movie_id: int
    production_company_id: int


class MovieProductionCountry(NamedTuple):
    """Link row between a movie and a production country (movieproductioncountries table)."""

    movie_id: int
    # Country code; the insert helper notes it follows ISO 3166-1.
    iso_3166_1: str


class MovieSpokenLanguage(NamedTuple):
    """Link row between a movie and a spoken language (moviespokenlanguages table)."""

    movie_id: int
    # Language code; the insert helper notes it follows ISO 639-1.
    iso_639_1: str


# Open the database connection at import time from the module-level `url`.
conn = psycopg2.connect(url)
class PersonPopularity(NamedTuple):
    """Popularity value for one person, as consumed by insert_person_popularity()."""

    person_id: int
    popularity: float


# Single shared cursor; the insert_* helpers in this module execute through it.
cursor = conn.cursor()


def insert_movie(movies: List[Movie]) -> None:
    """Bulk-insert movies together with their dependent rows.

    The movie rows are written first (existing ids are skipped via
    ``ON CONFLICT (id) DO NOTHING``) and committed. The popularity, genre,
    production-company, production-country and spoken-language rows are then
    handed to their dedicated insert helpers, each of which commits on its own.

    Args:
        movies: The movies to store. Any iterable is accepted; it is
            materialised once so one-shot iterables work as well.
    """
    # The data is needed for nine separate inserts; materialise it so a
    # generator argument is not exhausted after the first one.
    movies = list(movies)
    if not movies:
        return

    sql = """INSERT INTO movies
(id, title, original_title, imdb_id, overview, tagline, release_date, runtime, budget,
revenue, adult, video, backdrop_path, poster_path, homepage, status, original_language)
VALUES
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO NOTHING"""

    movie_rows = []
    popularity_rows = []
    genres, movie_genres = [], []
    companies, movie_companies = [], []
    countries, movie_countries = [], []
    languages, movie_languages = [], []

    # Single pass over the movies instead of one comprehension per table.
    for movie in movies:
        movie_rows.append((
            movie.id, movie.title, movie.original_title, movie.imdb_id, movie.overview, movie.tagline,
            movie.release_date, movie.runtime, movie.budget, movie.revenue, movie.adult, movie.video,
            movie.backdrop_path, movie.poster_path, movie.homepage, movie.status, movie.original_language,
        ))
        popularity_rows.append(MoviePopularity(movie.id, movie.popularity, movie.vote_average))

        # `or ()` tolerates a sub-collection that is None instead of a list.
        for genre in movie.genres or ():
            genres.append(genre)
            movie_genres.append(MovieGenre(movie.id, genre.id))

        for company in movie.production_companies or ():
            companies.append(company)
            movie_companies.append(MovieProductionCompany(movie.id, company.id))

        for country in movie.production_countries or ():
            countries.append(country)
            movie_countries.append(MovieProductionCountry(movie.id, country.iso_3166_1))

        for language in movie.spoken_languages or ():
            languages.append(language)
            movie_languages.append(MovieSpokenLanguage(movie.id, language.iso_639_1))

    cursor.executemany(sql, movie_rows)
    conn.commit()

    # Each lookup table is filled before the link table that references it.
    insert_movie_popularity(popularity_rows)
    insert_genre(genres)
    insert_movie_genres(movie_genres)

    insert_production_company(companies)
    insert_movie_production_companies(movie_companies)

    insert_country(countries)
    insert_movie_production_countries(movie_countries)

    insert_spoken_language(languages)
    insert_movie_spoken_languages(movie_languages)


def insert_movie_popularity(popularity_data: List[MoviePopularity]) -> None:
    """Bulk-insert today's popularity figures for a batch of movies.

    Rows whose (movie_id, date) pair already exists are skipped by the
    ``ON CONFLICT ... DO NOTHING`` clause. The transaction is committed
    before returning.

    Args:
        popularity_data: One MoviePopularity entry per movie.
    """
    sql = """INSERT INTO movies_popularity
(movie_id, popularity, vote_average, date)
VALUES
(%s, %s, %s, %s) ON CONFLICT (movie_id, date) DO NOTHING"""

    # Resolve the date once so every row of the batch carries the same value,
    # even if the batch is built across midnight.
    today = datetime.date.today()
    rows = [(entry.movie_id, entry.popularity, entry.vote_average, today) for entry in popularity_data]

    cursor.executemany(sql, rows)
    conn.commit()


Expand All @@ -68,126 +108,120 @@ def insert_movie_change(movie_id: int, datapoint: str, count: int):
conn.commit()


def insert_genre(genres: List[Genre]) -> None:
    """Bulk-insert genres, skipping ids that are already present.

    Duplicates inside ``genres`` are harmless because of
    ``ON CONFLICT (id) DO NOTHING``. Commits before returning.

    Args:
        genres: Genre objects exposing ``id`` and ``name``.
    """
    sql = """INSERT INTO genres
(id, name)
VALUES
(%s, %s) ON CONFLICT (id) DO NOTHING"""

    rows = [(genre.id, genre.name) for genre in genres]

    cursor.executemany(sql, rows)
    conn.commit()


def insert_movie_genres(movie_genres: List[MovieGenre]) -> None:
    """Bulk-insert movie/genre link rows, skipping pairs that already exist.

    Commits before returning.

    Args:
        movie_genres: (movie_id, genre_id) tuples; MovieGenre instances are
            passed to the driver unchanged because they are plain tuples.
    """
    sql = """INSERT INTO moviegenres
(movie_id, genre_id)
VALUES
(%s, %s) ON CONFLICT (movie_id, genre_id) DO NOTHING"""

    cursor.executemany(sql, movie_genres)
    conn.commit()


def insert_production_company(companies: List[Company]) -> None:
    """Bulk-insert production companies, skipping ids that are already present.

    Commits before returning.

    Args:
        companies: Company objects exposing ``id``, ``name``, ``logo_path``
            and ``origin_country``.
    """
    sql = """INSERT INTO productioncompanies
(id, name, logo_path, origin_country)
VALUES
(%s, %s, %s, %s) ON CONFLICT (id) DO NOTHING"""

    rows = [
        (company.id, company.name, company.logo_path, company.origin_country)
        for company in companies
    ]

    cursor.executemany(sql, rows)
    conn.commit()


def insert_movie_production_companies(companies: List[MovieProductionCompany]) -> None:
    """Bulk-insert movie/production-company link rows, skipping existing pairs.

    Commits before returning.

    Args:
        companies: (movie_id, production_company_id) tuples, passed to the
            driver unchanged.
    """
    sql = """INSERT INTO movieproductioncompanies
(movie_id, production_company_id)
VALUES
(%s, %s) ON CONFLICT (movie_id, production_company_id) DO NOTHING"""

    cursor.executemany(sql, companies)
    conn.commit()


def insert_country(region: List[Country]) -> None:
    """Bulk-insert production countries, skipping codes that are already present.

    Commits before returning.

    Args:
        region: Country objects exposing ``iso_3166_1`` and ``name``. The
            parameter keeps its singular name for keyword-call compatibility
            even though it holds a list.
    """
    sql = """INSERT INTO productioncountries
(iso_3166_1, name)
VALUES
(%s, %s) ON CONFLICT (iso_3166_1) DO NOTHING"""

    # Use a distinct loop variable so the parameter is not shadowed.
    rows = [(country.iso_3166_1, country.name) for country in region]

    cursor.executemany(sql, rows)
    conn.commit()


def insert_movie_production_countries(countries: List[MovieProductionCountry]) -> None:
    """Bulk-insert movie/production-country link rows, skipping existing pairs.

    Commits before returning.

    Args:
        countries: (movie_id, iso_3166_1) tuples, passed to the driver
            unchanged. The country code follows ISO 3166-1.
    """
    sql = """INSERT INTO movieproductioncountries
(movie_id, iso_3166_1)
VALUES
(%s, %s) ON CONFLICT (movie_id, iso_3166_1) DO NOTHING"""

    cursor.executemany(sql, countries)
    conn.commit()


def insert_spoken_language(languages: List[Language]) -> None:
    """Bulk-insert spoken languages, skipping codes that are already present.

    Commits before returning.

    Args:
        languages: Language objects exposing ``iso_639_1``, ``name`` and
            ``english_name``.
    """
    sql = """INSERT INTO spokenlanguages
(iso_639_1, name, english_name)
VALUES
(%s, %s, %s) ON CONFLICT (iso_639_1) DO NOTHING"""

    rows = [(language.iso_639_1, language.name, language.english_name) for language in languages]

    cursor.executemany(sql, rows)
    conn.commit()


def insert_movie_spoken_languages(languages: List[MovieSpokenLanguage]) -> None:
    """Bulk-insert movie/spoken-language link rows, skipping existing pairs.

    Commits before returning.

    Args:
        languages: Entries exposing ``movie_id`` and ``iso_639_1``. The
            language code follows ISO 639-1.
    """
    sql = """INSERT INTO moviespokenlanguages
(movie_id, iso_639_1)
VALUES
(%s, %s) ON CONFLICT (movie_id, iso_639_1) DO NOTHING"""

    rows = [(entry.movie_id, entry.iso_639_1) for entry in languages]

    cursor.executemany(sql, rows)
    conn.commit()


def insert_person(persons: List[Person]) -> None:
    """Bulk-insert people, skipping ids that are already present.

    Commits before returning.

    Args:
        persons: Person objects exposing ``id``, ``name``, ``gender``,
            ``known_for_department``, ``profile_path`` and ``adult``.
    """
    sql = """INSERT INTO people
(id, name, gender, known_for_department, profile_path, adult)
VALUES
(%s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO NOTHING"""

    rows = [
        (person.id, person.name, person.gender, person.known_for_department, person.profile_path, person.adult)
        for person in persons
    ]

    # A list of rows must go through executemany(); execute() would try to
    # bind the whole list as the parameters of a single statement.
    cursor.executemany(sql, rows)
    conn.commit()


def insert_person_popularity(popularity: List[PersonPopularity]) -> None:
    """Bulk-insert today's popularity value for a batch of people.

    Rows whose (person_id, date) pair already exists are skipped by the
    ``ON CONFLICT ... DO NOTHING`` clause. Commits before returning.

    Args:
        popularity: One PersonPopularity entry per person.
    """
    sql = """INSERT INTO people_popularity
(person_id, popularity, date)
VALUES
(%s, %s, %s) ON CONFLICT (person_id, date) DO NOTHING"""

    # Resolve the date once so the whole batch shares a single value.
    today = datetime.date.today()
    rows = [(entry.person_id, entry.popularity, today) for entry in popularity]

    # A list of rows must go through executemany(); execute() would try to
    # bind the whole list as the parameters of a single statement.
    cursor.executemany(sql, rows)
    conn.commit()


Expand Down
9 changes: 1 addition & 8 deletions script.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,7 @@

# Time the bulk insert of all fetched movies.
start_time = time.time()
print("Inserting movies into database")
# insert_movie() takes the whole list and issues batched statements; the
# former per-movie loop would hand it a single Movie, which it cannot iterate.
insert.insert_movie(all_movies)

end_time = time.time()
print(f"Inserting movies took {end_time - start_time} seconds to complete")
Expand Down
Loading