Merge pull request #19 from OpenMined/madhava/fixes
Madhava/fixes
Showing 35 changed files with 1,545 additions and 1,763 deletions.
@@ -0,0 +1,8 @@
.git
data
default_apps
dist
docker
notebooks
projects
tests
@@ -1 +1,2 @@
 recursive-include syftbox *.html *.js *.css *.zip
+recursive-include default_apps *.py *.sh *.html *.js *.css *.zip *.png *.txt *.csv
@@ -1,18 +1,28 @@
 import json
 import os
 
-input_file_path = "../../[email protected]/app_pipelines/adder/inputs/data.json"
-output_file_path = "../../[email protected]/app_pipelines/adder/done/data.json"
+from syftbox.lib import ClientConfig
+
+config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
+client_config = ClientConfig.load(config_path)
+
+input_folder = f"{client_config.sync_folder}/app_pipelines/adder/inputs/"
+output_folder = f"{client_config.sync_folder}/app_pipelines/adder/done/"
+os.makedirs(input_folder, exist_ok=True)
+os.makedirs(output_folder, exist_ok=True)
+
+input_file_path = f"{input_folder}/data.json"
+output_file_path = f"{output_folder}/data.json"
 
 if os.path.exists(input_file_path):
-    with open(input_file_path, 'r') as f:
+    with open(input_file_path, "r") as f:
         data = json.load(f)
 
-    data['datum'] += 1
+    data["datum"] += 1
 
-    with open(output_file_path, 'w') as f:
+    with open(output_file_path, "w") as f:
         json.dump(data, f)
 
     os.remove(input_file_path)
 else:
-    print(f"Input file {input_file_path} does not exist.")
+    print(f"Input file {input_file_path} does not exist.")
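As a rough illustration of how this change is exercised, the sketch below (not part of the PR) writes a request file into the inputs folder that the updated main.py now derives from `ClientConfig`. It assumes `SYFTBOX_CLIENT_CONFIG_PATH` points at a valid client config and reuses only the calls already shown in the diff above.

```python
import json
import os

from syftbox.lib import ClientConfig

# Illustrative producer script: drops a request for the adder app to pick up.
# Assumes SYFTBOX_CLIENT_CONFIG_PATH is set, as in the updated main.py above.
config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
client_config = ClientConfig.load(config_path)

input_folder = f"{client_config.sync_folder}/app_pipelines/adder/inputs"
os.makedirs(input_folder, exist_ok=True)

# The adder increments "datum" and moves the result to the done/ folder.
with open(f"{input_folder}/data.json", "w") as f:
    json.dump({"datum": 41}, f)
```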
@@ -0,0 +1,4 @@
temp/*
output/*
inputs/*
cache/*
@@ -0,0 +1,57 @@
# Netflix App

## Download your Netflix data

Go here and request your Netflix data for download:
https://www.netflix.com/account/getmyinfo

## Get a TMDB API key

Sign up here:
https://www.themoviedb.org/signup

Create an API key here:
https://www.themoviedb.org/settings/api

## Setup

Put the following files in the `inputs` folder:

- NetflixViewingHistory.csv (downloaded from Netflix)
- TMDB_API_KEY.txt (put the key in this text file)
- missing_imdb_id.json (optional: add JSON here to fix titles missing from TMDB)

## Create your Netflix Page

```
./run.sh
```

Force it to run again:

```
./run.sh --force
```

## Debugging

Check the temp folder for the intermediate files that are generated.
You can view these dataframes in Pandas to see what's going on.
main.py runs each step one after the other, so you can look at the code where your
issue is happening.

## Missing IMDB file

The missing IMDB file lets you manually tell the system the IMDB ID for a
particular title.

The format is:

```json
{
  "Life: Primates": "tt1533395"
}
```

Each item can be a partial or exact match, but don't make the key too short, or it
will match other titles (matching uses a string-in-string comparison).
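To make the string-in-string warning at the end of the README concrete, here is a hypothetical sketch of how such a lookup could behave. It is illustrative only and not the app's actual matching code.

```python
# Hypothetical illustration of the partial-title matching described above.
# The app's real lookup may differ; this only shows why an overly short key
# (e.g. just "Life") would also match unrelated titles.
missing_imdb_ids = {"Life: Primates": "tt1533395"}


def lookup_imdb_id(netflix_title):
    for key, imdb_id in missing_imdb_ids.items():
        if key in netflix_title:  # plain string-in-string comparison
            return imdb_id
    return None


print(lookup_imdb_id("Life: Primates: Episode 1"))  # -> tt1533395
```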
default_apps/netflix/data/NetflixViewingHistory_TMDB_IMDB.mock.csv (6 additions, 0 deletions)
@@ -0,0 +1,6 @@
netflix_title,netflix_date,tmdb_id,tmdb_title,tmdb_media_type,tmdb_poster_url,homepage,imdb_id,facebook_id,instagram_id,twitter_id,genre_ids,genre_names,imdb_runtime_minutes,imdb_rating
Psych: Season 1: Pilot: Part 1,2024-08-21,1447,Psych,tv,https://image.tmdb.org/t/p/w500/fDI15gTVbtW5Sbv5QenqecRxWKJ.jpg,http://www.usanetwork.com/series/psych,tt0491738,PsychPeacock,PsychPeacock,PsychPeacock,"[35, 18, 9648, 80]","['Comedy', 'Drama', 'Mystery', 'Crime']",44,8.4
Monk: Season 1: Mr. Monk and the Candidate: Part 1,2024-08-12,1695,Monk,tv,https://image.tmdb.org/t/p/w500/3axGMbUecXXOPSeG47v2i9wK5y5.jpg,http://www.usanetwork.com/series/monk,tt0312172,,,,"[35, 80, 18, 9648]","['Comedy', 'Crime', 'Drama', 'Mystery']",44,8.1
3 Body Problem: Season 1: Countdown,2024-03-26,108545,3 Body Problem,tv,https://image.tmdb.org/t/p/w500/ykZ7hlShkdRQaL2aiieXdEMmrLb.jpg,https://www.netflix.com/title/81024821,tt13016388,,3bodyproblem,3body,"[10765, 9648, 18]","['Sci-Fi & Fantasy', 'Mystery', 'Drama']",60,7.5
Fool Me Once: Limited Series: Episode 1,2024-01-29,220801,Fool Me Once,tv,https://image.tmdb.org/t/p/w500/Ertv4WLEyHgi8zN4ldOKgPcGAZ.jpg,https://www.netflix.com/title/81588093,tt5611024,,,,"[18, 80, 9648]","['Drama', 'Crime', 'Mystery']",50,6.8
Exploding Kittens: Pilot,2024-07-19,219532,Exploding Kittens,tv,https://image.tmdb.org/t/p/w500/4WctqRtusYpTLHNkuVjQe4R51DZ.jpg,https://www.netflix.com/title/81459282,tt19734104,,,,"[16, 35]","['Animation', 'Comedy']",25,6.8
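For a quick look at the mock schema committed above (for example, before comparing it against a real export as dataset.py does next), a snippet like this works when run from the repository root:

```python
import pandas as pd

# Load the mock viewing-history file added in this PR and inspect its schema.
mock_df = pd.read_csv(
    "default_apps/netflix/data/NetflixViewingHistory_TMDB_IMDB.mock.csv"
)
print(sorted(mock_df.columns))
print(mock_df[["netflix_title", "imdb_id", "imdb_rating"]].head())
```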
@@ -0,0 +1,45 @@
import os

import pandas as pd

from syftbox.lib import ClientConfig, SyftVault, TabularDataset


def run():
    try:
        imdb_df = pd.read_csv("./temp/3_imdb.csv")

        dataset_filename = "NetflixViewingHistory_TMDB_IMDB.csv"
        imdb_mock_df = pd.read_csv("./data/NetflixViewingHistory_TMDB_IMDB.mock.csv")

        if set(imdb_df.columns) != set(imdb_mock_df.columns):
            raise Exception("Netflix real vs mock schema are different")

        config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
        client_config = ClientConfig.load(config_path)
        manifest = client_config.manifest

        # create public datasets folder
        datasets_path = manifest.create_public_folder("datasets")

        dataset_path = datasets_path / "netflix_tmdb_imdb"
        csv_file = dataset_path / dataset_filename
        os.makedirs(dataset_path, exist_ok=True)

        # write mock data
        imdb_mock_df.to_csv(csv_file)

        dataset = TabularDataset.from_csv(
            csv_file, name="Netflix_TMDB_IMDB", has_private=True
        )
        dataset.publish(manifest, overwrite=True)

        # write private file
        private_path = os.path.abspath(f"./output/{dataset_filename}")
        imdb_df.to_csv(private_path)
        print(f"> Writing private {dataset_filename} to {private_path}")

        SyftVault.link_private(csv_file, private_path)

    except Exception as e:
        print("Failed to make dataset with dataset.py", e)
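Note that the schema guard in dataset.py compares column sets only, not column order or dtypes. A standalone check in the same spirit, using toy frames rather than the app's data, looks like this:

```python
import pandas as pd

# Toy frames standing in for the real and mock viewing histories.
real_df = pd.DataFrame({"netflix_title": ["Psych"], "imdb_id": ["tt0491738"]})
mock_df = pd.DataFrame({"imdb_id": ["tt0000000"], "netflix_title": ["Mock Show"]})

# Same check as dataset.py: column order may differ, only the set must match.
if set(real_df.columns) != set(mock_df.columns):
    raise Exception("Netflix real vs mock schema are different")
print("schemas match")
```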
@@ -0,0 +1,83 @@
import os
import warnings

import pandas as pd
from utils import download_file

# Suppress only DtypeWarning
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)


download_urls = [
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
]


def run():
    try:
        temp_folder = "./temp/"
        output_file = "3_imdb.csv"

        imdb_df = pd.read_csv("./temp/2_tmdb.csv")

        for download_url in download_urls:
            filename = os.path.basename(download_url)
            file_path = f"{temp_folder}/{filename}"
            if not os.path.exists(file_path):
                print(f"> Downloading {download_url} to {file_path}")
                download_file(download_url, temp_folder)
            else:
                # print(f"> File {file_path} already downloaded")
                pass

        titles = pd.read_csv(
            temp_folder + "/title.basics.tsv.gz",
            sep="\t",
            compression="gzip",
        )

        title_ratings = pd.read_csv(
            temp_folder + "/title.ratings.tsv.gz",
            sep="\t",
            compression="gzip",
        )

        titles_merged = titles.merge(title_ratings, on="tconst", how="right")
        titles_cleaned = titles_merged.dropna()
        titles_cleaned = titles_cleaned[titles_cleaned["isAdult"] == 0]

        titles_cleaned["startYear"] = titles_cleaned["startYear"].replace("\\N", None)
        titles_cleaned["runtimeMinutes"] = titles_cleaned["runtimeMinutes"].replace(
            "\\N", None
        )

        df_merged = imdb_df.merge(
            titles_cleaned[["tconst", "runtimeMinutes", "averageRating"]],
            how="left",
            left_on="imdb_id",
            right_on="tconst",
        )

        df_merged = df_merged.rename(
            columns={
                "runtimeMinutes": "imdb_runtime_minutes",
                "averageRating": "imdb_rating",
            }
        )

        df_merged = df_merged.drop(columns=["tconst"])

        path = os.path.abspath(temp_folder + "/" + output_file)
        print(f"Writing {output_file} to {temp_folder}")
        df_merged.to_csv(path, index=False)

    except Exception as e:
        import traceback

        print(traceback.print_exc())
        print("Failed to run imdb.py", e)


if __name__ == "__main__":
    run()
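imdb.py imports `download_file` from a local `utils` module that is not part of this diff. A minimal helper compatible with how it is called here, `download_file(url, folder)`, might look like the sketch below; the use of `requests` and the exact behavior are assumptions, not the repository's actual implementation.

```python
import os

import requests


def download_file(url, folder):
    """Hypothetical stand-in for the utils.download_file used by imdb.py.

    Streams `url` into `folder`, keeping the remote file name, and returns
    the local path. The real helper in the repo may differ.
    """
    os.makedirs(folder, exist_ok=True)
    local_path = os.path.join(folder, os.path.basename(url))
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    return local_path
```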