Merge pull request #19 from OpenMined/madhava/fixes
Madhava/fixes
Showing 35 changed files with 1,545 additions and 1,763 deletions.
@@ -0,0 +1,8 @@
.git
data
default_apps
dist
docker
notebooks
projects
tests
@@ -1 +1,2 @@
 recursive-include syftbox *.html *.js *.css *.zip
+recursive-include default_apps *.py *.sh *.html *.js *.css *.zip *.png *.txt *.csv
@@ -1,18 +1,28 @@
 import json
 import os
 
-input_file_path = "../../[email protected]/app_pipelines/adder/inputs/data.json"
-output_file_path = "../../[email protected]/app_pipelines/adder/done/data.json"
+from syftbox.lib import ClientConfig
+
+config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
+client_config = ClientConfig.load(config_path)
+
+input_folder = f"{client_config.sync_folder}/app_pipelines/adder/inputs/"
+output_folder = f"{client_config.sync_folder}/app_pipelines/adder/done/"
+os.makedirs(input_folder, exist_ok=True)
+os.makedirs(output_folder, exist_ok=True)
+
+input_file_path = f"{input_folder}/data.json"
+output_file_path = f"{output_folder}/data.json"
 
 if os.path.exists(input_file_path):
-    with open(input_file_path, 'r') as f:
+    with open(input_file_path, "r") as f:
         data = json.load(f)
 
-    data['datum'] += 1
+    data["datum"] += 1
 
-    with open(output_file_path, 'w') as f:
+    with open(output_file_path, "w") as f:
         json.dump(data, f)
 
     os.remove(input_file_path)
 else:
-    print(f"Input file {input_file_path} does not exist.")
+    print(f"Input file {input_file_path} does not exist.")
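As a rough illustration of how this change is exercised, the sketch below (not part of the PR) writes a request file into the inputs folder that the updated main.py now derives from `ClientConfig`. It assumes `SYFTBOX_CLIENT_CONFIG_PATH` points at a valid client config and reuses only the calls already shown in the diff above.

```python
import json
import os

from syftbox.lib import ClientConfig

# Illustrative producer script: drops a request for the adder app to pick up.
# Assumes SYFTBOX_CLIENT_CONFIG_PATH is set, as in the updated main.py above.
config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
client_config = ClientConfig.load(config_path)

input_folder = f"{client_config.sync_folder}/app_pipelines/adder/inputs"
os.makedirs(input_folder, exist_ok=True)

# The adder increments "datum" and moves the result to the done/ folder.
with open(f"{input_folder}/data.json", "w") as f:
    json.dump({"datum": 41}, f)
```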
@@ -0,0 +1,4 @@
temp/*
output/*
inputs/*
cache/*
@@ -0,0 +1,57 @@
# Netflix App

## Download your Netflix data

Go here and request your Netflix data for download:
https://www.netflix.com/account/getmyinfo

## Get a TMDB API key

Sign up here:
https://www.themoviedb.org/signup

Create an API key here:
https://www.themoviedb.org/settings/api

## Setup

Put the following files in the `inputs` folder:

- NetflixViewingHistory.csv (downloaded from Netflix)
- TMDB_API_KEY.txt (put the key in this text file)
- missing_imdb_id.json (optional: add JSON here to fix titles missing from TMDB)

## Create your Netflix Page

```
./run.sh
```

Force it to run again:

```
./run.sh --force
```

## Debugging

Check the temp folder for the intermediate files that are generated.
You can view these dataframes in Pandas to see what's going on.
main.py runs each step one after the other, so you can look at the code where your
issue is happening.

## Missing IMDB file

The missing IMDB file lets you manually tell the system the IMDB ID for a
particular title.

The format is:

```json
{
  "Life: Primates": "tt1533395"
}
```

Each item can be a partial or exact match, but don't make the key too short, or it
will match other titles (matching uses a string-in-string comparison).
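To make the string-in-string warning at the end of the README concrete, here is a hypothetical sketch of how such a lookup could behave. It is illustrative only and not the app's actual matching code.

```python
# Hypothetical illustration of the partial-title matching described above.
# The app's real lookup may differ; this only shows why an overly short key
# (e.g. just "Life") would also match unrelated titles.
missing_imdb_ids = {"Life: Primates": "tt1533395"}


def lookup_imdb_id(netflix_title):
    for key, imdb_id in missing_imdb_ids.items():
        if key in netflix_title:  # plain string-in-string comparison
            return imdb_id
    return None


print(lookup_imdb_id("Life: Primates: Episode 1"))  # -> tt1533395
```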
default_apps/netflix/data/NetflixViewingHistory_TMDB_IMDB.mock.csv (6 additions, 0 deletions)
@@ -0,0 +1,6 @@
netflix_title,netflix_date,tmdb_id,tmdb_title,tmdb_media_type,tmdb_poster_url,homepage,imdb_id,facebook_id,instagram_id,twitter_id,genre_ids,genre_names,imdb_runtime_minutes,imdb_rating
Psych: Season 1: Pilot: Part 1,2024-08-21,1447,Psych,tv,https://image.tmdb.org/t/p/w500/fDI15gTVbtW5Sbv5QenqecRxWKJ.jpg,http://www.usanetwork.com/series/psych,tt0491738,PsychPeacock,PsychPeacock,PsychPeacock,"[35, 18, 9648, 80]","['Comedy', 'Drama', 'Mystery', 'Crime']",44,8.4
Monk: Season 1: Mr. Monk and the Candidate: Part 1,2024-08-12,1695,Monk,tv,https://image.tmdb.org/t/p/w500/3axGMbUecXXOPSeG47v2i9wK5y5.jpg,http://www.usanetwork.com/series/monk,tt0312172,,,,"[35, 80, 18, 9648]","['Comedy', 'Crime', 'Drama', 'Mystery']",44,8.1
3 Body Problem: Season 1: Countdown,2024-03-26,108545,3 Body Problem,tv,https://image.tmdb.org/t/p/w500/ykZ7hlShkdRQaL2aiieXdEMmrLb.jpg,https://www.netflix.com/title/81024821,tt13016388,,3bodyproblem,3body,"[10765, 9648, 18]","['Sci-Fi & Fantasy', 'Mystery', 'Drama']",60,7.5
Fool Me Once: Limited Series: Episode 1,2024-01-29,220801,Fool Me Once,tv,https://image.tmdb.org/t/p/w500/Ertv4WLEyHgi8zN4ldOKgPcGAZ.jpg,https://www.netflix.com/title/81588093,tt5611024,,,,"[18, 80, 9648]","['Drama', 'Crime', 'Mystery']",50,6.8
Exploding Kittens: Pilot,2024-07-19,219532,Exploding Kittens,tv,https://image.tmdb.org/t/p/w500/4WctqRtusYpTLHNkuVjQe4R51DZ.jpg,https://www.netflix.com/title/81459282,tt19734104,,,,"[16, 35]","['Animation', 'Comedy']",25,6.8
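For a quick look at the mock schema committed above (for example, before comparing it against a real export as dataset.py does next), a snippet like this works when run from the repository root:

```python
import pandas as pd

# Load the mock viewing-history file added in this PR and inspect its schema.
mock_df = pd.read_csv(
    "default_apps/netflix/data/NetflixViewingHistory_TMDB_IMDB.mock.csv"
)
print(sorted(mock_df.columns))
print(mock_df[["netflix_title", "imdb_id", "imdb_rating"]].head())
```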
@@ -0,0 +1,45 @@
import os

import pandas as pd

from syftbox.lib import ClientConfig, SyftVault, TabularDataset


def run():
    try:
        imdb_df = pd.read_csv("./temp/3_imdb.csv")

        dataset_filename = "NetflixViewingHistory_TMDB_IMDB.csv"
        imdb_mock_df = pd.read_csv("./data/NetflixViewingHistory_TMDB_IMDB.mock.csv")

        if set(imdb_df.columns) != set(imdb_mock_df.columns):
            raise Exception("Netflix real vs mock schema are different")

        config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
        client_config = ClientConfig.load(config_path)
        manifest = client_config.manifest

        # create public datasets folder
        datasets_path = manifest.create_public_folder("datasets")

        dataset_path = datasets_path / "netflix_tmdb_imdb"
        csv_file = dataset_path / dataset_filename
        os.makedirs(dataset_path, exist_ok=True)

        # write mock data
        imdb_mock_df.to_csv(csv_file)

        dataset = TabularDataset.from_csv(
            csv_file, name="Netflix_TMDB_IMDB", has_private=True
        )
        dataset.publish(manifest, overwrite=True)

        # write private file
        private_path = os.path.abspath(f"./output/{dataset_filename}")
        imdb_df.to_csv(private_path)
        print(f"> Writing private {dataset_filename} to {private_path}")

        SyftVault.link_private(csv_file, private_path)

    except Exception as e:
        print("Failed to make dataset with dataset.py", e)
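Note that the schema guard in dataset.py compares column sets only, not column order or dtypes. A standalone check in the same spirit, using toy frames rather than the app's data, looks like this:

```python
import pandas as pd

# Toy frames standing in for the real and mock viewing histories.
real_df = pd.DataFrame({"netflix_title": ["Psych"], "imdb_id": ["tt0491738"]})
mock_df = pd.DataFrame({"imdb_id": ["tt0000000"], "netflix_title": ["Mock Show"]})

# Same check as dataset.py: column order may differ, only the set must match.
if set(real_df.columns) != set(mock_df.columns):
    raise Exception("Netflix real vs mock schema are different")
print("schemas match")
```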
@@ -0,0 +1,83 @@
import os
import warnings

import pandas as pd
from utils import download_file

# Suppress only DtypeWarning
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)


download_urls = [
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
]


def run():
    try:
        temp_folder = "./temp/"
        output_file = "3_imdb.csv"

        imdb_df = pd.read_csv("./temp/2_tmdb.csv")

        for download_url in download_urls:
            filename = os.path.basename(download_url)
            file_path = f"{temp_folder}/{filename}"
            if not os.path.exists(file_path):
                print(f"> Downloading {download_url} to {file_path}")
                download_file(download_url, temp_folder)
            else:
                # print(f"> File {file_path} already downloaded")
                pass

        titles = pd.read_csv(
            temp_folder + "/title.basics.tsv.gz",
            sep="\t",
            compression="gzip",
        )

        title_ratings = pd.read_csv(
            temp_folder + "/title.ratings.tsv.gz",
            sep="\t",
            compression="gzip",
        )

        titles_merged = titles.merge(title_ratings, on="tconst", how="right")
        titles_cleaned = titles_merged.dropna()
        titles_cleaned = titles_cleaned[titles_cleaned["isAdult"] == 0]

        titles_cleaned["startYear"] = titles_cleaned["startYear"].replace("\\N", None)
        titles_cleaned["runtimeMinutes"] = titles_cleaned["runtimeMinutes"].replace(
            "\\N", None
        )

        df_merged = imdb_df.merge(
            titles_cleaned[["tconst", "runtimeMinutes", "averageRating"]],
            how="left",
            left_on="imdb_id",
            right_on="tconst",
        )

        df_merged = df_merged.rename(
            columns={
                "runtimeMinutes": "imdb_runtime_minutes",
                "averageRating": "imdb_rating",
            }
        )

        df_merged = df_merged.drop(columns=["tconst"])

        path = os.path.abspath(temp_folder + "/" + output_file)
        print(f"Writing {output_file} to {temp_folder}")
        df_merged.to_csv(path, index=False)

    except Exception as e:
        import traceback

        print(traceback.print_exc())
        print("Failed to run imdb.py", e)


if __name__ == "__main__":
    run()
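imdb.py imports `download_file` from a local `utils` module that is not part of this diff. A minimal helper compatible with how it is called here, `download_file(url, folder)`, might look like the sketch below; the use of `requests` and the exact behavior are assumptions, not the repository's actual implementation.

```python
import os

import requests


def download_file(url, folder):
    """Hypothetical stand-in for the utils.download_file used by imdb.py.

    Streams `url` into `folder`, keeping the remote file name, and returns
    the local path. The real helper in the repo may differ.
    """
    os.makedirs(folder, exist_ok=True)
    local_path = os.path.join(folder, os.path.basename(url))
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    return local_path
```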