Skip to content

Commit

Permalink
Merge pull request #19 from OpenMined/madhava/fixes
Browse files Browse the repository at this point in the history
Madhava/fixes
  • Loading branch information
madhavajay authored Sep 30, 2024
2 parents 70b08cd + 8442a67 commit bd6dbed
Show file tree
Hide file tree
Showing 35 changed files with 1,545 additions and 1,763 deletions.
8 changes: 8 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.git
data
default_apps
dist
docker
notebooks
projects
tests
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
recursive-include syftbox *.html *.js *.css *.zip
recursive-include default_apps *.py *.sh *.html *.js *.css *.zip *.png *.txt *.csv
28 changes: 25 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,38 +7,60 @@
|___/
```

# Quickstart User Installation

## install uv

curl -LsSf https://astral.sh/uv/install.sh | sh

## create a virtualenv somewhere

uv venv .venv

## install the wheel

uv pip install http://20.168.10.234:8080/wheel/syftbox-0.1.0-py3-none-any.whl --reinstall

## run the client

uv run syftbox client

# Quickstart Client Developer Installation

### Step 0: Open your terminal to the root of this Github repository
### Step 0: Open your terminal to the root of this Github repository

Begin by opening your terminal and navigating to the root directory of this GitHub repository (so when you run 'ls' it should show folders like "syftbox", "server", "tests", etc.). Then run the commands in steps 1-5:

### Step 1: Install Homebrew

```
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
```

### Step 2: Install uv (using homebrew — which is better for this than pip)

```
brew install uv
```

### Step 3: Install a virtual environment using uv

```
uv venv
```

### Step 4: Install syftbox in editable mode using uv

```
uv pip install -e .
```

### Step 5: Run the client

```
syftbox client
uv run syftbox/client/client.py
```


# Alternative Options

### Run Client
Expand Down
22 changes: 16 additions & 6 deletions default_apps/adder/main.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
import json
import os

input_file_path = "../../[email protected]/app_pipelines/adder/inputs/data.json"
output_file_path = "../../[email protected]/app_pipelines/adder/done/data.json"
from syftbox.lib import ClientConfig

config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
client_config = ClientConfig.load(config_path)

input_folder = f"{client_config.sync_folder}/app_pipelines/adder/inputs/"
output_folder = f"{client_config.sync_folder}/app_pipelines/adder/done/"
os.makedirs(input_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

input_file_path = f"{input_folder}/data.json"
output_file_path = f"{output_folder}/data.json"

if os.path.exists(input_file_path):
with open(input_file_path, 'r') as f:
with open(input_file_path, "r") as f:
data = json.load(f)

data['datum'] += 1
data["datum"] += 1

with open(output_file_path, 'w') as f:
with open(output_file_path, "w") as f:
json.dump(data, f)

os.remove(input_file_path)
else:
print(f"Input file {input_file_path} does not exist.")
print(f"Input file {input_file_path} does not exist.")
7 changes: 0 additions & 7 deletions default_apps/manual_pipeline/manual_pipeline_app.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

@dataclass
class SyftLink(Jsonable):
@classmethod
Expand Down Expand Up @@ -271,8 +270,6 @@ def create_datasite_import_path(datasite: str) -> str:
return import_path




@dataclass
class DatasiteManifest(Jsonable):
datasite: str
Expand All @@ -295,9 +292,6 @@ class Dataset:
sync_path: str





def extract_leftmost_email(text: str) -> str:
# Define a regex pattern to match an email address
email_regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
Expand Down Expand Up @@ -1230,7 +1224,6 @@ class TaskManifest(Jsonable):
write_back_denied_path: str



@dataclass
class PipelineActionRun(PipelineAction):
exit_code: int | None = None
Expand Down
4 changes: 4 additions & 0 deletions default_apps/netflix/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
temp/*
output/*
inputs/*
cache/*
57 changes: 57 additions & 0 deletions default_apps/netflix/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Netflix App

## Download your Netflix data

Go here and request your Netflix data for download:
https://www.netflix.com/account/getmyinfo

## Get a TMDB API key

Signup here:
https://www.themoviedb.org/signup

Create an API key here:
https://www.themoviedb.org/settings/api

## Setup

Put the following files in the `inputs` folder:

- NetflixViewingHistory.csv (downloaded from netflix)
- TMDB_API_KEY.txt (put the key in this text file)
- missing_imdb_id.json (optional: put json in here to fix titles missing from TMDB)

## Create your Netflix Page

```
./run.sh
```

Force it to run again:

```
./run.sh --force
```

## Debugging

Check the temp folder for intermediate files that are generated.
You can view these dataframes in Pandas to see what's going on.
The main.py runs each step one after the other so you can look at the code where your
issue is happening.

## Missing IMDB file

The missing IMDB file is there so you can manually tell the system of an IMDB ID for a
particular title.

The format is:

```json
{
"Life: Primates": "tt1533395"
}
```

Each key can be a partial or an exact match for a title, but don't make it too short:
keys are matched with a string-in-string comparison, so a short key will also match other titles.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
netflix_title,netflix_date,tmdb_id,tmdb_title,tmdb_media_type,tmdb_poster_url,homepage,imdb_id,facebook_id,instagram_id,twitter_id,genre_ids,genre_names,imdb_runtime_minutes,imdb_rating
Psych: Season 1: Pilot: Part 1,2024-08-21,1447,Psych,tv,https://image.tmdb.org/t/p/w500/fDI15gTVbtW5Sbv5QenqecRxWKJ.jpg,http://www.usanetwork.com/series/psych,tt0491738,PsychPeacock,PsychPeacock,PsychPeacock,"[35, 18, 9648, 80]","['Comedy', 'Drama', 'Mystery', 'Crime']",44,8.4
Monk: Season 1: Mr. Monk and the Candidate: Part 1,2024-08-12,1695,Monk,tv,https://image.tmdb.org/t/p/w500/3axGMbUecXXOPSeG47v2i9wK5y5.jpg,http://www.usanetwork.com/series/monk,tt0312172,,,,"[35, 80, 18, 9648]","['Comedy', 'Crime', 'Drama', 'Mystery']",44,8.1
3 Body Problem: Season 1: Countdown,2024-03-26,108545,3 Body Problem,tv,https://image.tmdb.org/t/p/w500/ykZ7hlShkdRQaL2aiieXdEMmrLb.jpg,https://www.netflix.com/title/81024821,tt13016388,,3bodyproblem,3body,"[10765, 9648, 18]","['Sci-Fi & Fantasy', 'Mystery', 'Drama']",60,7.5
Fool Me Once: Limited Series: Episode 1,2024-01-29,220801,Fool Me Once,tv,https://image.tmdb.org/t/p/w500/Ertv4WLEyHgi8zN4ldOKgPcGAZ.jpg,https://www.netflix.com/title/81588093,tt5611024,,,,"[18, 80, 9648]","['Drama', 'Crime', 'Mystery']",50,6.8
Exploding Kittens: Pilot,2024-07-19,219532,Exploding Kittens,tv,https://image.tmdb.org/t/p/w500/4WctqRtusYpTLHNkuVjQe4R51DZ.jpg,https://www.netflix.com/title/81459282,tt19734104,,,,"[16, 35]","['Animation', 'Comedy']",25,6.8
45 changes: 45 additions & 0 deletions default_apps/netflix/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os

import pandas as pd

from syftbox.lib import ClientConfig, SyftVault, TabularDataset


def run():
    """Publish the Netflix/TMDB/IMDB dataset and link its private copy.

    Steps, in order:

    1. Read the real data from ``./temp/3_imdb.csv`` and the mock data from
       ``./data/NetflixViewingHistory_TMDB_IMDB.mock.csv``.
    2. Refuse to continue when the two frames do not have the same set of
       column names.
    3. Load the client config (path taken from the
       ``SYFTBOX_CLIENT_CONFIG_PATH`` environment variable, ``None`` when
       unset) and write the mock CSV into the ``netflix_tmdb_imdb`` folder
       under the public ``datasets`` folder.
    4. Publish a ``TabularDataset`` built from that mock CSV.
    5. Write the real data to ``./output/`` and link it to the public CSV
       through ``SyftVault.link_private``.

    Returns:
        None. Every exception is caught and reported with ``print``; nothing
        is re-raised to the caller.
    """
    try:
        imdb_df = pd.read_csv("./temp/3_imdb.csv")

        dataset_filename = "NetflixViewingHistory_TMDB_IMDB.csv"
        imdb_mock_df = pd.read_csv("./data/NetflixViewingHistory_TMDB_IMDB.mock.csv")

        # Column order is irrelevant here, only the set of names must agree.
        if set(imdb_df.columns) != set(imdb_mock_df.columns):
            raise Exception("Netflix real vs mock schema are different")

        config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
        client_config = ClientConfig.load(config_path)
        manifest = client_config.manifest

        # create public datasets folder
        # NOTE(review): the result is combined with "/" below, so it is
        # presumably a pathlib.Path - confirm against syftbox.lib.
        datasets_path = manifest.create_public_folder("datasets")

        dataset_path = datasets_path / "netflix_tmdb_imdb"
        csv_file = dataset_path / dataset_filename
        os.makedirs(dataset_path, exist_ok=True)

        # write mock data
        # NOTE(review): to_csv() keeps the default index=True here, whereas
        # imdb.py writes its CSV with index=False - confirm which is intended.
        imdb_mock_df.to_csv(csv_file)

        dataset = TabularDataset.from_csv(
            csv_file, name="Netflix_TMDB_IMDB", has_private=True
        )
        dataset.publish(manifest, overwrite=True)

        # write private file
        private_path = os.path.abspath(f"./output/{dataset_filename}")
        # The output folder is not guaranteed to exist on a fresh checkout;
        # create it so the private write cannot fail on a missing directory.
        os.makedirs(os.path.dirname(private_path), exist_ok=True)
        imdb_df.to_csv(private_path)
        print(f"> Writing private {dataset_filename} to {private_path}")

        SyftVault.link_private(csv_file, private_path)

    except Exception as e:
        # Deliberate best-effort: report the failure and keep the caller running.
        print("Failed to make dataset with dataset.py", e)
83 changes: 83 additions & 0 deletions default_apps/netflix/imdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os
import warnings

import pandas as pd
from utils import download_file

# Suppress only DtypeWarning
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)


download_urls = [
"https://datasets.imdbws.com/title.basics.tsv.gz",
"https://datasets.imdbws.com/title.ratings.tsv.gz",
]


def run():
    """Add IMDB runtime and rating columns to the TMDB-matched viewing history.

    Reads ``./temp/2_tmdb.csv``, downloads every file listed in
    ``download_urls`` that is not already present in ``./temp/``, joins the
    IMDB title and rating dumps on ``tconst``, and left-joins the result onto
    the input rows via ``imdb_id``. The output is written to
    ``./temp/3_imdb.csv`` with the added columns ``imdb_runtime_minutes`` and
    ``imdb_rating``.

    Returns:
        None. Every exception is caught; the traceback and a failure message
        are printed and nothing is re-raised to the caller.
    """
    try:
        temp_folder = "./temp/"
        output_file = "3_imdb.csv"

        imdb_df = pd.read_csv("./temp/2_tmdb.csv")

        # Only fetch the IMDB dumps that are not already in the temp folder.
        for download_url in download_urls:
            filename = os.path.basename(download_url)
            file_path = f"{temp_folder}/{filename}"
            if not os.path.exists(file_path):
                print(f"> Downloading {download_url} to {file_path}")
                download_file(download_url, temp_folder)

        titles = pd.read_csv(
            temp_folder + "/title.basics.tsv.gz",
            sep="\t",
            compression="gzip",
        )

        title_ratings = pd.read_csv(
            temp_folder + "/title.ratings.tsv.gz",
            sep="\t",
            compression="gzip",
        )

        # Right join: keep every rated title, with basics attached when present.
        titles_merged = titles.merge(title_ratings, on="tconst", how="right")
        titles_cleaned = titles_merged.dropna()
        # .copy() so the column assignments below act on an independent frame
        # instead of a filtered view (avoids pandas' chained-assignment warning).
        titles_cleaned = titles_cleaned[titles_cleaned["isAdult"] == 0].copy()

        # The literal string "\N" in these columns is replaced with None.
        # NOTE(review): the handling of an explicit value=None in replace() has
        # differed between pandas releases - confirm against the pinned version.
        titles_cleaned["startYear"] = titles_cleaned["startYear"].replace("\\N", None)
        titles_cleaned["runtimeMinutes"] = titles_cleaned["runtimeMinutes"].replace(
            "\\N", None
        )

        # Left join keeps every viewing-history row, matched or not.
        df_merged = imdb_df.merge(
            titles_cleaned[["tconst", "runtimeMinutes", "averageRating"]],
            how="left",
            left_on="imdb_id",
            right_on="tconst",
        )

        df_merged = df_merged.rename(
            columns={
                "runtimeMinutes": "imdb_runtime_minutes",
                "averageRating": "imdb_rating",
            }
        )

        # tconst duplicates imdb_id after the join, so drop it.
        df_merged = df_merged.drop(columns=["tconst"])

        path = os.path.abspath(temp_folder + "/" + output_file)
        print(f"Writing {output_file} to {temp_folder}")
        df_merged.to_csv(path, index=False)

    except Exception as e:
        import traceback

        # print_exc() emits the traceback itself and returns None; wrapping it
        # in print() would add a stray "None" line to the output.
        traceback.print_exc()
        print("Failed to run imdb.py", e)

if __name__ == "__main__":
run()
Loading

0 comments on commit bd6dbed

Please sign in to comment.