Merge pull request #16 from madhavajay/madhava/progress
Lots of refactoring
madhavajay authored Sep 27, 2024
2 parents 157aa23 + af6ae44 commit 2b5b344
Showing 46 changed files with 1,059 additions and 254 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -171,4 +171,7 @@ dist
syftbox.egg-info
keys/**
scheduler.lock
jobs.sqlite
jobs.sqlite
notebooks/crypto
netflix_data/*
backup/*
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -12,6 +12,7 @@ repos:
        always_run: true
      - id: check-added-large-files
        always_run: true
        exclude: '.*Pyfhel-3\.4\.2-cp311-cp311-macosx_13_0_arm64\.whl|.*syftbox-0.1.0-py3-none-any\.whl'
      - id: check-yaml
        always_run: true
      - id: check-merge-conflict
4 changes: 4 additions & 0 deletions app.sh
@@ -0,0 +1,4 @@
#!/bin/sh
cp -r ./apps/$1 ./users/$2/apps/$1
rm -rf ./users/$2/apps/$1/output
rm ./users/$2/apps/$1/cache/last_run.json
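For reference, a rough Python equivalent of what app.sh does; the app and user names are placeholders, and the paths assume the repo root as the working directory:

# Copy an app into a user's apps folder, then clear its previous output
# and last-run cache so the next run starts fresh.
import shutil
from pathlib import Path

def reset_app(app: str, user: str) -> None:
    src = Path("./apps") / app
    dst = Path("./users") / user / "apps" / app
    shutil.copytree(src, dst, dirs_exist_ok=True)               # cp -r ./apps/$1 ./users/$2/apps/$1
    shutil.rmtree(dst / "output", ignore_errors=True)           # rm -rf .../output
    (dst / "cache" / "last_run.json").unlink(missing_ok=True)   # rm .../cache/last_run.json

# e.g. reset_app("netflix", "me")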
File renamed without changes.
File renamed without changes.
6 changes: 6 additions & 0 deletions apps/netflix/data/NetflixViewingHistory_TMDB_IMDB.mock.csv
@@ -0,0 +1,6 @@
netflix_title,netflix_date,tmdb_id,tmdb_title,tmdb_media_type,tmdb_poster_url,homepage,imdb_id,facebook_id,instagram_id,twitter_id,genre_ids,genre_names,imdb_runtime_minutes,imdb_rating
Psych: Season 1: Pilot: Part 1,2024-08-21,1447,Psych,tv,https://image.tmdb.org/t/p/w500/fDI15gTVbtW5Sbv5QenqecRxWKJ.jpg,http://www.usanetwork.com/series/psych,tt0491738,PsychPeacock,PsychPeacock,PsychPeacock,"[35, 18, 9648, 80]","['Comedy', 'Drama', 'Mystery', 'Crime']",44,8.4
Monk: Season 1: Mr. Monk and the Candidate: Part 1,2024-08-12,1695,Monk,tv,https://image.tmdb.org/t/p/w500/3axGMbUecXXOPSeG47v2i9wK5y5.jpg,http://www.usanetwork.com/series/monk,tt0312172,,,,"[35, 80, 18, 9648]","['Comedy', 'Crime', 'Drama', 'Mystery']",44,8.1
3 Body Problem: Season 1: Countdown,2024-03-26,108545,3 Body Problem,tv,https://image.tmdb.org/t/p/w500/ykZ7hlShkdRQaL2aiieXdEMmrLb.jpg,https://www.netflix.com/title/81024821,tt13016388,,3bodyproblem,3body,"[10765, 9648, 18]","['Sci-Fi & Fantasy', 'Mystery', 'Drama']",60,7.5
Fool Me Once: Limited Series: Episode 1,2024-01-29,220801,Fool Me Once,tv,https://image.tmdb.org/t/p/w500/Ertv4WLEyHgi8zN4ldOKgPcGAZ.jpg,https://www.netflix.com/title/81588093,tt5611024,,,,"[18, 80, 9648]","['Drama', 'Crime', 'Mystery']",50,6.8
Exploding Kittens: Pilot,2024-07-19,219532,Exploding Kittens,tv,https://image.tmdb.org/t/p/w500/4WctqRtusYpTLHNkuVjQe4R51DZ.jpg,https://www.netflix.com/title/81459282,tt19734104,,,,"[16, 35]","['Animation', 'Comedy']",25,6.8
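For a quick sanity check of the new mock data, something like the following works (path relative to apps/netflix/, assuming pandas is installed):

# Load the mock viewing-history CSV and inspect the columns that
# dataset.py compares against the real data's schema.
import pandas as pd

mock_df = pd.read_csv("./data/NetflixViewingHistory_TMDB_IMDB.mock.csv")
print(mock_df.columns.tolist())  # netflix_title, netflix_date, tmdb_id, ...
print(mock_df[["netflix_title", "imdb_rating"]].head())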
45 changes: 45 additions & 0 deletions apps/netflix/dataset.py
@@ -0,0 +1,45 @@
import os

import pandas as pd

from syftbox.lib import ClientConfig, SyftVault, TabularDataset


def run():
    try:
        imdb_df = pd.read_csv("./temp/3_imdb.csv")

        dataset_filename = "NetflixViewingHistory_TMDB_IMDB.csv"
        imdb_mock_df = pd.read_csv("./data/NetflixViewingHistory_TMDB_IMDB.mock.csv")

        if set(imdb_df.columns) != set(imdb_mock_df.columns):
            raise Exception("Netflix real vs mock schema are different")

        config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
        client_config = ClientConfig.load(config_path)
        manifest = client_config.manifest

        # create public datasets folder
        datasets_path = manifest.create_public_folder("datasets")

        dataset_path = datasets_path / "netflix_tmdb_imdb"
        csv_file = dataset_path / dataset_filename
        os.makedirs(dataset_path, exist_ok=True)

        # write mock data
        imdb_mock_df.to_csv(csv_file)

        dataset = TabularDataset.from_csv(
            csv_file, name="Netflix_TMDB_IMDB", has_private=True
        )
        dataset.publish(manifest, overwrite=True)

        # write private file
        private_path = os.path.abspath(f"./output/{dataset_filename}")
        imdb_df.to_csv(private_path)
        print(f"> Writing private {dataset_filename} to {private_path}")

        SyftVault.link_private(csv_file, private_path)

    except Exception as e:
        print("Failed to make dataset with dataset.py", e)
4 changes: 4 additions & 0 deletions syftbox/client/apps/netflix/imdb.py → apps/netflix/imdb.py
@@ -77,3 +77,7 @@ def run():

        print(traceback.print_exc())
        print("Failed to run imdb.py", e)


if __name__ == "__main__":
    run()
2 changes: 2 additions & 0 deletions syftbox/client/apps/netflix/main.py → apps/netflix/main.py
@@ -1,6 +1,7 @@
import argparse
import os

from dataset import run as make_dataset
from imdb import run as add_imdb_data
from netflix import run as preprocess_netflix
from page import run as make_page
@@ -72,6 +73,7 @@ def main():
preprocess_netflix()
get_tmdb_data(tmdb_api_key, missing_file)
add_imdb_data()
make_dataset()
make_page()

last_run = {"input_hash": input_hash}
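The input_hash written here is what lets main.py skip work when nothing changed, and what --force and app.sh's cache removal override. A sketch of that last-run gate, with the file name taken from app.sh and the hashing scheme assumed rather than copied from the repo:

# Guess at the caching pattern implied by main.py, app.sh and the --force flag.
# CACHE_FILE and the hash function are assumptions, not the project's exact code.
import hashlib
import json
import os

CACHE_FILE = "./cache/last_run.json"

def input_hash_of(path: str) -> str:
    with open(path, "rb") as f:
        return hashlib.sha256(f.read()).hexdigest()

def should_run(input_path: str, force: bool = False) -> bool:
    input_hash = input_hash_of(input_path)
    if not force and os.path.exists(CACHE_FILE):
        with open(CACHE_FILE) as f:
            if json.load(f).get("input_hash") == input_hash:
                return False  # inputs unchanged since the last successful run
    return True

def save_last_run(input_path: str) -> None:
    os.makedirs(os.path.dirname(CACHE_FILE), exist_ok=True)
    with open(CACHE_FILE, "w") as f:
        json.dump({"input_hash": input_hash_of(input_path)}, f)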
4 changes: 4 additions & 0 deletions syftbox/client/apps/netflix/netflix.py → apps/netflix/netflix.py
@@ -19,3 +19,7 @@ def run():

    except Exception as e:
        print("Failed to run netflix.py", e)


if __name__ == "__main__":
    run()
6 changes: 5 additions & 1 deletion syftbox/client/apps/netflix/page.py → apps/netflix/page.py
@@ -134,4 +134,8 @@ def run():
        import traceback

        print(traceback.print_exc())
        print("Failed to run html.py", e)
        print("Failed to run page.py", e)


if __name__ == "__main__":
    run()
File renamed without changes.
3 changes: 2 additions & 1 deletion syftbox/client/apps/netflix/run.sh → apps/netflix/run.sh
@@ -3,4 +3,5 @@ uv venv .venv
uv pip install -r requirements.txt
TMDB_API_KEY=$(cat inputs/TMDB_API_KEY.txt)

uv run main.py --tmdb-api-key=$TMDB_API_KEY --missing-imdb-file=inputs/missing_imdb_id.json $( [ "$1" = "--force" ] && echo '--force' )
uv run python -c "import syftbox; print(syftbox.__version__)"
uv run main.py --tmdb-api-key=$TMDB_API_KEY --missing-imdb-file=inputs/missing_imdb_id.json "$@"
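main.py's argument parsing is not shown in this diff, but from the flags run.sh forwards it presumably looks something like the following (a guess based only on the visible flags, not the actual code):

# Hypothetical reconstruction of main.py's CLI flags.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--tmdb-api-key", required=True)
parser.add_argument("--missing-imdb-file", default="inputs/missing_imdb_id.json")
parser.add_argument("--force", action="store_true")
args = parser.parse_args()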
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
3 changes: 2 additions & 1 deletion build.sh
@@ -1,3 +1,4 @@
#!/bin/bash
rm -rf dist
uv build
uv build
cp dist/syftbox-0.1.0-py3-none-any.whl ./
2 changes: 2 additions & 0 deletions davo.sh
@@ -0,0 +1,2 @@
#!/bin/bash
uv run syftbox/client/client.py --config_path=./users/davo.json --sync_folder=./users/davo [email protected] --port=8089 --server=http://localhost:5001
1 change: 1 addition & 0 deletions me.sh
@@ -1,2 +1,3 @@
#!/bin/bash
export SYFTBOX_DEV="true"
uv run syftbox/client/client.py --config_path=./users/me.json --sync_folder=./users/me [email protected] --port=8085 --server=http://localhost:5001
51 changes: 37 additions & 14 deletions notebooks/01-trade-create.ipynb
@@ -177,8 +177,8 @@
"metadata": {},
"outputs": [],
"source": [
"csv_file = datasets_path / \"trade_data\" / \"trade_mock.csv\"\n",
"csv_file"
"dataset_path = datasets_path / \"trade_data\"\n",
"dataset_path"
]
},
{
@@ -188,7 +188,9 @@
"metadata": {},
"outputs": [],
"source": [
"mock_ca_data.to_csv(csv_file)"
"import os\n",
"\n",
"os.makedirs(dataset_path, exist_ok=True)"
]
},
{
@@ -197,6 +199,27 @@
"id": "19",
"metadata": {},
"outputs": [],
"source": [
"csv_file = dataset_path / \"trade_mock.csv\"\n",
"csv_file"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20",
"metadata": {},
"outputs": [],
"source": [
"mock_ca_data.to_csv(csv_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21",
"metadata": {},
"outputs": [],
"source": [
"dataset = TabularDataset.from_csv(csv_file, name=\"Trade Data\", has_private=True)\n",
"dataset"
@@ -205,7 +228,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "20",
"id": "22",
"metadata": {},
"outputs": [],
"source": [
@@ -215,7 +238,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "21",
"id": "23",
"metadata": {},
"outputs": [],
"source": [
@@ -225,7 +248,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "22",
"id": "24",
"metadata": {},
"outputs": [],
"source": [
@@ -235,14 +258,14 @@
{
"cell_type": "code",
"execution_count": null,
"id": "23",
"id": "25",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "24",
"id": "26",
"metadata": {},
"source": [
"# Link Private Data"
@@ -251,7 +274,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "25",
"id": "27",
"metadata": {},
"outputs": [],
"source": [
@@ -261,7 +284,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "26",
"id": "28",
"metadata": {},
"outputs": [],
"source": [
@@ -272,7 +295,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "27",
"id": "29",
"metadata": {},
"outputs": [],
"source": [
@@ -282,7 +305,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "28",
"id": "30",
"metadata": {},
"outputs": [],
"source": [
@@ -292,7 +315,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "29",
"id": "31",
"metadata": {},
"outputs": [],
"source": []
@@ -314,7 +337,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
"version": "3.11.6"
}
},
"nbformat": 4,
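Taken together, the reordered notebook cells follow the same pattern as apps/netflix/dataset.py above. A condensed, hedged sketch of the flow; the mock dataframe's columns and the private-file path below are placeholders, not the notebook's actual values:

# Requires SYFTBOX_CLIENT_CONFIG_PATH to point at a valid client config.
import os

import pandas as pd
from syftbox.lib import ClientConfig, SyftVault, TabularDataset

client_config = ClientConfig.load(os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH"))
manifest = client_config.manifest

# create the public datasets folder and a trade_data subfolder
datasets_path = manifest.create_public_folder("datasets")
dataset_path = datasets_path / "trade_data"
os.makedirs(dataset_path, exist_ok=True)

# stand-in for the mock trade dataframe built earlier in the notebook
mock_ca_data = pd.DataFrame({"country": ["CA"], "trade_value": [123.4]})

csv_file = dataset_path / "trade_mock.csv"
mock_ca_data.to_csv(csv_file)

# publish the mock dataset, then link the real file privately
dataset = TabularDataset.from_csv(csv_file, name="Trade Data", has_private=True)
dataset.publish(manifest, overwrite=True)

private_path = os.path.abspath("./private/trade_real.csv")  # placeholder path
SyftVault.link_private(csv_file, private_path)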
