Skip to content
This repository has been archived by the owner on Aug 13, 2024. It is now read-only.

Commit

Permalink
downloading from Google Cloud Storage instead of AWS S3 (and assuming…
Browse files Browse the repository at this point in the history
… local credentials for now)
  • Loading branch information
arifwider committed Sep 4, 2018
1 parent bb5219b commit 87dd0b0
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 16 deletions.
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ reflink==0.2.0
requests==2.19.1
rope==0.11.0
rsa==3.4.2
s3fs==0.1.2
schema==0.6.8
scikit-learn==0.19.1
scipy==1.0.0
Expand Down
1 change: 1 addition & 0 deletions run_decisiontree_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

set -e

python3 src/download_data.py
python3 src/splitter.py
python3 src/decision_tree.py
25 changes: 25 additions & 0 deletions src/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
from google.cloud import storage

def load_data():
    """Download the raw sales CSV from Google Cloud Storage if not already cached.

    Uses the ambient Google Cloud credentials picked up by ``storage.Client()``
    (application-default credentials). Creates ``data/raw/`` on first run and
    skips the download when the file is already present locally.

    Raises:
        google.cloud.exceptions.GoogleCloudError: on auth/network failures.
    """
    bucket_name = "continuous-intelligence"
    key = "store47-2016.csv"
    # Build the destination path once so the existence check and the download
    # target can never drift apart.
    dest = os.path.join("data", "raw", key)

    if not os.path.exists('data/raw'):
        os.makedirs('data/raw')

    # BUG FIX: the original guarded on "data/" + key (data/store47-2016.csv)
    # but downloaded to data/raw/store47-2016.csv, so the check never matched
    # and the file was re-downloaded on every run.
    if not os.path.exists(dest):
        client = storage.Client()
        bucket = client.get_bucket(bucket_name)
        blob = bucket.get_blob(key)
        blob.download_to_filename(dest)


def main():
    """Script entry point: download the raw dataset, printing progress to stdout."""
    print("Loading data...")
    # Fetches store47-2016.csv into data/raw/ (defined above in this module).
    load_data()
    print("Finished downloading")


if __name__ == "__main__":
    main()
15 changes: 0 additions & 15 deletions src/splitter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,5 @@
import os
import pandas as pd
import s3fs

def load_data():
s3bucket = "twde-datalab/"
key = "raw/store47-2016.csv"

if not os.path.exists('data/raw'):
os.makedirs('data/raw')

if not os.path.exists("data/" + key):
print("Downloading data...")
s3 = s3fs.S3FileSystem(anon=True)
s3.get(s3bucket + key, "data/" + key)

def get_validation_period(latest_date_train, days_back=15):
# for Kaggle we want from Wednesday to Thursday for a 15 day period
Expand All @@ -37,8 +24,6 @@ def write_data(table, filename):


def main():
# load_data()

print("Loading data...")
train = pd.read_csv("data/raw/store47-2016.csv")

Expand Down

0 comments on commit 87dd0b0

Please sign in to comment.