Skip to content
This repository has been archived by the owner on Aug 13, 2024. It is now read-only.

Commit

Permalink
downloading from Google Cloud Storage instead of AWS S3 (and assuming…
Browse files Browse the repository at this point in the history
… local credentials for now)
  • Loading branch information
arifwider committed Sep 4, 2018
1 parent bb5219b commit 87dd0b0
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 16 deletions.
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ reflink==0.2.0
requests==2.19.1
rope==0.11.0
rsa==3.4.2
s3fs==0.1.2
schema==0.6.8
scikit-learn==0.19.1
scipy==1.0.0
Expand Down
1 change: 1 addition & 0 deletions run_decisiontree_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

set -e

python3 src/download_data.py
python3 src/splitter.py
python3 src/decision_tree.py
25 changes: 25 additions & 0 deletions src/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
from google.cloud import storage

def load_data():
    """Download the raw sales CSV from Google Cloud Storage if not already cached.

    Uses the ambient Google Cloud credentials picked up by ``storage.Client()``
    (application-default credentials). Creates ``data/raw/`` on first run and
    skips the download when the file is already present locally.

    Raises:
        google.cloud.exceptions.GoogleCloudError: on auth/network failures.
    """
    bucket_name = "continuous-intelligence"
    key = "store47-2016.csv"
    # Build the destination path once so the existence check and the download
    # target can never drift apart.
    dest = os.path.join("data", "raw", key)

    if not os.path.exists('data/raw'):
        os.makedirs('data/raw')

    # BUG FIX: the original guarded on "data/" + key (data/store47-2016.csv)
    # but downloaded to data/raw/store47-2016.csv, so the check never matched
    # and the file was re-downloaded on every run.
    if not os.path.exists(dest):
        client = storage.Client()
        bucket = client.get_bucket(bucket_name)
        blob = bucket.get_blob(key)
        blob.download_to_filename(dest)


def main():
    """Script entry point: download the raw dataset, printing progress to stdout."""
    print("Loading data...")
    # Fetches store47-2016.csv into data/raw/ (defined above in this module).
    load_data()
    print("Finished downloading")


if __name__ == "__main__":
    main()
15 changes: 0 additions & 15 deletions src/splitter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,5 @@
import os
import pandas as pd
import s3fs

def load_data():
s3bucket = "twde-datalab/"
key = "raw/store47-2016.csv"

if not os.path.exists('data/raw'):
os.makedirs('data/raw')

if not os.path.exists("data/" + key):
print("Downloading data...")
s3 = s3fs.S3FileSystem(anon=True)
s3.get(s3bucket + key, "data/" + key)

def get_validation_period(latest_date_train, days_back=15):
# for Kaggle we want from Wednesday to Thursday for a 15 day period
Expand All @@ -37,8 +24,6 @@ def write_data(table, filename):


def main():
# load_data()

print("Loading data...")
train = pd.read_csv("data/raw/store47-2016.csv")

Expand Down

0 comments on commit 87dd0b0

Please sign in to comment.