Skip to content

Commit

Permalink
Fix: Change dataset files name and location
Browse files Browse the repository at this point in the history
  • Loading branch information
luizmachado committed Aug 19, 2024
1 parent 1f40e1f commit 3aa32cc
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 13 deletions.
4 changes: 2 additions & 2 deletions cd4ml/filenames.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def _get_model_file_templates(model_results_dir):
def _get_problem_file_templates(raw_problem_data_dir):
file_names_problem = {
'groceries': {
'raw_grocery_data': '%s/groceries.csv' % raw_problem_data_dir,
'grocery_data_shuffled': '%s/groceries_shuffled.csv' % raw_problem_data_dir
'raw_grocery_data': '%s/store47-2016.csv' % raw_problem_data_dir,
'grocery_data_shuffled': '%s/store47-2016_shuffled.csv' % raw_problem_data_dir
},
'houses': {
'raw_house_data': '%s/house_sales.csv' % raw_problem_data_dir,
Expand Down
20 changes: 15 additions & 5 deletions cd4ml/problems/groceries/download_data/download_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from cd4ml.filenames import get_problem_files
from cd4ml.utils.utils import download_to_file_from_url, shuffle_csv_file
import zipfile
from pathlib import Path
import logging

download_params = {'key': 'store47-2016.csv',
'gcs_bucket': 'continuous-intelligence',
'base_url': 'https://storage.googleapis.com'}
logger = logging.getLogger(__name__)

download_params = {'key': 'store47-2016',
'gcs_bucket': 'raw/master',
'base_url': 'https://github.com/luizmachado/CDMLDataset'}


def get_grocery_url_and_files(problem_name):
Expand All @@ -13,12 +18,17 @@ def get_grocery_url_and_files(problem_name):
base_url = download_params['base_url']

filename = file_names['raw_grocery_data']
url = "%s/%s/%s" % (base_url, gcs_bucket, key)
url = "%s/%s/%s.zip" % (base_url, gcs_bucket, key)
filename_shuffled = file_names['grocery_data_shuffled']
return url, filename, filename_shuffled


def download(problem_name, use_cache=True):
    """Download the zipped grocery data set, unpack it, and write a shuffled copy.

    The URL and the local file names come from ``get_grocery_url_and_files``.
    The archive is saved next to the raw CSV path as ``<raw csv path>.zip`` and
    extracted into the raw CSV's parent directory.

    :param problem_name: name of the problem whose file locations are looked up
    :param use_cache: forwarded unchanged to ``download_to_file_from_url``
    """
    url, filename, filename_shuffled = get_grocery_url_and_files(problem_name)

    # The remote artifact is a zip archive, so it is stored under its own name
    # rather than under the CSV path.
    zipname = f"{filename}.zip"
    download_to_file_from_url(url, zipname, use_cache=use_cache)

    target_dir = Path(filename).parent
    logger.info("Unzipping: %s @ %s", zipname, target_dir)
    with zipfile.ZipFile(zipname, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

    # NOTE(review): assumes the archive contains a member whose name matches
    # the basename of `filename`; otherwise the shuffle step will not find
    # its input -- confirm against the published archive.
    shuffle_csv_file(filename, filename_shuffled)
6 changes: 4 additions & 2 deletions cd4ml/problems/houses/download_data/download_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from cd4ml.filenames import get_problem_files
from cd4ml.utils.utils import download_to_file_from_url

baseUri = "https://github.com/luizmachado/CD4ML/tree/master/dataset/"

download_params = {
'url': "https://github.com/dave31415/house_price/raw/master/data/house_data_100000.csv",
'url_lookup': "https://github.com/dave31415/house_price/raw/master/data/zip_lookup.csv"
'url': "https://github.com/luizmachado/CD4ML/raw/master/dataset/house_data_100000.csv",
'url_lookup': "https://github.com/luizmachado/CD4ML/raw/master/dataset/zip_lookup.csv"
}


Expand Down
3 changes: 1 addition & 2 deletions cd4ml/problems/iris/download_data/download_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from cd4ml.utils.utils import download_to_file_from_url

download_params = {
'url': "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/"
"raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv"
'url': "https://github.com/luizmachado/CD4ML/raw/master/dataset/iris.csv"
}


Expand Down
5 changes: 3 additions & 2 deletions postgres/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
FROM postgres:12.2
# Fetch the seed data archive and unpack it into the init directory.
# The download must use GitHub's /raw/ endpoint: the /blob/ URL returns the
# HTML file viewer page instead of the zip, which makes unzip fail.
RUN apt-get update && apt-get install -y wget unzip && \
    mkdir -p /docker-entrypoint-initdb.d && \
    wget https://github.com/luizmachado/CD4ML/raw/master/dataset/store47-2016.zip -O /docker-entrypoint-initdb.d/store47-2016.zip && \
    unzip /docker-entrypoint-initdb.d/store47-2016.zip -d /docker-entrypoint-initdb.d/

COPY initialize.sql /docker-entrypoint-initdb.d/initialize.sql

0 comments on commit 3aa32cc

Please sign in to comment.