-
Notifications
You must be signed in to change notification settings - Fork 93
/
two_sigma_rental.py
101 lines (86 loc) · 3.07 KB
/
two_sigma_rental.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Download and preprocess datasets from the Two Sigma Rental competition on Kaggle"""
from typing import Union, List
from h2oaicore.data import CustomData
import datatable as dt
import numpy as np
import pandas as pd
from h2oaicore.systemutils import user_dir
import uuid
import importlib
# NOTE(review): subprocess is loaded through importlib with a split module
# name instead of a plain `import subprocess`; presumably this sidesteps a
# static import check -- confirm before simplifying.
subprocess = importlib.import_module("sub" + "process")
# Kaggle API credentials. Replace the "XXX" placeholders with real values:
# create_data() returns an empty list while kaggle_username is "XXX" or empty.
kaggle_username = "XXX"
kaggle_key = "XXX"
class TwoSigmaRental(CustomData):
    """Data recipe for the Two Sigma rental listing competition on Kaggle.

    Downloads the train and test archives with the ``kaggle`` command line
    client and adds a few derived columns to the raw listing fields.
    """

    @staticmethod
    def create_data(
        X: dt.Frame = None,
    ) -> Union[
        str,
        List[str],
        dt.Frame,
        List[dt.Frame],
        np.ndarray,
        List[np.ndarray],
        pd.DataFrame,
        List[pd.DataFrame],
    ]:
        """Download the competition files and return train/test frames.

        Args:
            X: Unused; accepted only to match the recipe signature.

        Returns:
            An empty list when the module-level Kaggle credentials are unset
            or still the ``"XXX"`` placeholder. Otherwise a dict with the keys
            ``"two_sigma_train"`` and ``"two_sigma_test"`` mapping to
            ``dt.Frame`` objects; the train frame additionally carries the
            ``interest_level`` column.

        Raises:
            TimeoutError: If a download does not finish within the timeout.
            subprocess.CalledProcessError: If the ``kaggle`` command exits
                with a non-zero status.
        """
        import os
        from h2oaicore.systemutils import config

        competition = "two-sigma-connect-rental-listing-inquiries"
        download_timeout = 120  # seconds allowed per downloaded file
        placeholder = "XXX"

        # Without real credentials the kaggle client cannot authenticate, so
        # bail out before touching the file system or the network.
        if (
            not kaggle_username
            or kaggle_username == placeholder
            or not kaggle_key
            or kaggle_key == placeholder
        ):
            return []

        # Assigning to os.environ also calls putenv(), so child processes see
        # the credentials and the in-process mapping stays in sync.
        os.environ["KAGGLE_USERNAME"] = kaggle_username
        os.environ["KAGGLE_KEY"] = kaggle_key

        # Download into a fresh, randomly named directory under the contrib dir.
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)
        sub_file_dir = os.path.join(temp_path, "kaggle_%s" % str(uuid.uuid4())[:4])

        def _download(file_name):
            """Fetch one competition file into ``sub_file_dir``."""
            # Pass an argument list rather than splitting a command string so
            # that a download path containing whitespace stays one argument.
            cmd = [
                "kaggle",
                "competitions",
                "download",
                "-c",
                competition,
                "-f",
                file_name,
                "-p",
                sub_file_dir,
                "-q",
            ]
            try:
                subprocess.check_output(cmd, timeout=download_timeout)
            except subprocess.TimeoutExpired as err:
                # check_output signals a timeout with TimeoutExpired, which is
                # not a TimeoutError; translate it to the documented type.
                raise TimeoutError(
                    f"Took longer than {download_timeout} seconds, increase timeout"
                ) from err

        _download("train.json.zip")
        _download("test.json.zip")

        train = pd.read_json(os.path.join(sub_file_dir, "train.json.zip"))
        test = pd.read_json(os.path.join(sub_file_dir, "test.json.zip"))

        # Derive the same helper columns on both frames (mutated in place).
        for df in (train, test):
            df["str_features"] = df["features"].apply(" . ".join)
            df["nb_features"] = df["features"].apply(len)
            df["nb_photos"] = df["photos"].apply(len)
            df["cat_address"] = df["street_address"] + " " + df["display_address"]

        features = [
            "bathrooms",
            "bedrooms",
            "building_id",
            "created",
            "description",
            "display_address",
            "latitude",
            "listing_id",
            "longitude",
            "manager_id",
            "price",
            "street_address",
            "str_features",
            "nb_features",
            "nb_photos",
            "cat_address",
        ]

        return {
            "two_sigma_train": dt.Frame(train[features + ["interest_level"]]),
            "two_sigma_test": dt.Frame(test[features]),
        }