diff --git a/README.md b/README.md
index 7db3459..1f221f5 100644
--- a/README.md
+++ b/README.md
@@ -32,11 +32,15 @@ Probabilistic regression example on the Boston housing dataset:
 ```python
 from ngboost import NGBRegressor
 
-from sklearn.datasets import load_boston
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error
 
-X, Y = load_boston(True)
+#Load Boston housing dataset
+data_url = "http://lib.stat.cmu.edu/datasets/boston"
+raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
+X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+Y = raw_df.values[1::2, 2]
+
 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
 
 ngb = NGBRegressor().fit(X_train, Y_train)
diff --git a/examples/regression.py b/examples/regression.py
index ee06ade..971a7b2 100644
--- a/examples/regression.py
+++ b/examples/regression.py
@@ -1,4 +1,5 @@
-from sklearn.datasets import load_boston
+import numpy as np
+import pandas as pd
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import train_test_split
 
@@ -6,8 +7,12 @@
 from ngboost.distns import Normal
 
 if __name__ == "__main__":
+    # Load Boston housing dataset
+    data_url = "http://lib.stat.cmu.edu/datasets/boston"
+    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
+    X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+    Y = raw_df.values[1::2, 2]
 
-    X, Y = load_boston(return_X_y=True)
     X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
 
     ngb = NGBRegressor(Dist=Normal).fit(X_train, Y_train)
diff --git a/examples/survival.py b/examples/survival.py
index c830db2..32be7a5 100644
--- a/examples/survival.py
+++ b/examples/survival.py
@@ -1,5 +1,5 @@
 import numpy as np
-from sklearn.datasets import load_boston
+import pandas as pd
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import train_test_split
 
@@ -7,8 +7,12 @@
 from ngboost.distns import LogNormal
 
 if __name__ == "__main__":
+    # Load Boston housing dataset
+    data_url = "http://lib.stat.cmu.edu/datasets/boston"
+    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
+    X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+    Y = raw_df.values[1::2, 2]
 
-    X, Y = load_boston(return_X_y=True)
     X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
 
     # introduce administrative censoring
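Note that the README hunk drops the `load_boston` import while its replacement code still refers to `pd` and `np`, which this hunk does not import. Below is a minimal, self-contained sketch of the same regression example with the assumed `pandas`/`numpy` imports added; the `data_url` and the two-row record layout come from the diff itself, and the rest mirrors the README example (assumes pandas, numpy, scikit-learn, and ngboost are installed and the CMU URL is reachable):

```python
# Sketch of the load_boston replacement shown in the diff, with the missing imports.
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from ngboost import NGBRegressor

# The raw file stores each record across two physical rows: the first row holds
# 11 features, the second holds the remaining 2 features plus the target (MEDV).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
Y = raw_df.values[1::2, 2]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

ngb = NGBRegressor().fit(X_train, Y_train)
Y_preds = ngb.predict(X_test)
print("Test MSE:", mean_squared_error(Y_test, Y_preds))
```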