diff --git a/notebooks/benchmarks_sandbox.ipynb b/notebooks/benchmarks_sandbox.ipynb index ab68c2a4..fa3887f5 100644 --- a/notebooks/benchmarks_sandbox.ipynb +++ b/notebooks/benchmarks_sandbox.ipynb @@ -9,7 +9,7 @@ "\n", "**Author**: Ivan Zvonkov\n", "\n", - "**Last Modified**: Jan 17, 2024\n", + "**Last Modified**: Feb 6, 2024\n", "\n", "**Description**: Code for benchmarking against different variations in models." ] @@ -64,7 +64,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/1v/87y9n_d5143c_6cp072v3b1c0000gn/T/ipykernel_33350/4119029012.py:4: DtypeWarning: Columns (17) have mixed types.Specify dtype option on import or set low_memory=False.\n", + "/var/folders/1v/87y9n_d5143c_6cp072v3b1c0000gn/T/ipykernel_25504/4119029012.py:4: DtypeWarning: Columns (17) have mixed types.Specify dtype option on import or set low_memory=False.\n", " df = d.load_df(to_np=True, disable_tqdm=True)\n" ] } @@ -1198,12 +1198,13 @@ }, { "cell_type": "code", - "execution_count": 144, - "id": "ec38c410", + "execution_count": 39, + "id": "0d54af51", "metadata": {}, "outputs": [], "source": [ - "!pip install einops -q" + "%load_ext autoreload\n", + "%autoreload 2" ] }, { @@ -1218,12 +1219,12 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 96, "id": "7b6e4903", "metadata": {}, "outputs": [], "source": [ - "from src.single_file_presto_v2 import Presto, DEVICE\n", + "from src.single_file_presto_v2 import Presto, DEVICE, Aggregate\n", "\n", "import numpy as np\n", "import torch\n", @@ -1232,16 +1233,6 @@ "from torch.utils.data import Dataset, DataLoader" ] }, - { - "cell_type": "code", - "execution_count": 64, - "id": "9ff1198a", - "metadata": {}, - "outputs": [], - "source": [ - "torch.tensor??" - ] - }, { "cell_type": "markdown", "id": "209caaaf", @@ -1252,7 +1243,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 97, "id": "aeec5ba1", "metadata": {}, "outputs": [], @@ -1282,17 +1273,7 @@ }, { "cell_type": "code", - "execution_count": 80, - "id": "d4a70d1f", - "metadata": {}, - "outputs": [], - "source": [ - "dataset = PrestoDataset(val_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, + "execution_count": 98, "id": "eaf12d47", "metadata": {}, "outputs": [], @@ -1302,7 +1283,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 99, "id": "87006785", "metadata": {}, "outputs": [], @@ -1313,7 +1294,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 100, "id": "a4cfaf77", "metadata": {}, "outputs": [], @@ -1324,21 +1305,35 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 101, "id": "cd9e5493", "metadata": {}, "outputs": [], "source": [ - "def generate_encodings(dataset):\n", + "def generate_encodings(dataset, aggregate):\n", " dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=False)\n", " feature_list = []\n", " for (x, latlons, dw, start_month, _) in tqdm(dataloader, desc=\"Encodings\", leave=False):\n", " with torch.no_grad():\n", - " encodings = (pretrained_model(x, dynamic_world=dw, latlons=latlons, month=start_month).cpu().numpy())\n", + " encodings = (pretrained_model(\n", + " x, dynamic_world=dw, latlons=latlons, month=start_month, aggregate=aggregate\n", + " ).cpu().numpy())\n", " feature_list.append(encodings)\n", " return np.concatenate(feature_list)" ] }, + { + "cell_type": "code", + "execution_count": 102, + "id": "a98ac408", + "metadata": {}, + "outputs": [], + "source": [ + "# Use Sklearn scaling of encodings\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler" + ] + }, { "cell_type": "markdown", "id": "3da1c2e7", @@ -1349,14 +1344,14 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 104, "id": "e5b4ae73", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a4a4a0d01e0943faab809de125d203d6", + "model_id": "84cc0dcb39ea4ebb851fa5b918546d3c", "version_major": 2, "version_minor": 0 }, @@ -1399,7 +1394,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Mali_lower_CEO_2019: 0.5882352941176471\n" + "Mali_lower_CEO_2019: 0.6198830409356726\n" ] }, { @@ -1434,7 +1429,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Togo: 0.7563025210084034\n" + "Togo: 0.7317073170731708\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" ] }, { @@ -1465,11 +1474,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Rwanda: 0.6273458445040215\n" + "Rwanda: 0.6847290640394088\n" ] }, { @@ -1500,11 +1523,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Uganda: 0.46464646464646464\n" + "Uganda: 0.5098039215686275\n" ] }, { @@ -1535,11 +1572,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Ethiopia_Tigray_2020: 0.6412213740458016\n" + "Ethiopia_Tigray_2020: 0.671480144404332\n" ] }, { @@ -1574,7 +1625,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Ethiopia_Tigray_2021: 0.6699029126213593\n" + "Ethiopia_Tigray_2021: 0.7222222222222223\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" ] }, { @@ -1609,7 +1674,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Ethiopia_Bure_Jimma_2019: 0.8193548387096774\n" + "Ethiopia_Bure_Jimma_2019: 0.8571428571428571\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" ] }, { @@ -1640,11 +1719,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Ethiopia_Bure_Jimma_2020: 0.8803088803088803\n" + "Ethiopia_Bure_Jimma_2020: 0.8673835125448028\n" ] }, { @@ -1675,11 +1768,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Malawi_CEO_2020: 0.14285714285714288\n" + "Malawi_CEO_2020: 0.4079601990049751\n" ] }, { @@ -1710,11 +1817,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Tanzania_CEO_2019: 0.8032200357781754\n" + "Tanzania_CEO_2019: 0.8313155770782888\n" ] }, { @@ -1749,7 +1870,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sudan_Blue_Nile_CEO_2019: 0.8589341692789969\n" + "Sudan_Blue_Nile_CEO_2019: 0.9201101928374655\n" ] }, { @@ -1784,7 +1905,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "SudanBlueNileCEO2020: 0.7680608365019012\n" + "SudanBlueNileCEO2020: 0.7789473684210527\n" ] }, { @@ -1815,11 +1936,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Senegal_CEO_2022: 0.4444444444444444\n" + "Senegal_CEO_2022: 0.6244343891402715\n" ] }, { @@ -1854,7 +1989,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "SudanAlGadarefCEO2019: 0.5993031358885018\n" + "SudanAlGadarefCEO2019: 0.5892857142857143\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" ] }, { @@ -1885,11 +2034,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "SudanAlGadarefCEO2020: 0.6222222222222222\n" + "SudanAlGadarefCEO2020: 0.7209775967413442\n" ] }, { @@ -1920,11 +2083,25 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "SudanGedarefDarfurAlJazirah2022: 0.763157894736842\n" + "SudanGedarefDarfurAlJazirah2022: 0.7615658362989324\n" ] }, { @@ -1959,7 +2136,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Uganda_NorthCEO2022: 0.28205128205128205\n" + "Uganda_NorthCEO2022: 0.4333333333333333\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/Caskroom/miniconda/base/envs/landcover-mapping/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" ] } ], @@ -1974,16 +2165,18 @@ " test_df = df[is_test] \n", " train_df = df[~is_test & is_local_lat & is_local_lon]\n", " \n", - " train_dataset = PrestoDataset(train_df, start_month=2)\n", - " test_dataset = PrestoDataset(test_df, start_month=2) \n", - " X_train = generate_encodings(train_dataset)\n", - " X_test = generate_encodings(test_dataset)\n", + " train_dataset = PrestoDataset(train_df, start_month=1)\n", + " test_dataset = PrestoDataset(test_df, start_month=1) \n", + " X_train = generate_encodings(train_dataset, Aggregate.BAND_GROUPS_MEAN)\n", + " X_test = generate_encodings(test_dataset, Aggregate.BAND_GROUPS_MEAN)\n", " \n", " y_train = train_df[\"is_crop\"].to_list() \n", " y_test = test_df[\"is_crop\"].to_list()\n", " \n", - " #model = LogisticRegression(class_weight=\"balanced\", max_iter=1000, random_state=DEFAULT_SEED)\n", - " model = RandomForestClassifier(class_weight=\"balanced\", random_state=DEFAULT_SEED)\n", + " model = LogisticRegression(class_weight=\"balanced\", max_iter=1000, random_state=DEFAULT_SEED)\n", + " #pipe = make_pipeline(StandardScaler(), model)\n", + " #pipe.fit(X_train, y_train)\n", + " #model = RandomForestClassifier(class_weight=\"balanced\", random_state=DEFAULT_SEED)\n", " model.fit(X_train, y_train)\n", " \n", " #y_pred = model.predict(X_test)\n", @@ -1996,18 +2189,20 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 105, "id": "83931aea", "metadata": {}, "outputs": [], "source": [ + "benchmark_name = \"Presto LR Feb-Feb F1 Score (no DW, band group encodings, per group LayerNorm)\"\n", "for dataset, f1 in f1_scores.items():\n", - " presto_benchmark.loc[presto_benchmark[\"Name\"] == dataset, \"Presto RF Mar-Mar F1 Score (no DW)\"] = f1" + " presto_benchmark.loc[presto_benchmark[\"Name\"] == dataset, benchmark_name] = f1\n", + " " ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 106, "id": "042873fd", "metadata": {}, "outputs": [ @@ -2036,8 +2231,10 @@ "