Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev/data prep #2

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
138 changes: 138 additions & 0 deletions data/raw/AD_US_131_fields.geojson

Large diffs are not rendered by default.

Binary file added models/pd_regressor_5.1_high.joblib
Binary file not shown.
1 change: 1 addition & 0 deletions models/pd_regressor_5.1_high.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"te_rmse": [9.142660927048142, 9.250546475374382, 9.296886097703833, 8.960664582214694, 8.456411264683167], "te_mae": [7.3115223129115074, 7.244427195801722, 7.424989387474683, 7.137270768398381, 6.66581610689058], "te_mdae": [6.254551594724191, 5.977897959865217, 6.390087231483264, 6.0762831200812, 5.3869137841415125], "te_mape": [0.05695110142113954, 0.05627800941691244, 0.05759305238092461, 0.0561403088796722, 0.05166834636752828], "te_maxerr": [31.303509265616228, 32.9443650439532, 30.911916388048454, 30.2323062122303, 30.072081041596192], "te_r2": [0.3688891354460624, 0.3856377438869244, 0.3259309379114529, 0.24903529612370368, 0.3805291815234315], "te_cscore_ci0.95": [0.9572001783325903, 0.9454456892352655, 0.8622222222222222, 0.9773232547799022, 0.9816831683168317], "te_cscore_ci0.90": [0.9059295586268391, 0.9176814417924988, 0.8105050505050505, 0.9595375722543352, 0.9529702970297029], "te_cscore_ci0.68": [0.7133303611234953, 0.717486604968339, 0.637979797979798, 0.6931969764339706, 0.8178217821782178], "tr_rmse": [9.142660927048142, 9.250546475374382, 9.296886097703833, 8.960664582214694, 8.456411264683167], "tr_mae": [7.3115223129115074, 7.244427195801722, 7.424989387474683, 7.137270768398381, 6.66581610689058], "tr_mdae": [6.254551594724191, 5.977897959865217, 6.390087231483264, 6.0762831200812, 5.3869137841415125], "tr_mape": [0.05695110142113954, 0.05627800941691244, 0.05759305238092461, 0.0561403088796722, 0.05166834636752828], "tr_maxerr": [31.303509265616228, 32.9443650439532, 30.911916388048454, 30.2323062122303, 30.072081041596192], "tr_r2": [0.3688891354460624, 0.3856377438869244, 0.3259309379114529, 0.24903529612370368, 0.3805291815234315], "of_rmse": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mae": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mdae": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mape": [0.0, 0.0, 0.0, 0.0, 0.0], "of_maxerr": [0.0, 0.0, 0.0, 0.0, 0.0], "of_r2": [0.0, 0.0, 0.0, 0.0, 0.0], "tr_cscore_ci0.95": [0.9572001783325903, 0.9454456892352655, 0.8622222222222222, 
0.9773232547799022, 0.9816831683168317], "tr_cscore_ci0.90": [0.9059295586268391, 0.9176814417924988, 0.8105050505050505, 0.9595375722543352, 0.9529702970297029], "tr_cscore_ci0.68": [0.7133303611234953, 0.717486604968339, 0.637979797979798, 0.6931969764339706, 0.8178217821782178], "holdout_yr": [2017, 2018, 2021, 2019, 2020], "exp_name": ["cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp"], "mdl_name": ["huber", "huber", "huber", "huber", "huber"], "epsilon": [5.1, 5.1, 5.1, 5.1, 5.1], "subset": ["high_pd", "high_pd", "high_pd", "high_pd", "high_pd"], "rmdl_name": ["huber_5.100_high_pd", "huber_5.100_high_pd", "huber_5.100_high_pd", "huber_5.100_high_pd", "huber_5.100_high_pd"]}
Binary file added models/pd_regressor_5.1_low.joblib
Binary file not shown.
1 change: 1 addition & 0 deletions models/pd_regressor_5.1_low.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"te_rmse": [9.910874876979987, 10.18716353789977, 10.451021089725732, 9.488082445830312, 10.085712976795644], "te_mae": [7.638095377890685, 7.8983068717723315, 8.103751081504328, 7.383601020753083, 7.809525277195414], "te_mdae": [6.416914322359375, 6.5664989489310415, 6.754511557710941, 6.1171353920659755, 6.360898626794622], "te_mape": [0.08380780754099125, 0.08322617353339982, 0.08679305894685799, 0.07971082010148066, 0.08552216320568143], "te_maxerr": [34.52268213250666, 33.36440018245486, 36.104621168552455, 34.96076872522259, 34.94948159100814], "te_r2": [0.6622722245594024, 0.6249077114938895, 0.6753406684678711, 0.701041053135023, 0.6365169689221437], "te_cscore_ci0.95": [0.9640831758034026, 0.9046849757673667, 0.9388794567062818, 0.9835255354200988, 0.9490740740740741], "te_cscore_ci0.90": [0.8865784499054821, 0.8804523424878837, 0.8930390492359932, 0.9686985172981878, 0.9058641975308642], "te_cscore_ci0.68": [0.7069943289224953, 0.7705977382875606, 0.5942275042444821, 0.7825370675453048, 0.7052469135802469], "tr_rmse": [9.910874876979987, 10.18716353789977, 10.451021089725732, 9.488082445830312, 10.085712976795644], "tr_mae": [7.638095377890685, 7.8983068717723315, 8.103751081504328, 7.383601020753083, 7.809525277195414], "tr_mdae": [6.416914322359375, 6.5664989489310415, 6.754511557710941, 6.1171353920659755, 6.360898626794622], "tr_mape": [0.08380780754099125, 0.08322617353339982, 0.08679305894685799, 0.07971082010148066, 0.08552216320568143], "tr_maxerr": [34.52268213250666, 33.36440018245486, 36.104621168552455, 34.96076872522259, 34.94948159100814], "tr_r2": [0.6622722245594024, 0.6249077114938895, 0.6753406684678711, 0.701041053135023, 0.6365169689221437], "of_rmse": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mae": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mdae": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mape": [0.0, 0.0, 0.0, 0.0, 0.0], "of_maxerr": [0.0, 0.0, 0.0, 0.0, 0.0], "of_r2": [0.0, 0.0, 0.0, 0.0, 0.0], "tr_cscore_ci0.95": [0.9640831758034026, 0.9046849757673667, 
0.9388794567062818, 0.9835255354200988, 0.9490740740740741], "tr_cscore_ci0.90": [0.8865784499054821, 0.8804523424878837, 0.8930390492359932, 0.9686985172981878, 0.9058641975308642], "tr_cscore_ci0.68": [0.7069943289224953, 0.7705977382875606, 0.5942275042444821, 0.7825370675453048, 0.7052469135802469], "holdout_yr": [2018, 2021, 2017, 2020, 2019], "exp_name": ["cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp"], "mdl_name": ["huber", "huber", "huber", "huber", "huber"], "epsilon": [5.1, 5.1, 5.1, 5.1, 5.1], "subset": ["low_pd", "low_pd", "low_pd", "low_pd", "low_pd"], "rmdl_name": ["huber_5.100_low_pd", "huber_5.100_low_pd", "huber_5.100_low_pd", "huber_5.100_low_pd", "huber_5.100_low_pd"]}
Binary file added models/pd_regressor_5.1_mid.joblib
Binary file not shown.
1 change: 1 addition & 0 deletions models/pd_regressor_5.1_mid.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"te_rmse": [9.782602352896815, 10.501739491047088, 10.33476923532141, 9.898015244157804, 9.484061281112433], "te_mae": [7.8931809499783165, 8.590236097611417, 8.425868096826735, 8.035183707382831, 7.598399471180953], "te_mdae": [6.795445247579522, 7.600694822755997, 7.377572871392175, 6.962310470051307, 6.313446771732558], "te_mape": [0.06779548415006528, 0.07389452372084739, 0.07244224412516408, 0.06959073245730658, 0.06663853067695374], "te_maxerr": [34.52814244346146, 35.90526403127805, 34.74703233653861, 34.54729643281124, 36.37979699903018], "te_r2": [0.5470065311517877, 0.5480285447629201, 0.4890528171324635, 0.5397178436365916, 0.36938119468753505], "te_cscore_ci0.95": [0.9743414300376325, 0.9112513924990717, 0.9232472324723248, 0.9652875175315568, 0.9871692060946271], "te_cscore_ci0.90": [0.9633937735203558, 0.8410694392870405, 0.8306273062730627, 0.9474053295932678, 0.9775461106655974], "te_cscore_ci0.68": [0.8347588094423537, 0.5373189751206833, 0.6140221402214022, 0.7300140252454418, 0.8063352044907779], "tr_rmse": [9.782602352896815, 10.501739491047088, 10.33476923532141, 9.898015244157804, 9.484061281112433], "tr_mae": [7.8931809499783165, 8.590236097611417, 8.425868096826735, 8.035183707382831, 7.598399471180953], "tr_mdae": [6.795445247579522, 7.600694822755997, 7.377572871392175, 6.962310470051307, 6.313446771732558], "tr_mape": [0.06779548415006528, 0.07389452372084739, 0.07244224412516408, 0.06959073245730658, 0.06663853067695374], "tr_maxerr": [34.52814244346146, 35.90526403127805, 34.74703233653861, 34.54729643281124, 36.37979699903018], "tr_r2": [0.5470065311517877, 0.5480285447629201, 0.4890528171324635, 0.5397178436365916, 0.36938119468753505], "of_rmse": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mae": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mdae": [0.0, 0.0, 0.0, 0.0, 0.0], "of_mape": [0.0, 0.0, 0.0, 0.0, 0.0], "of_maxerr": [0.0, 0.0, 0.0, 0.0, 0.0], "of_r2": [0.0, 0.0, 0.0, 0.0, 0.0], "tr_cscore_ci0.95": [0.9743414300376325, 0.9112513924990717, 
0.9232472324723248, 0.9652875175315568, 0.9871692060946271], "tr_cscore_ci0.90": [0.9633937735203558, 0.8410694392870405, 0.8306273062730627, 0.9474053295932678, 0.9775461106655974], "tr_cscore_ci0.68": [0.8347588094423537, 0.5373189751206833, 0.6140221402214022, 0.7300140252454418, 0.8063352044907779], "holdout_yr": [2021, 2018, 2017, 2020, 2019], "exp_name": ["cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp", "cv3mdl_w_cp"], "mdl_name": ["huber", "huber", "huber", "huber", "huber"], "epsilon": [5.1, 5.1, 5.1, 5.1, 5.1], "subset": ["mid_pd", "mid_pd", "mid_pd", "mid_pd", "mid_pd"], "rmdl_name": ["huber_5.100_mid_pd", "huber_5.100_mid_pd", "huber_5.100_mid_pd", "huber_5.100_mid_pd", "huber_5.100_mid_pd"]}
Binary file added models/pre_classifier.joblib
Binary file not shown.
Binary file added models/pre_classifier.pkl
Binary file not shown.
95 changes: 95 additions & 0 deletions notebooks/download_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"id": "f9d1204b-82b1-4bd2-8b9e-1c2fd012dd00",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"..\")\n",
"import os\n",
"import scripts.user_config as config\n",
"from src.credentials import CREDENTIALS\n",
"from src.data_downloader import DataDownloader"
]
},
{
"cell_type": "markdown",
"id": "b8408bb1-c8d9-48b1-a8eb-4c69627b8966",
"metadata": {},
"source": [
"### Loading credentials and constants"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "a77e3fdc-1274-418d-840c-999123354f64",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"os.environ['SH_CLIENT_SECRET'] = CREDENTIALS['SH_CLIENT_SECRET']\n",
"os.environ['SH_CLIENT_ID'] = CREDENTIALS['SH_CLIENT_ID']\n",
"os.environ['SH_INSTANCE_ID'] = CREDENTIALS['SH_INSTANCE_ID']\n",
"\n",
"FEATURE_COLLECTION_FILE_PATH = \"../data/raw/AD_US_131_fields.geojson\"\n",
"RANGE_DATES = ['2018-01-01', '2023-12-31']\n",
"PARAMS = {'SAT': 'S2',\n",
" 'CLOUD_FRAC_TILE_MAX': 0.8,\n",
" 'CLOUD_FRAC_FIELD_MAX': 0.05,\n",
" 'CLP_THRESHOLD': 0.1}"
]
},
{
"cell_type": "markdown",
"id": "59577d35-9d8d-4575-b4f3-9de0523118cd",
"metadata": {
"tags": []
},
"source": [
"### Download the data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "42c18362-6eda-4fd0-bebb-9dd939b12035",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"downloader = DataDownloader(FEATURE_COLLECTION_FILE_PATH)\n",
"#downloader.download(range_dates=RANGE_DATES, params=PARAMS,\n",
"# output_path='../data/raw/nematode_fields/')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
305 changes: 305 additions & 0 deletions notebooks/normalize_seasons.ipynb

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions scripts/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# General imports
import sys
import os

# Make the repository root importable regardless of the current working
# directory. This replaces the fragile CWD-relative sys.path.append("../"):
# par_dir is derived from this file's absolute location, so the script works
# when launched from anywhere.
par_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(1, par_dir)

# class specific imports (resolved via the path manipulation above)
from src import credentials, data_downloader
from scripts import user_config

if __name__ == "__main__":
    # Sentinel Hub tooling reads its credentials from environment variables.
    os.environ['SH_CLIENT_SECRET'] = credentials.CREDENTIALS['SH_CLIENT_SECRET']
    os.environ['SH_CLIENT_ID'] = credentials.CREDENTIALS['SH_CLIENT_ID']
    os.environ['SH_INSTANCE_ID'] = credentials.CREDENTIALS['SH_INSTANCE_ID']

    # Download imagery for every field in the configured feature collection.
    filename = user_config.FEATURE_COLLECTION_FILE
    downloader = data_downloader.DataDownloader(filename)
    downloader.download(range_dates=user_config.RANGE_DATES, params=user_config.PARAMS,
                        output_path='../data/raw/nematode_fields/')
    print("Success!")
69 changes: 69 additions & 0 deletions scripts/normalize_seasons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# General imports
import sys
import os

# Make the repository root importable regardless of the working directory.
par_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(1, par_dir)
import numpy as np
import matplotlib.pyplot as plt

# Class specific imports
from src import normalization_builder
from scripts import user_config as user_config, utils

if __name__ == "__main__":
    # ---------------------------------------------------
    # Load NETCDF4 files for the specified FIELD_ID
    # ---------------------------------------------------
    # data maps .nc basenames -> xarray Datasets; a field typically has one
    # file per season, so the substring match below selects all of them.
    data = utils.load_data(user_config.NETCDF4_INPUT_FOLDER)
    # FIELD_ID = "field_148"
    # Hard-coded demo field; change FIELD_ID to inspect a different field.
    FIELD_ID = "field_126"
    selected_field = [x for x in list(data.keys()) if FIELD_ID in x]

    # ---------------------------------------------------
    # Compute the (lon, lat) of the specified FIELD_ID
    # ---------------------------------------------------
    # Field location approximated by the mean of the pixel x/y coordinates of
    # the first matching dataset (assumes x/y are lon/lat — TODO confirm CRS).
    (lon, lat) = (np.mean(list(data[selected_field[0]]['x'].data)), np.mean(list(data[selected_field[0]]['y'].data)))

    # ---------------------------------------------------
    # Compute NDVI rasters and dates for all seasons
    # ---------------------------------------------------
    # Concatenate per-season acquisition dates and NDVI rasters in file order.
    dates, all_rasters = [], []
    for i in range(0, len(selected_field)):
        dates += utils.compute_dates(data[selected_field[i]])
        all_rasters += utils.create_rasters(data[selected_field[i]])

    # -------------------------------------------------------
    # Compute NDVI time_series for all seasons
    # -------------------------------------------------------
    # One scalar per acquisition: median NDVI over the field's pixels
    # (nanmedian ignores NaN pixels, e.g. cloud-masked ones).
    ndvi_time_series = [np.nanmedian(x) for x in all_rasters]

    # -------------------------------------------------------------------------
    # Normalize the seasons: this is main interface with Normalization Class!
    # -------------------------------------------------------------------------
    # shift_dict is expected to expose 'planting_dates' and per-season 'deltas'
    # (day shifts) keyed by season — see NormalizationBuilder.
    print("-> Estimating planting dates:")
    normalizer = normalization_builder.NormalizationBuilder()
    shift_dict = normalizer.normalize_time_series(ndvi_time_series, dates, (lon, lat))
    print(f"\n-> Planting_dates:")
    print(f"{shift_dict['planting_dates']}\n")
    print(f"-> Deltas:")
    print(f"{shift_dict['deltas']}\n")

    # -------------------------------------------------------------------------
    # Using the normalization computed
    # -------------------------------------------------------------------------
    # Left panel: raw NDVI per day-of-year; right panel: same curves shifted by
    # each season's delta so planting dates align.
    # NOTE(review): assumes df holds one NDVI row per day (1..365) per
    # 'cropzone' season — confirm against create_planting_date_dataframe.
    df = normalizer.create_planting_date_dataframe(ndvi_time_series, dates, (lon, lat))
    years = ['2017', '2018', '2019', '2020', '2021', '2022', '2023']
    plt.subplots(1, 2, figsize=(15, 5))
    ax1 = plt.subplot(121)
    for season in years:
        x = [i for i in range(1, 366)]
        y = list(df[df['cropzone'] == season]['NDVI'])
        ax1.plot(x, y)

    ax2 = plt.subplot(122)
    for season in years:
        x = [(i - shift_dict['deltas'][season]) for i in range(1, 366)]
        y = list(df[df['cropzone'] == season]['NDVI'])
        ax2.plot(x, y)
    plt.show()
7 changes: 7 additions & 0 deletions scripts/user_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# GeoJSON feature collection describing the fields to download
# (path is relative to the scripts/ working directory).
FEATURE_COLLECTION_FILE = "../data/raw/AD_US_131_fields.geojson"
# Folder where downloaded NETCDF4 (.nc) field files are written and read back.
NETCDF4_INPUT_FOLDER = "../data/raw/nematode_fields/"
# Start and end dates (ISO format) of the download window.
RANGE_DATES = ['2018-01-01', '2023-12-31']
# Download parameters consumed by DataDownloader.download: satellite name and
# cloud-filtering thresholds (tile/field cloud fractions, CLP mask threshold).
PARAMS = {'SAT': 'S2',
          'CLOUD_FRAC_TILE_MAX': 0.8,
          'CLOUD_FRAC_FIELD_MAX': 0.05,
          'CLP_THRESHOLD': 0.1}
39 changes: 39 additions & 0 deletions scripts/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import datetime
import glob
import os

import numpy as np
import xarray as xr


def load_data(path: str) -> "dict[str, xr.Dataset]":
    """Load every NETCDF4 (*.nc) file found directly under *path*.

    :param path: folder path, expected to end with a separator (e.g. "data/raw/").
    :return: mapping of file basename -> opened xarray Dataset, inserted in
        sorted filename order.
    """
    # sorted() keeps the deterministic ordering the original np.sort provided.
    files = sorted(glob.glob(path + "*.nc"))

    field_data_dict = {}
    for file in files:
        # Key datasets by basename so callers can select fields by name;
        # os.path.basename is portable, unlike splitting on "/".
        key = os.path.basename(file)
        field_data_dict[key] = xr.open_dataset(file)
    return field_data_dict


def compute_dates(field_data: "xr.Dataset") -> list[str]:
    """Extract the acquisition dates of a field dataset.

    :param field_data: xarray Dataset with a ``time`` coordinate.
    :return: one "YYYY-MM-DD" string per time step (the original annotation
        said ``list[datetime]``, but the values are formatted strings).
    """
    date_format = '%Y-%m-%d'
    # .dt.date.item() yields a datetime.date; format it back to ISO text.
    return [field_data['time'][t].time.dt.date.item().strftime(date_format)
            for t in range(len(field_data.time))]


def create_ndvi(ds: "xr.Dataset", time: int) -> np.ndarray:
    """Compute the NDVI raster for one time step.

    NDVI = (NIR - Red) / (NIR + Red), using Sentinel-2 bands B08 (NIR) and
    B04 (red). Division by zero follows numpy semantics (NaN/inf + warning).

    :param ds: dataset exposing 'B08' and 'B04' bands, indexable by time step,
        with a ``.data`` array per step.
    :param time: time-step index into each band.
    :return: NDVI values as a numpy array (``np.ndarray`` — the original
        ``np.array`` annotation named a function, not a type).
    """
    nir = ds['B08'][time].data
    red = ds['B04'][time].data
    return (nir - red) / (nir + red)


def create_rasters(field_data: "xr.Dataset") -> list[np.ndarray]:
    """Compute the NDVI raster for every time step of a field dataset.

    :param field_data: dataset with 'B08'/'B04' bands and a ``time`` coordinate.
    :return: one NDVI array per time step, in time order.
    """
    return [create_ndvi(field_data, t) for t in range(len(field_data.time))]
7 changes: 7 additions & 0 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Locations and tuning parameters of the persisted models (paths are relative
# to the package working directory).
MODELS_PARAMS = {'MODELS_PATH': "../models",  # folder holding the .joblib files below
                 'TEST_1REG_CP_ALPHA': 0.1,  # alpha for the single-regressor conformal setup — TODO confirm semantics
                 'TEST_PRECLF_FILE': "pre_classifier.joblib",  # pre-classifier model
                 'TEST_3REG_FILES_LOW': "pd_regressor_5.1_low.joblib",  # regressor for the low_pd subset
                 'TEST_3REG_FILES_MID': "pd_regressor_5.1_mid.joblib",  # regressor for the mid_pd subset
                 'TEST_3REG_FILES_HIGH': "pd_regressor_5.1_high.joblib"}  # regressor for the high_pd subset
# Number of days per season — presumably a daily interpolation grid over one
# year; verify against the interpolation code.
DAYS_INTERP = 365
82 changes: 82 additions & 0 deletions src/data_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import glob
import geojson
import shapely
from shapely.geometry import shape
from cads.tools import SenHub, tools, params, VegIndices


class DataDownloader:
    """Downloads satellite imagery for each field of a GeoJSON feature collection.

    The actual Sentinel Hub requests are delegated to ``SenHub`` from the
    external ``cads.tools`` package; results are saved as NETCDF4 files whose
    names contain ``field_<index>`` under the chosen output folder.
    """

    def __init__(self, pathname: str) -> None:
        """
        Constructor of the class: it loads the feature collection file indicated in "pathname".

        :param pathname: (str) pathname of the feature collection file.
        """

        with open(pathname) as f:
            self.feature_collection = geojson.load(f)

    def get_feature_collection(self) -> geojson.feature.FeatureCollection:
        """
        Returns the feature collection loaded.

        :return: (geojson.feature.FeatureCollection) feature collection.
        """

        return self.feature_collection

    def get_polygon(self, index: int) -> shapely.geometry.multipolygon.MultiPolygon:
        """
        Returns the geometry of the i-th polygon in the feature collection.

        :param index: (int) index of the i-th polygon in the feature collection.
        :return: (MultiPolygon) shapely geometry.
        """

        # The single feature's coordinates are wrapped into a MultiPolygon so
        # downstream code can treat every field geometry uniformly.
        poly = {'type': 'MultiPolygon',
                'coordinates': [self.feature_collection['features'][index]['geometry']['coordinates']]}
        return shape(poly)

    @staticmethod
    def get_downloaded_fields_list(path: str) -> list[str]:
        """
        Returns a list of indexes of files already downloaded in the folder indicated by "path".

        :param path: (str) path of the folder containing downloaded files.
        :return: (list[str]) de-duplicated field indexes, as strings, parsed
            from the "field_<index>" part of each *.nc filename (the previous
            docstring said list[int], but the values are strings).
        """

        already_downloaded_fields = glob.glob(f"{path}*.nc")
        already_downloaded_fields = [x.split('field_')[1] for x in already_downloaded_fields]
        already_downloaded_fields = [x.split('.')[0] for x in already_downloaded_fields]
        return list(set(already_downloaded_fields))

    def download(self, range_dates: list[str], params: dict, output_path: str, force=False) -> None:
        """
        Download data in the specified range of dates.

        :param range_dates: (list[str]) list containing start and end dates for downloading.
        :param params: (dict) dictionary with parameters used for download (cloud percentage, satellite name, etc).
            NOTE(review): this argument shadows the ``params`` module imported
            from ``cads.tools`` within this method's scope.
        :param output_path: (str) output folder where downloaded files will be saved to. Currently, files are saved
            only in NETCDF4 format.
        :param force: (Bool) flag to force or prevent re-download.
        :return: None
        """

        # Only the year component of each date is used: one request per season.
        season_start = int(range_dates[0].split('-')[0])
        season_end = int(range_dates[1].split('-')[0])
        already_downloaded_fields = self.get_downloaded_fields_list(output_path)
        for feature_index in range(0, len(self.feature_collection['features'])):
            polygon = self.get_polygon(index=feature_index)
            if not force:
                # Skip fields that already have at least one .nc file on disk.
                if str(feature_index) in already_downloaded_fields:
                    print(f"Field {feature_index} already downloaded.")
                    continue
            # NOTE(review): range(season_start, season_end) EXCLUDES the end
            # year (e.g. 2018..2022 for ['2018-01-01', '2023-12-31']) — confirm
            # whether skipping the final season is intentional.
            for season in range(season_start, season_end):
                satimgs = SenHub(sat=params['SAT'], product='', sitename='field_' + str(feature_index),
                                 outdir=output_path, aoi=polygon,
                                 start_date=str(season) + '-01-01', end_date=str(season) + '-12-31',
                                 cloud_frac_tile_max=params['CLOUD_FRAC_TILE_MAX'],
                                 cloud_frac_field_max=params['CLOUD_FRAC_FIELD_MAX'],
                                 clp_threshold=params['CLP_THRESHOLD'])
                satimgs.request()

Loading