Skip to content

Commit

Permalink
Merge pull request #400 from nasaharvest/Add-data-Kenya-Crop-Area-2019
Browse files Browse the repository at this point in the history
Add Kenya Crop Area Data
  • Loading branch information
adebowaledaniel authored Jun 11, 2024
2 parents f1d15ab + 09a9ea3 commit 21869e1
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 6 deletions.
6 changes: 3 additions & 3 deletions data/datasets.dvc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
outs:
- md5: add693a42b38c020b895e8e7c855210d.dir
size: 709637820
nfiles: 55
- md5: a250ce1c98882904e609f7abc88ed404.dir
size: 712601175
nfiles: 56
path: datasets
hash: md5
6 changes: 3 additions & 3 deletions data/raw.dvc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
outs:
- md5: 7b5c138e97f360400e6b3fc080d896e2.dir
size: 445019716
nfiles: 394
- md5: bb6ed42f700db44dba28965972d936cf.dir
size: 445220011
nfiles: 396
path: raw
hash: md5
9 changes: 9 additions & 0 deletions data/report.txt
Original file line number Diff line number Diff line change
Expand Up @@ -457,3 +457,12 @@ UgandaNorthCorLabel2022 (Timesteps: 24)
----------------------------------------------------------------------------
eo_data_complete 2975
✔ training amount: 2975, positive class: 51.6%



KenyaCropArea2019 (Timesteps: 24)
----------------------------------------------------------------------------
eo_data_complete 544
✔ training amount: 232, positive class: 23.7%
✔ validation amount: 149, positive class: 18.1%
✔ testing amount: 163, positive class: 20.2%
33 changes: 33 additions & 0 deletions datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,38 @@ def load_labels(self) -> pd.DataFrame:
return df


class KenyaCropArea2019(LabeledDataset):
    """Labeled dataset built from the two Kenya 2019 crop area reference sample CSVs."""

    def load_labels(self) -> pd.DataFrame:
        """Load, clean, and aggregate the Kenya 2019 crop area reference labels.

        Both "Set-1" and "Set-2" CSV files under the ``Kenya_Crop_Area_2019`` raw
        folder are read and stacked. Rows without an answer to the cropland question
        are dropped, the answer is encoded as 1 for "Crop" and 0 otherwise, and rows
        sharing the same (LON, LAT) pair are collapsed into one.

        Returns:
            A DataFrame with one row per unique (LON, LAT) pair holding the mean
            encoded label (CLASS_PROB), the number of contributing rows
            ("num_labelers"), the join_unique-combined "plotid", "sampleid" and
            "email" values, the START / END dates, and the SUBSET assignment.
        """
        question = "Does this point correspond to active cropland?"
        source_dir = raw_dir / "Kenya_Crop_Area_2019"
        file_names = (
            "ceo-Kenya-Crop-Area-Estimation-Reference-Sample-2019---Set-1"
            "-sample-data-2024-06-10.csv",
            "ceo-Kenya-Crop-Area-Estimation-Reference-Sample-2019---Set-2"
            "-sample-data-2024-06-10.csv",
        )
        labels = pd.concat([pd.read_csv(source_dir / file_name) for file_name in file_names])

        # Keep only the rows that actually carry an answer to the cropland question.
        labels = labels[labels[question].notna()].copy()

        # Encode the answer as an integer: 1 when it is "Crop", 0 for anything else.
        labels[CLASS_PROB] = (labels[question] == "Crop").astype(int)

        # Every raw row counts as one labeler; summing below gives the count per point.
        labels["num_labelers"] = 1

        aggregations = {
            CLASS_PROB: "mean",
            "num_labelers": "sum",
            "plotid": join_unique,
            "sampleid": join_unique,
            "email": join_unique,
        }
        by_location = labels.groupby([LON, LAT], as_index=False, sort=False)
        labels = by_location.agg(aggregations)

        # The same fixed date window (2019-01-01 to 2020-12-31) applies to every point.
        labels[START] = date(2019, 1, 1)
        labels[END] = date(2020, 12, 31)

        # NOTE(review): the two 0.3 arguments are presumably the validation and test
        # fractions -- confirm against train_val_test_split's signature.
        labels[SUBSET] = train_val_test_split(labels.index, 0.3, 0.3)
        return labels


datasets: List[LabeledDataset] = [
CustomLabeledDataset(
dataset="geowiki_landcover_2017",
Expand Down Expand Up @@ -1340,6 +1372,7 @@ def load_labels(self) -> pd.DataFrame:
Uganda_NorthCEO2021(),
UgandaNorthCEO2019(),
UgandaNorthCorLabel2022(),
KenyaCropArea2019(),
]

if __name__ == "__main__":
Expand Down

0 comments on commit 21869e1

Please sign in to comment.