nasaharvest · adebowaledaniel · Jun 11, 2024 · Jun 10, 2024 · Jun 10, 2024 · Jun 10, 2024
diff --git a/data/datasets.dvc b/data/datasets.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: d70a2e3e7b64b45737ab212c869b76e1.dir
-  size: 693418938
-  nfiles: 54
+- md5: 236b7e654fe6dae45a4981aaafb06544.dir
+  size: 696382293
+  nfiles: 55
   path: datasets
   hash: md5
diff --git a/data/raw.dvc b/data/raw.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: 03ecf7c484499852ddf368170fa7201e.dir
-  size: 444911361
-  nfiles: 393
+- md5: 449a9d3f354cd9161b60943f7734f6c1.dir
+  size: 445111656
+  nfiles: 395
   path: raw
   hash: md5
diff --git a/data/report.txt b/data/report.txt
@@ -450,3 +450,12 @@ eo_data_complete    1000
 ✔ training amount: 387, positive class: 21.4%
 ✔ validation amount: 294, positive class: 15.3%
 ✔ testing amount: 319, positive class: 14.1%
+
+
+
+KenyaCropArea2019 (Timesteps: 24)
+----------------------------------------------------------------------------
+eo_data_complete    544
+✔ training amount: 232, positive class: 23.7%
+✔ validation amount: 149, positive class: 18.1%
+✔ testing amount: 163, positive class: 20.2%
diff --git a/datasets.py b/datasets.py
@@ -540,6 +540,38 @@ def load_labels(self) -> pd.DataFrame:
         return df
 
 
+class KenyaCropArea2019(LabeledDataset):
+    def load_labels(self) -> pd.DataFrame:
+        """Load both sample-set CSVs and aggregate labels per unique (LON, LAT) pair."""
+        question = "Does this point correspond to active cropland?"
+        folder = raw_dir / "Kenya_Crop_Area_2019"
+        csv_names = [
+            f"ceo-Kenya-Crop-Area-Estimation-Reference-Sample-2019---Set-{set_id}"
+            "-sample-data-2024-06-10.csv"
+            for set_id in (1, 2)
+        ]
+        df = pd.concat([pd.read_csv(folder / name) for name in csv_names])
+
+        # Points left unanswered carry no label; drop them, then encode "Crop" as 1.
+        df = df[df[question].notna()].copy()
+        df[CLASS_PROB] = (df[question] == "Crop").astype(int)
+        df["num_labelers"] = 1
+
+        # Repeated coordinates collapse to a mean label, a labeler count and join_unique ids.
+        aggregations = {
+            CLASS_PROB: "mean",
+            "num_labelers": "sum",
+            "plotid": join_unique,
+            "sampleid": join_unique,
+            "email": join_unique,
+        }
+        df = df.groupby([LON, LAT], as_index=False, sort=False).agg(aggregations)
+        df[START] = date(2019, 1, 1)
+        df[END] = date(2020, 12, 31)
+        df[SUBSET] = train_val_test_split(df.index, 0.3, 0.3)
+        return df
+
+
 datasets: List[LabeledDataset] = [
     CustomLabeledDataset(
         dataset="geowiki_landcover_2017",
@@ -1327,6 +1359,7 @@ def load_labels(self) -> pd.DataFrame:
     Uganda_NorthCEO2022(),
     Uganda_NorthCEO2021(),
     UgandaNorthCEO2019(),
+    KenyaCropArea2019(),
 ]
 
 if __name__ == "__main__":