
Commit
Merge pull request #127 from nasaharvest/china
Crop / non-crop labels in China
gabrieltseng authored Dec 4, 2023
2 parents a701fb0 + b552bb5 commit 731a4ba
Showing 6 changed files with 5,751 additions and 4 deletions.
4 changes: 2 additions & 2 deletions cropharvest/config.py
@@ -16,7 +16,7 @@
EXPORT_END_MONTH = 2
EXPORT_END_DAY = 1

-DATASET_VERSION_ID = 7257688
+DATASET_VERSION_ID = 10251170
DATASET_URL = f"https://zenodo.org/record/{DATASET_VERSION_ID}"
LABELS_FILENAME = "labels.geojson"
FEATURES_DIR = "features"
@@ -46,4 +46,4 @@
),
}

TEST_DATASETS = {"Togo": "togo-eval"}
TEST_DATASETS = {"Togo": "togo-eval", "People's Republic of China": "china-crop"}
5 changes: 5 additions & 0 deletions process_labels/datasets.py
@@ -211,6 +211,11 @@
),
"externally_contributed": False,
},
"china-crop": {
"function": loading_funcs.load_china,
"description": "Hand-labelled crop / non crop labels in China",
"externally_contributed": False,
},
}


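A hypothetical sketch of how such a registry entry is consumed; the registry dict's name (DATASETS) and the calling code are assumptions, since only the new entry appears in this hunk:

from process_labels import datasets

def load_labels(dataset_name: str):
    # Look up the registry entry (e.g. "china-crop") and call its loading function,
    # which for the new entry is loading_funcs.load_china.
    entry = datasets.DATASETS[dataset_name]
    return entry["function"]()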
2 changes: 2 additions & 0 deletions process_labels/loading_funcs/__init__.py
@@ -15,6 +15,7 @@
from .canada import load_canada
from .germany import load_germany
from .jecam import load_jecam
from .china import load_china


__all__ = [
@@ -44,4 +45,5 @@
"load_tanzania_ecaas",
"load_tanzania_ceo",
"load_jecam",
"load_china",
]
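
With load_china imported and listed in __all__, the new loader is available from the package namespace. A short usage sketch; the is_crop column name is an assumption taken from RequiredColumns.IS_CROP:

from process_labels.loading_funcs import load_china

gdf = load_china()                 # GeoDataFrame of crop / non-crop points in China
print(gdf.is_crop.value_counts())  # label balance; column name assumed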
115 changes: 115 additions & 0 deletions process_labels/loading_funcs/china.py
@@ -0,0 +1,115 @@
import pandas as pd
import geopandas
from datetime import datetime

from cropharvest.config import EXPORT_END_MONTH, EXPORT_END_DAY
from cropharvest.columns import RequiredColumns

from .utils import LATLON_CRS
from ..utils import DATASET_PATH


def raw_from_ceo():
# this is the code used to go from the CEO (unanonymised) labels
# to the csv in `raw_data`. While we keep this here for posterity,
# the original data is not uploaded to git

def combine_csvs(format_name: str, year: int, name_2=None):
assert str(year) in format_name
if name_2 is not None:
df = join_on_agreement(pd.read_csv(format_name), pd.read_csv(name_2))
else:
df = join_on_agreement(
pd.read_csv(format_name.format(set_val=1)),
pd.read_csv(format_name.format(set_val=2)),
)
df["year"] = year
return df

def join_on_agreement(a, b):
def clean_df(a):
crop_column = None
for crop_column_name in [
"Does this pixel contain active cropland?",
"Does this point fall within active cropland?",
"Does this point lie on active cropland?",
"Does this point contain active cropland?",
]:
if crop_column_name in a.columns:
crop_column = crop_column_name
assert crop_column is not None
a = a[["lon", "lat", crop_column, "collection_time"]]
a = a.rename(columns={crop_column: "crop", "collection_time": "collection_date"})
return a

a = clean_df(a)
b = clean_df(b)
joined = a.merge(b, on=["lat", "lon"], how="inner", suffixes=("", "_y"))
return joined[joined.crop == joined.crop_y][["lon", "lat", "crop", "collection_date"]]

dfs = []
for filename, name_2, year in [
(
"ceo-Jilin-2017-(Set-2)-sample-data-2021-04-15.csv",
"ceo-Jilin-2017-(Set-1)-sample-data-2021-04-19.csv",
2017,
),
("ceo-HLJ-2019-(Set-{set_val})---v3-sample-data-2022-01-21.csv", None, 2019),
("ceo-Heilongjiang-2016-(Set-{set_val})---v2-sample-data-2022-01-21.csv", None, 2016),
("ceo-Heilongjiang-2017-(Set-{set_val})---v2-sample-data-2022-01-21.csv", None, 2017),
("ceo-Heilongjiang-2018-(Set-{set_val})-sample-data-2021-10-26.csv", None, 2018),
(
"ceo-Jilin-2016-(April-November)-(Set-{set_val})-v2-sample-data-2022-06-27.csv",
None,
2016,
),
(
"ceo-Jilin-2018-(April---November)-(Set-{set_val})-sample-data-2022-06-27.csv",
None,
2018,
),
("ceo-Jilin-2019-April---November-(Set-{set_val})-sample-data-2022-06-27.csv", None, 2019),
(
"ceo-Liaoning-2016-April---November-(Set-{set_val})-sample-data-2022-06-27.csv",
None,
2016,
),
("ceo-Liaoning-2017-(Set-{set_val})-sample-data-2021-04-09.csv", None, 2017),
(
"ceo-Liaoning-2018-April---November-(Set-{set_val})-sample-data-2022-06-27.csv",
None,
2018,
),
(
"ceo-Liaoning-2019-April---November-(Set-{set_val})-sample-data-2022-06-27.csv",
None,
2019,
),
]:
dfs.append(combine_csvs(filename, year, name_2))
return pd.concat(dfs)


def load_china() -> geopandas.GeoDataFrame:
df = pd.read_csv(DATASET_PATH / "china" / "combined_and_anonymised_points.csv")
gdf = geopandas.GeoDataFrame(
data=df, geometry=geopandas.points_from_xy(df.lon, df.lat), crs=LATLON_CRS
)
# keep the GeoDataFrame (not the raw df) so the geometry column and CRS are preserved
gdf = gdf.reset_index(drop=True)
gdf[RequiredColumns.IS_CROP] = df.apply(lambda x: 1 if x.crop == "Crop" else 0, axis=1)
gdf[RequiredColumns.INDEX] = df.index
gdf[RequiredColumns.EXPORT_END_DATE] = df.apply(
lambda x: datetime(x.year, EXPORT_END_MONTH, EXPORT_END_DAY), axis=1
)
gdf[RequiredColumns.COLLECTION_DATE] = pd.to_datetime(df.collection_date)
return gdf
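
To make the agreement filter in join_on_agreement above concrete, a toy illustration; the coordinates, labels, and dates are made up:

import pandas as pd

# Two CEO labelling sets cover the same sample points.
set_1 = pd.DataFrame(
    {
        "lon": [120.1, 120.2],
        "lat": [43.1, 43.2],
        "crop": ["Crop", "Non-crop"],
        "collection_date": ["2022-06-01", "2022-06-01"],
    }
)
set_2 = pd.DataFrame(
    {
        "lon": [120.1, 120.2],
        "lat": [43.1, 43.2],
        "crop": ["Crop", "Crop"],
        "collection_date": ["2022-06-02", "2022-06-02"],
    }
)

# Inner-merge on coordinates, then keep only points where both sets agree on the label;
# the (120.2, 43.2) point is dropped because the two labellers disagree.
joined = set_1.merge(set_2, on=["lat", "lon"], how="inner", suffixes=("", "_y"))
agreed = joined[joined.crop == joined.crop_y][["lon", "lat", "crop", "collection_date"]]
print(agreed)  # a single row for the (120.1, 43.1) point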

