Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Crop / non-crop labels in China #127

Merged
merged 9 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cropharvest/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# Month and day that are combined with a label's year to build its export end date.
EXPORT_END_MONTH = 2
EXPORT_END_DAY = 1

# Zenodo record id of the dataset release; interpolated into DATASET_URL below.
# NOTE(review): this text looks like a rendered diff, so the first assignment is
# presumably the removed line and the second the added one. If both run, the
# second value is the one that takes effect - confirm only one should remain.
DATASET_VERSION_ID = 7257688
DATASET_VERSION_ID = 10251170
DATASET_URL = f"https://zenodo.org/record/{DATASET_VERSION_ID}"
# presumably the names of the labels file and the features folder of a release - TODO confirm
LABELS_FILENAME = "labels.geojson"
FEATURES_DIR = "features"
Expand Down Expand Up @@ -46,4 +46,4 @@
),
}

# Maps a key (apparently a country name) to a dataset identifier.
# NOTE(review): this text looks like a rendered diff, so the first assignment is
# presumably the removed line and the second the added one. If both run, the
# second dict is the one that takes effect - confirm only one should remain.
TEST_DATASETS = {"Togo": "togo-eval"}
TEST_DATASETS = {"Togo": "togo-eval", "People's Republic of China": "china-crop"}
5 changes: 5 additions & 0 deletions process_labels/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,11 @@
),
"externally_contributed": False,
},
"china-crop": {
"function": loading_funcs.load_china,
"description": "Hand-labelled crop / non crop labels in China",
"externally_contributed": False,
},
}


Expand Down
2 changes: 2 additions & 0 deletions process_labels/loading_funcs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .canada import load_canada
from .germany import load_germany
from .jecam import load_jecam
from .china import load_china


__all__ = [
Expand Down Expand Up @@ -44,4 +45,5 @@
"load_tanzania_ecaas",
"load_tanzania_ceo",
"load_jecam",
"load_china",
]
115 changes: 115 additions & 0 deletions process_labels/loading_funcs/china.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import pandas as pd
import geopandas
from datetime import datetime

from cropharvest.config import EXPORT_END_MONTH, EXPORT_END_DAY
from cropharvest.columns import RequiredColumns

from .utils import LATLON_CRS
from ..utils import DATASET_PATH


def raw_from_ceo():
    """Rebuild the combined crop / non-crop table from the raw CEO exports.

    This is the code used to go from the CEO (unanonymised) labels to the csv
    in `raw_data`. It is kept for posterity: the original data is not uploaded
    to git, so the csv files named below must be present in the current
    working directory for this function to run.

    Every source consists of two csv exports. A point is kept only when it is
    present in both exports (same lat / lon) and both give the same crop
    answer; the collection date of the first export is the one retained.

    Returns:
        A DataFrame with the columns lon, lat, crop, collection_date and year,
        made of the per-source tables concatenated in listing order.

    Raises:
        AssertionError: if a source's year is not part of its file name, or
            if an export contains none of the known crop question columns.
    """
    # the crop / non-crop question is worded differently across exports
    crop_questions = (
        "Does this pixel contain active cropland?",
        "Does this point fall within active cropland?",
        "Does this point lie on active cropland?",
        "Does this point contain active cropland?",
    )

    def clean_df(frame):
        # reduce an export to lon / lat / crop / collection_date
        present = [question for question in crop_questions if question in frame.columns]
        assert present, "no crop / non-crop question column found in export"
        # if several wordings are present, the last one listed is used
        crop_column = present[-1]
        frame = frame[["lon", "lat", crop_column, "collection_time"]]
        return frame.rename(columns={crop_column: "crop", "collection_time": "collection_date"})

    def join_on_agreement(a, b):
        # keep the points found in both exports whose crop answers match
        joined = clean_df(a).merge(
            clean_df(b), on=["lat", "lon"], how="inner", suffixes=("", "_y")
        )
        agreed = joined.crop == joined.crop_y
        return joined.loc[agreed, ["lon", "lat", "crop", "collection_date"]]

    def combine_csvs(format_name: str, year: int, name_2=None):
        assert str(year) in format_name, f"year {year} not found in {format_name}"
        if name_2 is not None:
            # both exports are named explicitly
            first, second = format_name, name_2
        else:
            # the two exports only differ by their set number
            first = format_name.format(set_val=1)
            second = format_name.format(set_val=2)
        df = join_on_agreement(pd.read_csv(first), pd.read_csv(second))
        # assign returns a new frame, so no column is set on a filtered slice
        return df.assign(year=year)

    # (file name or file name template, explicit second file name, year).
    # Each source is listed exactly once: listing a source several times
    # would concatenate its points several times.
    sources = [
        (
            "ceo-Liaoning-2019-April---November-(Set-{set_val})-sample-data-2022-06-27.csv",
            None,
            2019,
        ),
        (
            "ceo-Jilin-2017-(Set-2)-sample-data-2021-04-15.csv",
            "ceo-Jilin-2017-(Set-1)-sample-data-2021-04-19.csv",
            2017,
        ),
        ("ceo-HLJ-2019-(Set-{set_val})---v3-sample-data-2022-01-21.csv", None, 2019),
        ("ceo-Heilongjiang-2016-(Set-{set_val})---v2-sample-data-2022-01-21.csv", None, 2016),
        ("ceo-Heilongjiang-2017-(Set-{set_val})---v2-sample-data-2022-01-21.csv", None, 2017),
        ("ceo-Heilongjiang-2018-(Set-{set_val})-sample-data-2021-10-26.csv", None, 2018),
        (
            "ceo-Jilin-2016-(April-November)-(Set-{set_val})-v2-sample-data-2022-06-27.csv",
            None,
            2016,
        ),
        (
            "ceo-Jilin-2018-(April---November)-(Set-{set_val})-sample-data-2022-06-27.csv",
            None,
            2018,
        ),
        ("ceo-Jilin-2019-April---November-(Set-{set_val})-sample-data-2022-06-27.csv", None, 2019),
        (
            "ceo-Liaoning-2016-April---November-(Set-{set_val})-sample-data-2022-06-27.csv",
            None,
            2016,
        ),
        ("ceo-Liaoning-2017-(Set-{set_val})-sample-data-2021-04-09.csv", None, 2017),
        (
            "ceo-Liaoning-2018-April---November-(Set-{set_val})-sample-data-2022-06-27.csv",
            None,
            2018,
        ),
    ]
    return pd.concat([combine_csvs(filename, year, name_2) for filename, name_2, year in sources])


def load_china() -> geopandas.GeoDataFrame:
    """Load the crop / non-crop points for China as a GeoDataFrame.

    Reads `combined_and_anonymised_points.csv` from the `china` folder under
    DATASET_PATH, builds a point geometry from each row's lon / lat in
    LATLON_CRS, and adds the IS_CROP, INDEX, EXPORT_END_DATE and
    COLLECTION_DATE columns named by RequiredColumns.

    Returns:
        The csv columns plus the geometry and the columns listed above.
        IS_CROP is 1 where the `crop` column equals "Crop" and 0 otherwise;
        EXPORT_END_DATE is EXPORT_END_MONTH / EXPORT_END_DAY of the row's
        `year`.
    """
    df = pd.read_csv(DATASET_PATH / "china" / "combined_and_anonymised_points.csv")
    gdf = geopandas.GeoDataFrame(
        data=df, geometry=geopandas.points_from_xy(df.lon, df.lat), crs=LATLON_CRS
    )
    # Reset the index of the GeoDataFrame itself. Resetting `df` here would
    # rebind `gdf` to a plain DataFrame and lose the geometry and CRS.
    gdf = gdf.reset_index(drop=True)
    # `df` keeps the default index it got from read_csv, which matches the
    # reset index of `gdf`, so the columns computed from `df` align row by row
    gdf[RequiredColumns.IS_CROP] = df.apply(lambda x: 1 if x.crop == "Crop" else 0, axis=1)
    gdf[RequiredColumns.INDEX] = df.index
    gdf[RequiredColumns.EXPORT_END_DATE] = df.apply(
        lambda x: datetime(x.year, EXPORT_END_MONTH, EXPORT_END_DAY), axis=1
    )
    gdf[RequiredColumns.COLLECTION_DATE] = pd.to_datetime(df.collection_date)
    return gdf
Loading