Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enforce consistent altitude, latitude and longitude for site EMOS #1951

Merged
merged 1 commit into from
Oct 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions improver/calibration/dataframe_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,41 @@ def _fill_missing_entries(df, combi_cols, static_cols, site_id_col):
return df


def _ensure_consistent_static_cols(
forecast_df: DataFrame, static_cols: List[str], site_id_col: str
) -> DataFrame:
"""Ensure that the columns expected to have the same value for a given site,
actually have the same values. These "static" columns could change if,
for example, the altitude of a site is corrected.

Args:
forecast_df: Forecast DataFrame.
static_cols: List of columns that are expected to be "static".
site_id_col: The name of the column containing the site ID.

Returns:
Forecast DataFrame with the same value for a given site for the static columns
provided.
"""
# Check if any of the assumed static columns are actually not static when
# the DataFrame is grouped by the site_id_col.
if (forecast_df.groupby(site_id_col)[static_cols].nunique().nunique() > 1).any():

for static_col in static_cols:
# For each static column, find the last value from the list of unique
# values for each site. The last value corresponds to the most recent value
# present when using pd.unique.
temp_df = forecast_df.groupby(site_id_col)[static_col].apply(
lambda x: pd.unique(x)[-1]
)
# Drop the static column and then merge. The merge will recreate the static
# column using a constant value for each site.
forecast_df = forecast_df.drop(columns=static_col)
forecast_df = forecast_df.merge(temp_df, on=site_id_col)

return forecast_df


def _define_time_coord(
adate: pd.Timestamp, time_bounds: Optional[Sequence[pd.Timestamp]] = None,
) -> DimCoord:
Expand Down Expand Up @@ -513,10 +548,15 @@ def _prepare_dataframes(
# Add station_id as a static column, if it is only present in the
# forecast DataFrame.
static_cols.append("station_id")

forecast_df = _fill_missing_entries(
forecast_df, combi_cols, static_cols, site_id_col
)

forecast_df = _ensure_consistent_static_cols(
forecast_df, ["altitude", "latitude", "longitude"], site_id_col
)

combi_cols = [site_id_col, "time"]
static_cols = ["latitude", "longitude", "altitude", "diagnostic"]
if include_station_id:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,61 @@ def test_no_forecasts_for_a_time(self):
self.assertCubeEqual(result[0], self.expected_period_forecast[:, 1:])
self.assertCubeEqual(result[1], self.expected_period_truth[1:])

def test_moving_forecast_site(self):
    """Test a forecast site whose altitude, latitude or longitude differs at
    one time within the forecasts, potentially due to a site update. The
    values from the most recent time point are expected to be assigned to
    all times in both the forecast and truth cubes."""
    moved_values = {"altitude": 5, "latitude": 40, "longitude": -15}

    # Overwrite the position of site 03002 at time3 only. This edits the
    # DataFrame held on the test instance directly, not a copy.
    forecast_df = self.forecast_df
    moved_rows = (forecast_df["wmo_id"] == "03002") & (
        forecast_df["time"] == self.time3
    )
    for column, value in moved_values.items():
        forecast_df.loc[moved_rows, column] = value

    # Both expected cubes carry the updated position at the first site index.
    expected_cubes = []
    for reference_cube in (
        self.expected_period_forecast,
        self.expected_period_truth,
    ):
        expected_cube = reference_cube.copy()
        for coord_name, value in moved_values.items():
            expected_cube.coord(coord_name).points[0] = value
        expected_cubes.append(expected_cube)
    expected_forecast, expected_truth = expected_cubes

    result = forecast_and_truth_dataframes_to_cubes(
        forecast_df,
        self.truth_subset_df,
        self.cycletime,
        self.forecast_period,
        self.training_length,
    )

    self.assertEqual(len(result), 2)
    forecast_cube, truth_cube = result[0], result[1]
    self.assertCubeEqual(forecast_cube, expected_forecast)
    self.assertCubeEqual(truth_cube, expected_truth)

def test_moving_truth_site(self):
    """Test a truth site whose altitude, latitude or longitude differs at one
    time within the truths. The outputs are expected to be unaffected, as only
    the altitude, latitude and longitude from the forecasts are preserved."""
    # Overwrite the position of site 03002 at time3 in the truths only. This
    # edits the DataFrame held on the test instance directly, not a copy.
    truth_df = self.truth_subset_df
    moved_rows = (truth_df["wmo_id"] == "03002") & (truth_df["time"] == self.time3)
    for column, value in (("altitude", 5), ("latitude", 40), ("longitude", -15)):
        truth_df.loc[moved_rows, column] = value

    result = forecast_and_truth_dataframes_to_cubes(
        self.forecast_df,
        truth_df,
        self.cycletime,
        self.forecast_period,
        self.training_length,
    )

    self.assertEqual(len(result), 2)
    forecast_cube, truth_cube = result[0], result[1]
    self.assertCubeEqual(forecast_cube, self.expected_period_forecast)
    self.assertCubeEqual(truth_cube, self.expected_period_truth)

def test_new_site_with_only_one_forecast_and_truth(self):
"""Test for a site that has a forecast and truth data point for the most
recent time only. Other sites are present at all forecast and truth times.
Expand Down