Skip to content

Commit

Permalink
Prognostic run: restore timings and intermediate restarts (#1994)
Browse files Browse the repository at this point in the history
We currently do not have the ability to log wrapper timing or to write out intermediate restarts in the prognostic run, though we have had both in the past. This PR restores those capabilities and cleans up the timing logging by emitting it as JSON.

Significant internal changes:
- Python wrapper timings logged to json and screen
- Intermediate restarts now written out if specified in namelist
  • Loading branch information
brianhenn authored Aug 11, 2022
1 parent 713ed38 commit 8b937e1
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 32 deletions.
25 changes: 8 additions & 17 deletions workflows/prognostic_c48_run/docs/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,26 @@
Developer's Guide
-----------------

The prognostic run is developed via docker and docker-compose. This
environment is based off the `prognostic_run` docker image, but has
bind-mounts to the packages in "/external" of this repository and this
directory, which allows locally developing this workflow and its
dependencies.
The prognostic run is developed via docker. This environment is based off the
`prognostic_run` docker image, but has bind-mounts to the packages in "/external"
of this repository and this directory, which allows locally developing this workflow
and its dependencies.

It is usually fastest to use the latest docker image from Google Container
Repository. Pull the image::

docker pull us.gcr.io/vcm-ml/prognostic_run:latest
make pull_image_prognostic_run

.. note::

If you run into problems, it would be best to rebuild the docker image from scratch::

docker-compose build fv3
make build_image_prognostic_run

Enter a bash shell in the image::

docker-compose run fv3net bash
make enter_prognostic_run

.. note::
This docker-compose will propagate key-based authentication to Google
Cloud Platform into the docker image. It expects that environmental variable
``GOOGLE_APPLICATION_CREDENTIALS`` points to a json key. See Google's
`documentation <https://cloud.google.com/iam/docs/creating-managing-service-account-keys>`_
on how to generate one.
Run the tests::
Then run the tests::

pytest
38 changes: 23 additions & 15 deletions workflows/prognostic_c48_run/runtime/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,10 +338,6 @@ def _open_model(self, ml_config: MachineLearningConfig, step: str):
def time(self) -> cftime.DatetimeJulian:
    """Current model time, read from the wrapped model state.

    NOTE(review): a ``@property`` decorator likely precedes this in the full
    file (it is outside the visible diff hunk) — confirm before calling it
    as a method.
    """
    return self._state.time

def cleanup(self):
    """Report accumulated timings, then finalize the wrapped FV3GFS model.

    Timing output happens first because the wrapper's ``cleanup`` tears the
    model (and its MPI context) down.
    """
    self._print_global_timings()
    self._fv3gfs.cleanup()

def _step_dynamics(self) -> Diagnostics:
self._log_debug(f"Dynamics Step")
self._fv3gfs.step_dynamics()
Expand Down Expand Up @@ -378,28 +374,34 @@ def _apply_physics(self) -> Diagnostics:
"total_precip_after_physics": self._state[TOTAL_PRECIP],
}

def _print_timing(self, name, min_val, max_val, mean_val):
    """Print one timing-table row: a left-aligned name followed by the
    min/max/mean values in fixed-width ``15.4f`` columns."""
    row = "{:<30}{:15.4f}{:15.4f}{:15.4f}".format(name, min_val, max_val, mean_val)
    self._print(row)

def _print_global_timings(self, root=0):
is_root = self.rank == root
recvbuf = np.array(0.0)
reduced = {}
def _print_timings(self, reduced):
    """Render reduced timing statistics as an aligned text table.

    Args:
        reduced: mapping of step name to a dict with ``"min"``, ``"max"``
            and ``"mean"`` timing values (seconds, per the table header).
    """
    separator = "-----------------------------------------------------------------"
    self._print(separator)
    self._print(" Reporting clock statistics from python ")
    self._print(separator)
    header = "{:<30}{:>15}{:>15}{:>15}".format(" ", "min (s)", "max (s)", "mean (s)")
    self._print(header)
    for step, stats in reduced.items():
        row = "{:<30}{:15.4f}{:15.4f}{:15.4f}".format(
            step, stats["min"], stats["max"], stats["mean"]
        )
        self._print(row)

def log_global_timings(self, root=0):
    """Reduce per-substep timer totals across MPI ranks and report them.

    For each entry in ``self._timer.times`` this computes the min, max and
    mean over all ranks (the mean is a SUM reduction divided by the
    communicator size on the root rank), prints the result as a table, and
    emits the reduced values as a JSON log line.

    Args:
        root: rank on which reduction results are valid and the mean is
            normalized (default 0).
    """
    is_root = self.rank == root
    recvbuf = np.array(0.0)
    reduced = {}
    for name, value in self._timer.times.items():
        reduced[name] = {}
        for label, op in [("min", MPI.MIN), ("max", MPI.MAX), ("mean", MPI.SUM)]:
            # Reduce this rank's cumulative time into recvbuf; only the root
            # rank's recvbuf is meaningful after Reduce.
            self.comm.Reduce(np.array(value), recvbuf, op=op)
            if is_root and label == "mean":
                recvbuf /= self.comm.Get_size()
            reduced[name][label] = recvbuf.copy().item()
        # NOTE(review): this call and the f-string log line after the loop
        # appear to be pre-change lines from the rendered diff (they duplicate
        # the _print_timings/JSON logging below, and _print_timing was removed
        # by this commit) — confirm against the actual file before relying on
        # this block as shown.
        self._print_timing(
            name, reduced[name]["min"], reduced[name]["max"], reduced[name]["mean"]
        )
    self._log_info(f"python_timing:{json.dumps(reduced)}")
    self._print_timings(reduced)
    log_out = {
        "steps": reduced,
        "units": "[s], cumulative and reduced across ranks",
    }
    self._log_info(json.dumps({"python_timing": log_out}))

def _step_prephysics(self) -> Diagnostics:

Expand Down Expand Up @@ -529,6 +531,11 @@ def _apply_postphysics_to_dycore_state(self) -> Diagnostics:
)
return diagnostics

def _intermediate_restarts(self) -> Diagnostics:
    """Ask the wrapper to write intermediate restart files.

    The wrapper decides internally whether restarts are actually written
    (enabled via the namelist, per this commit's description); this substep
    contributes no diagnostics of its own.
    """
    self._log_info("Saving intermediate restarts if enabled.")
    self._fv3gfs.save_intermediate_restart_if_enabled()
    return {}

def __iter__(
self,
) -> Iterator[Tuple[cftime.DatetimeJulian, Dict[str, xr.DataArray]]]:
Expand All @@ -551,6 +558,7 @@ def __iter__(
),
self._compute_postphysics,
self.monitor("python", self._apply_postphysics_to_dycore_state),
self._intermediate_restarts,
]:
with self._timer.clock(substep.__name__):
diagnostics.update(substep())
Expand Down
2 changes: 2 additions & 0 deletions workflows/prognostic_c48_run/runtime/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def main():
for diag_file in diag_files:
diag_file.flush()

loop.log_global_timings()


if __name__ == "__main__":

Expand Down

0 comments on commit 8b937e1

Please sign in to comment.