Fast graph generation in fv3fit (#1991)

The current graph generation code in fv3fit is very slow, and requires an external dependency. This PR uses indexing-based logic on a cubed sphere instead to greatly reduce the time needed to generate the graph, and remove the external dependency. Based on and may be merged into #1986 . Significant internal changes: - Greatly reduced build_graph execution time and removed dependency on geopy Requirement changes: - Removed dependency of fv3fit on geopy
ai2cm · Aug 10, 2022 · 713ed38 · 713ed38
1 parent 5385445
commit 713ed38
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 46 deletions.
diff --git a/external/fv3fit/fv3fit/pytorch/graph/graph.py b/external/fv3fit/fv3fit/pytorch/graph/graph.py
@@ -83,19 +83,10 @@ def train_graph_model(
     train_batches: tf.data.Dataset,
     validation_batches: Optional[tf.data.Dataset],
 ):
-
     """
     Train a graph network.
 
     Args:
-        train_batches: training data, as a dataset of Mapping[str, tf.Tensor]
-        validation_batches: validation data, as a dataset of Mapping[str, tf.Tensor]
-        build_model: the function which produces the pytorch model
-            from input and output samples. The models returned must take a list of
-            tensors as input and return a list of tensors as output.
-        input_variables: names of inputs for the pytorch model
-        output_variables: names of outputs for the pytorch model
-        n_epoch: number of epochs
         hyperparameters: configuration for training
         train_batches: training data, as a dataset of Mapping[str, tf.Tensor]
         validation_batches: validation data, as a dataset of Mapping[str, tf.Tensor]
@@ -156,7 +147,7 @@ def build_model(graph, graph_network):
         graph: configuration for building graph
         graph_network: configuration of the graph network
     """
-    graph_data = build_graph(graph)
+    graph_data = build_graph(graph.nx_tile)
     g = dgl.graph(graph_data)
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     train_model = GraphNetwork(

diff --git a/external/fv3fit/fv3fit/pytorch/graph/graph_builder.py b/external/fv3fit/fv3fit/pytorch/graph/graph_builder.py
@@ -1,46 +1,57 @@
-import torch
+from typing import Tuple
 import numpy as np
-import geopy.distance
-from vcm.catalog import catalog
 import dataclasses
+from fv3fit.keras._models.shared.halos import append_halos
+import xarray as xr
 
 
 @dataclasses.dataclass
 class GraphConfig:
     """
     Attributes:
-        neighbor: number of nearest neighbor grids
-        coarsen: 1 if the full model resolution is used, othewise data will be coarsen
-        resolution: Model resolution to load the corresponding lat and lon.
+        nx_tile: number of horizontal grid points on each tile of the cubed sphere
     """
 
-    neighbor: int = 10
-    coarsen: int = 8
-    resolution: str = "grid/c48"
+    # TODO: this should not be configurable, it should be determined
+    # by the shape of the input data
+    nx_tile: int = 48
 
 
-def build_graph(config: GraphConfig) -> tuple:
-    nodes = []
-    edges = []
-
-    grid = catalog[config.resolution].read()
-    lat = grid.lat.load()
-    lon = grid.lon.load()
-
-    lat = lat[:, :: config.coarsen, :: config.coarsen].values.flatten()
-    lon = lon[:, :: config.coarsen, :: config.coarsen].values.flatten()
+def build_graph(nx_tile: int) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Returns two 1D arrays containing the start and end points of edges of the graph.
 
-    for i in range(0, len(lat)):
-        distance = np.zeros([len(lat)])
-        coords_1 = (lat[i], lon[i])
-        for j in range(0, len(lat)):
-            coords_2 = (lat[j], lon[j])
-            distance[j] = geopy.distance.geodesic(coords_1, coords_2).km
-        destination_grids = sorted(range(len(distance)), key=lambda i: distance[i])[
-            : config.neighbor
-        ]
-        nodes.append(np.repeat(i, config.neighbor, axis=0))
-        edges.append(destination_grids)
-    nodes = torch.tensor(nodes).flatten()
-    edges = torch.tensor(edges).flatten()
-    return (nodes, edges)
+    Args:
+        nx_tile: number of horizontal grid points on each tile of the cubed sphere
+    """
+    n_tile, nx, ny = 6, nx_tile, nx_tile
+    n_points = n_tile * nx * ny
+    ds = xr.Dataset(
+        data_vars={
+            "index": xr.DataArray(
+                np.arange(n_points).reshape((n_tile, nx, ny, 1)),
+                dims=["tile", "x", "y", "z"],
+            )
+        }
+    )
+    ds = append_halos(ds, n_halo=1).squeeze("z")
+    index = ds["index"].values
+    total_edges = n_points * 5
+    out = np.empty((total_edges, 2), dtype=int)
+    # left connections
+    out[:n_points, 0] = index[:, 1:-1, 1:-1].flatten()
+    out[:n_points, 1] = index[:, :-2, 1:-1].flatten()
+    # right connections
+    out[n_points : 2 * n_points, 0] = index[:, 1:-1, 1:-1].flatten()
+    out[n_points : 2 * n_points, 1] = index[:, 2:, 1:-1].flatten()
+    # up connections
+    out[2 * n_points : 3 * n_points, 0] = index[:, 1:-1, 1:-1].flatten()
+    out[2 * n_points : 3 * n_points, 1] = index[:, 1:-1, 2:].flatten()
+    # down connections
+    out[3 * n_points : 4 * n_points, 0] = index[:, 1:-1, 1:-1].flatten()
+    out[3 * n_points : 4 * n_points, 1] = index[:, 1:-1, :-2].flatten()
+    # self-connections
+    out[4 * n_points : 5 * n_points, 0] = index[:, 1:-1, 1:-1].flatten()
+    out[4 * n_points : 5 * n_points, 1] = index[:, 1:-1, 1:-1].flatten()
+
+    return out[:, 0], out[:, 1]
diff --git a/external/fv3fit/setup.py b/external/fv3fit/setup.py
@@ -23,12 +23,12 @@
     "torch",
     "torchvision",
     "dgl",
-    "geopy",
     "tensorflow-datasets",
 ]
 
 setup_requirements = []
 
+
 test_requirements = ["pytest"]
 
 setup(

diff --git a/external/fv3fit/tests/training/test_graph.py b/external/fv3fit/tests/training/test_graph.py
@@ -7,6 +7,7 @@
 from fv3fit.pytorch.predict import PytorchModel
 import pytest
 from fv3fit.pytorch.graph.graph import GraphHyperparameters
+from fv3fit.pytorch.graph.graph_builder import build_graph, GraphConfig
 
 GENERAL_TRAINING_TYPES = [
     "graph",
@@ -38,7 +39,9 @@ def train_identity_model(hyperparameters=None):
     np.random.seed(2)
     sample_test = get_uniform_sample_func(size=(grid, nz), low=low, high=high)
     test_dataset = xr.Dataset({"a": sample_test()})
-    hyperparameters = GraphHyperparameters(input_variable, output_variables)
+    hyperparameters = GraphHyperparameters(
+        input_variable, output_variables, graph=GraphConfig(nx_tile=6)
+    )
     train = fv3fit.get_training_function("graph")
     model = train(hyperparameters, train_dataset, val_tfdataset)
     return TrainingResult(model, output_variables, test_dataset, hyperparameters)
@@ -115,3 +118,19 @@ def sample_func():
         )
 
     return sample_func
+
+
+def test_graph_builder():
+    graph = build_graph(2)
+    edges = list(zip(graph[0], graph[1]))
+    assert (0, 1) in edges  # right edge
+    assert (0, 2) in edges  # top edge
+    assert (0, 0) in edges  # self edge
+    # Note due to the [tile, x, y] direction convention for index
+    # ordering, the node index orders are transposed compared to the
+    # MPI rank ordering used for cubed sphere decomposition in fv3gfs.
+    # If the order were [tile, y, x] then the node indices would be
+    # transposed in (x, y) on each tile.
+    assert (0, 21) in edges  # down edge
+    assert (0, 19) in edges  # left edge
+    assert len(edges) == 5 * 6 * 4
diff --git a/external/fv3gfs-fortran b/external/fv3gfs-fortran
+10 −7		.circleci/config.yml
+0 −4		FV3/atmos_cubed_sphere/driver/fvGFS/atmosphere.F90
+8 −0		FV3/atmos_cubed_sphere/model/dyn_core.F90
+8 −1		FV3/atmos_cubed_sphere/model/fv_mapz.F90
+43 −0		FV3/atmos_cubed_sphere/model/sw_core.F90
+29 −14		FV3/atmos_cubed_sphere/tools/coarse_grained_diagnostics.F90
+27 −19		FV3/atmos_cubed_sphere/tools/coarse_grained_restart_files.F90
+5 −21		FV3/atmos_model.F90
+19 −1		FV3/coarse_graining/coarse_graining.F90
+0 −34		FV3/gfsphysics/GFS_layer/GFS_physics_driver.F90
+0 −15		FV3/gfsphysics/GFS_layer/GFS_typedefs.F90
+4 −70		FV3/io/FV3GFS_io.F90
+30 −30		...pytest/_regtest_outputs/test_regression.test_fv3_wrapper_regression[pressure-level-coarse-graining.yml].out
+1 −1		README.md
+2 −4		docker/Dockerfile
+30 −30		...ytest/_regtest_outputs/test_regression.test_regression_native[Linux-pressure-level-coarse-graining.yml].out
+2 −2		tests/pytest/conftest.py
+0 −157		tests/pytest/prescribed_ssts.py
+445 −445		tests/pytest/reference/circleci/default/md5_serialize.txt
+30 −30		tests/pytest/reference/circleci/pressure-level-coarse-graining/md5.txt
+0 −77		tests/pytest/test_regression.py
+3 −3		tests/serialized_test_data_generation/Makefile
+1 −5		tests/serialized_test_data_generation/README.md
+1 −7		tests/serialized_test_data_generation/serialbox_to_netcdf.py