Skip to content

Commit

Permalink
refactor print summary (#3243)
Browse files Browse the repository at this point in the history
Signed-off-by: Jinzhe Zeng <[email protected]>
  • Loading branch information
njzjz authored Feb 8, 2024
1 parent b7f1239 commit 5ad3d96
Show file tree
Hide file tree
Showing 8 changed files with 245 additions and 88 deletions.
46 changes: 46 additions & 0 deletions deepmd/env.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,38 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import logging
import os
from configparser import (
ConfigParser,
)
from pathlib import (
Path,
)
from typing import (
Dict,
Tuple,
)

import numpy as np

import deepmd.lib

__all__ = [
"GLOBAL_NP_FLOAT_PRECISION",
"GLOBAL_ENER_FLOAT_PRECISION",
"global_float_prec",
"GLOBAL_CONFIG",
"SHARED_LIB_MODULE",
"SHARED_LIB_DIR",
]

log = logging.getLogger(__name__)


SHARED_LIB_MODULE = "lib"
SHARED_LIB_DIR = Path(deepmd.lib.__path__[0])
CONFIG_FILE = SHARED_LIB_DIR / "run_config.ini"


# FLOAT_PREC
dp_float_prec = os.environ.get("DP_INTERFACE_PREC", "high").lower()
if dp_float_prec in ("high", ""):
Expand Down Expand Up @@ -111,3 +129,31 @@ def get_default_nthreads() -> Tuple[int, int]:
os.environ.get("TF_INTRA_OP_PARALLELISM_THREADS", "0"),
)
)


def _get_package_constants(
    config_file: Path = CONFIG_FILE,
) -> Dict[str, str]:
    """Load the package constants that CMake recorded at compile time.

    Parameters
    ----------
    config_file : Path, optional
        location of the INI file to parse, by default ``CONFIG_FILE``

    Returns
    -------
    Dict[str, str]
        all key/value pairs found in the ``CONFIG`` section

    Raises
    ------
    FileNotFoundError
        if ``config_file`` is not an existing regular file
    """
    if config_file.is_file():
        parser = ConfigParser()
        parser.read(config_file)
        section_items = parser.items("CONFIG")
        return dict(section_items)
    # Nothing to parse: fail with an explicit hint instead of continuing.
    raise FileNotFoundError(
        f"CONFIG file not found at {config_file}. "
        "Please check if the package is installed correctly."
    )


GLOBAL_CONFIG = _get_package_constants()
33 changes: 33 additions & 0 deletions deepmd/pt/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import torch
import torch.distributed as dist
import torch.version
from torch.distributed.elastic.multiprocessing.errors import (
record,
)
Expand Down Expand Up @@ -48,6 +49,9 @@
from deepmd.pt.utils.dataloader import (
DpLoaderSet,
)
from deepmd.pt.utils.env import (
DEVICE,
)
from deepmd.pt.utils.finetune import (
change_finetune_model_params,
)
Expand All @@ -57,6 +61,7 @@
from deepmd.pt.utils.stat import (
make_stat_input,
)
from deepmd.utils.summary import SummaryPrinter as BaseSummaryPrinter

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -238,8 +243,36 @@ def prepare_trainer_input_single(
return trainer


class SummaryPrinter(BaseSummaryPrinter):
    """Summary printer specialised for the PyTorch backend."""

    def is_built_with_cuda(self) -> bool:
        """Return True when ``torch.version.cuda`` is set."""
        cuda_version = torch.version.cuda
        return cuda_version is not None

    def is_built_with_rocm(self) -> bool:
        """Return True when ``torch.version.hip`` is set."""
        hip_version = torch.version.hip
        return hip_version is not None

    def get_compute_device(self) -> str:
        """Return the string form of the imported ``DEVICE``."""
        device_name = str(DEVICE)
        return device_name

    def get_ngpus(self) -> int:
        """Return the GPU count reported by ``torch.cuda.device_count``."""
        gpu_count = torch.cuda.device_count()
        return gpu_count

    def get_backend_info(self) -> dict:
        """Return the backend name and the PyTorch version string.

        The version entry combines ``torch.__version__`` with the first
        11 characters of ``torch.version.git_version``.
        """
        release = torch.__version__
        commit = torch.version.git_version[:11]
        info = {}
        info["Backend"] = "PyTorch"
        info["PT ver"] = f"v{release}-g{commit}"
        return info


def train(FLAGS):
log.info("Configuration path: %s", FLAGS.INPUT)
SummaryPrinter()()
with open(FLAGS.INPUT) as fin:
config = json.load(fin)
trainer = get_trainer(
Expand Down
6 changes: 0 additions & 6 deletions deepmd/tf/entrypoints/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@
Model,
)
from deepmd.tf.train.run_options import (
BUILD,
CITATION,
WELCOME,
RunOptions,
)
from deepmd.tf.train.trainer import (
Expand Down Expand Up @@ -159,9 +156,6 @@ def train(
dtype=tf.string,
)

for message in WELCOME + CITATION + BUILD:
log.info(message)

run_opt.print_resource_summary()
if origin_type_map is not None:
jdata["model"]["origin_type_map"] = origin_type_map
Expand Down
33 changes: 4 additions & 29 deletions deepmd/tf/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
import ctypes
import os
import platform
from configparser import (
ConfigParser,
)
from importlib import (
import_module,
reload,
Expand All @@ -17,18 +14,19 @@
from typing import (
TYPE_CHECKING,
Any,
Dict,
)

import numpy as np
from packaging.version import (
Version,
)

import deepmd.lib
from deepmd.env import (
GLOBAL_CONFIG,
GLOBAL_ENER_FLOAT_PRECISION,
GLOBAL_NP_FLOAT_PRECISION,
SHARED_LIB_DIR,
SHARED_LIB_MODULE,
)
from deepmd.env import get_default_nthreads as get_tf_default_nthreads
from deepmd.env import (
Expand Down Expand Up @@ -112,11 +110,9 @@ def dlopen_library(module: str, filename: str):
"ATTENTION_LAYER_PATTERN",
"REMOVE_SUFFIX_DICT",
"TF_VERSION",
"tf_py_version",
]

SHARED_LIB_MODULE = "lib"
SHARED_LIB_DIR = Path(deepmd.lib.__path__[0])
CONFIG_FILE = SHARED_LIB_DIR / "run_config.ini"

# Python library version
try:
Expand Down Expand Up @@ -398,27 +394,6 @@ def get_module(module_name: str) -> "ModuleType":
return module


def _get_package_constants(
    config_file: Path = CONFIG_FILE,
) -> Dict[str, str]:
    """Read package constants set at compile time by CMake to dictionary.

    Parameters
    ----------
    config_file : Path, optional
        path to CONFIG file, by default ``CONFIG_FILE``

    Returns
    -------
    Dict[str, str]
        dictionary with package constants

    Raises
    ------
    FileNotFoundError
        if ``config_file`` is not an existing regular file
    """
    # ConfigParser.read() silently skips files it cannot open, so without
    # this check a missing file would only surface later as a
    # NoSectionError for "CONFIG", which hides the real cause.
    if not config_file.is_file():
        raise FileNotFoundError(
            f"CONFIG file not found at {config_file}. "
            "Please check if the package is installed correctly."
        )
    config = ConfigParser()
    config.read(config_file)
    return dict(config.items("CONFIG"))


GLOBAL_CONFIG = _get_package_constants()
if GLOBAL_CONFIG["enable_tensorflow"] == "0":
raise RuntimeError(
"TensorFlow backend is not built. To enable it, "
Expand Down
84 changes: 33 additions & 51 deletions deepmd/tf/train/run_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,57 +22,57 @@
from deepmd.tf.env import (
GLOBAL_CONFIG,
TF_VERSION,
get_tf_default_nthreads,
global_float_prec,
tf,
)
from deepmd.tf.loggers import (
set_log_handles,
)
from deepmd.utils.summary import SummaryPrinter as BaseSummaryPrinter

if TYPE_CHECKING:
import horovod.tensorflow as HVD


__all__ = [
"WELCOME",
"CITATION",
"BUILD",
"RunOptions",
]

log = logging.getLogger(__name__)


# Generated with http://patorjk.com/software/taag (font: Big).
WELCOME = (
r" _____ _____ __ __ _____ _ _ _ ",
r"| __ \ | __ \ | \/ || __ \ | | (_)| | ",
r"| | | | ___ ___ | |__) || \ / || | | | ______ | | __ _ | |_ ",
r"| | | | / _ \ / _ \| ___/ | |\/| || | | ||______|| |/ /| || __|",
r"| |__| || __/| __/| | | | | || |__| | | < | || |_ ",
r"|_____/ \___| \___||_| |_| |_||_____/ |_|\_\|_| \__|",
)
class SummaryPrinter(BaseSummaryPrinter):
"""Summary printer for TensorFlow."""

CITATION = (
"Please read and cite:",
"Wang, Zhang, Han and E, Comput.Phys.Comm. 228, 178-184 (2018)",
"Zeng et al, J. Chem. Phys., 159, 054801 (2023)",
"See https://deepmd.rtfd.io/credits/ for details.",
)
def __init__(self, compute_device: str, ngpus: int) -> None:
    """Initialise the base printer and remember the given values.

    Parameters
    ----------
    compute_device : str
        stored as ``self.compute_device``
    ngpus : int
        stored as ``self.ngpus``
    """
    super().__init__()
    self.compute_device, self.ngpus = compute_device, ngpus

_sep = "\n "
BUILD = (
f"installed to: {GLOBAL_CONFIG['install_prefix']}",
f"source : {GLOBAL_CONFIG['git_summ']}",
f"source brach: {GLOBAL_CONFIG['git_branch']}",
f"source commit: {GLOBAL_CONFIG['git_hash']}",
f"source commit at: {GLOBAL_CONFIG['git_date']}",
f"build float prec: {global_float_prec}",
f"build variant: {GLOBAL_CONFIG['dp_variant']}",
f"build with tf inc: {GLOBAL_CONFIG['tf_include_dir']}",
f"build with tf lib: {GLOBAL_CONFIG['tf_libs'].replace(';', _sep)}",
)
def is_built_with_cuda(self) -> bool:
    """Return the result of ``tf.test.is_built_with_cuda()``."""
    built_with_cuda = tf.test.is_built_with_cuda()
    return built_with_cuda

def is_built_with_rocm(self) -> bool:
    """Check if the backend is built with ROCm.

    Returns
    -------
    bool
        False when ``tf.test`` has no ``is_built_with_rocm`` attribute,
        otherwise the result of ``tf.test.is_built_with_rocm()``
    """
    # Guard the lookup: tf.test may lack is_built_with_rocm (presumably on
    # older TensorFlow releases); report False instead of raising
    # AttributeError while printing the summary.
    return hasattr(tf.test, "is_built_with_rocm") and tf.test.is_built_with_rocm()

def get_compute_device(self) -> str:
    """Return the ``compute_device`` attribute stored on this instance."""
    device = self.compute_device
    return device

def get_ngpus(self) -> int:
    """Return the ``ngpus`` attribute stored on this instance."""
    gpu_count = self.ngpus
    return gpu_count

def get_backend_info(self) -> dict:
    """Return a dictionary describing the TensorFlow backend.

    The entries are the backend name, ``tf.version.GIT_VERSION``,
    ``TF_VERSION``, and the ``tf_include_dir`` / ``tf_libs`` values from
    ``GLOBAL_CONFIG`` with every ``;`` replaced by a newline.
    """
    info = {"Backend": "TensorFlow"}
    info["TF ver"] = tf.version.GIT_VERSION
    info["build with TF ver"] = TF_VERSION
    # Both build-path entries get the same ';' -> newline treatment.
    build_entries = (
        ("build with TF inc", "tf_include_dir"),
        ("build with TF lib", "tf_libs"),
    )
    for label, config_key in build_entries:
        info[label] = GLOBAL_CONFIG[config_key].replace(";", "\n")
    return info


class RunOptions:
Expand Down Expand Up @@ -148,25 +148,7 @@ def is_chief(self):

def print_resource_summary(self):
"""Print build and current running cluster configuration summary."""
log.info("---Summary of the training---------------------------------------")
if self.is_distrib:
log.info("distributed")
log.info(f"world size: {self.world_size}")
log.info(f"my rank: {self.my_rank}")
log.info(f"node list: {self.nodelist}")
log.info(f"running on: {self.nodename}")
log.info(f"computing device: {self.my_device}")
if tf.test.is_built_with_cuda():
env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "unset")
log.info(f"CUDA_VISIBLE_DEVICES: {env_value}")
if hasattr(tf.test, "is_built_with_rocm") and tf.test.is_built_with_rocm():
env_value = os.environ.get("HIP_VISIBLE_DEVICES", "unset")
log.info(f"HIP_VISIBLE_DEVICES: {env_value}")
log.info(f"Count of visible GPU: {len(self.gpus or [])}")
intra, inter = get_tf_default_nthreads()
log.info(f"num_intra_threads: {intra:d}")
log.info(f"num_inter_threads: {inter:d}")
log.info("-----------------------------------------------------------------")
SummaryPrinter(self.my_device, len(self.gpus or []))()

def _setup_logger(
self,
Expand Down
2 changes: 1 addition & 1 deletion deepmd/utils/hostlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ def get_host_names() -> Tuple[str, List[str]]:
if comm.Get_size() == 1:
return host_name, [host_name]
host_names = comm.allgather(host_name)
return host_name, list(set(host_names))
return host_name, host_names
Loading

0 comments on commit 5ad3d96

Please sign in to comment.