From 7fb7f36ed7cddf5830073e775854036ca736a584 Mon Sep 17 00:00:00 2001 From: Mamzi Bayatpour Date: Wed, 25 Oct 2023 12:30:21 -0700 Subject: [PATCH] TL/MLX5: adding ip over ib mcast helper functions --- src/components/tl/mlx5/Makefile.am | 1 + .../tl/mlx5/mcast/tl_mlx5_mcast_helper.c | 198 ++++++++++++++++++ .../tl/mlx5/mcast/tl_mlx5_mcast_helper.h | 7 +- src/components/tl/mlx5/tl_mlx5.c | 29 +++ src/utils/ucc_compiler_def.h | 1 + 5 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c diff --git a/src/components/tl/mlx5/Makefile.am b/src/components/tl/mlx5/Makefile.am index 11aec4e5b6..2ac9dc91c7 100644 --- a/src/components/tl/mlx5/Makefile.am +++ b/src/components/tl/mlx5/Makefile.am @@ -23,6 +23,7 @@ mcast = \ mcast/p2p/ucc_tl_mlx5_mcast_p2p.c \ mcast/tl_mlx5_mcast_progress.h \ mcast/tl_mlx5_mcast_helper.h \ + mcast/tl_mlx5_mcast_helper.c \ mcast/tl_mlx5_mcast_team.c sources = \ diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c new file mode 100644 index 0000000000..137e49d24e --- /dev/null +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.c @@ -0,0 +1,198 @@ +/** + * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. + */ + +#include "tl_mlx5_mcast_helper.h" +#include +#include +#include + +#define PREF "/sys/class/net/" +#define SUFF "/device/resource" +#define MAX_STR_LEN 128 + +static ucc_status_t ucc_tl_mlx5_get_ipoib_ip(char *ifname, struct sockaddr_storage *addr) +{ + ucc_status_t status = UCC_ERR_NO_RESOURCE; + struct ifaddrs *ifaddr = NULL; + struct ifaddrs *ifa = NULL; + int is_ipv4 = 0; + int family; + int n; + int is_up; + + if (getifaddrs(&ifaddr) == -1) { + return UCC_ERR_NO_RESOURCE; + } + + for (ifa = ifaddr, n = 0; ifa != NULL; ifa=ifa->ifa_next, n++) { + if (ifa->ifa_addr == NULL) { + continue; + } + + family = ifa->ifa_addr->sa_family; + if (family != AF_INET && family != AF_INET6) { + continue; + } + + is_up = (ifa->ifa_flags & IFF_UP) == IFF_UP; + is_ipv4 = (family == AF_INET) ? 1 : 0; + + if (is_up && !strncmp(ifa->ifa_name, ifname, strlen(ifname)) ) { + if (is_ipv4) { + memcpy((struct sockaddr_in *) addr, + (struct sockaddr_in *) ifa->ifa_addr, + sizeof(struct sockaddr_in)); + } else { + memcpy((struct sockaddr_in6 *) addr, + (struct sockaddr_in6 *) ifa->ifa_addr, + sizeof(struct sockaddr_in6)); + } + + status = UCC_OK; + break; + } + } + + freeifaddrs(ifaddr); + return status; +} + +static int cmp_files(char *f1, char *f2) +{ + int answer = 0; + FILE *fp1; + FILE *fp2; + int ch1; + int ch2; + + if ((fp1 = fopen(f1, "r")) == NULL) { + goto out; + } else if ((fp2 = fopen(f2, "r")) == NULL) { + goto close; + } + + do { + ch1 = getc(fp1); + ch2 = getc(fp2); + } while((ch1 != EOF) && (ch2 != EOF) && (ch1 == ch2)); + + + if (ch1 == ch2) { + answer = 1; + } + + if (fclose(fp2) != 0) { + fprintf(stderr, "fclose() failed errno %d\n", errno); + } +close: + if (fclose(fp1) != 0) { + fprintf(stderr, "fclose() failed errno %d\n", errno); + } +out: + return answer; +} + +static int port_from_file(char *port_file) +{ + int res = -1; + char buf1[MAX_STR_LEN]; + char buf2[MAX_STR_LEN]; + FILE *fp; + int len; + + if ((fp = fopen(port_file, "r")) == NULL) { + return -1; + } + + if (fgets(buf1, MAX_STR_LEN - 1, fp) == NULL) { + goto out; + } + + len = strlen(buf1) - 2; + strncpy(buf2, buf1 + 2, len); + buf2[len] = 0; + res = atoi(buf2); + +out: + if (fclose(fp) != 0) { + fprintf(stderr, "fclose() failed errno %d\n", errno); + } + return res; +} + +static ucc_status_t dev2if(char *dev_name, char *port, struct sockaddr_storage + *rdma_src_addr) +{ + ucc_status_t status = UCC_OK; + glob_t glob_el = {0,}; + char dev_file [MAX_STR_LEN]; + char port_file[MAX_STR_LEN]; + char net_file [MAX_STR_LEN]; + char if_name [MAX_STR_LEN]; + char glob_path[MAX_STR_LEN]; + int i; + char **p; + int len; + + sprintf(glob_path, PREF"*"); + + sprintf(dev_file, "/sys/class/infiniband/%s"SUFF, dev_name); + if (glob(glob_path, 0, 0, &glob_el)) { + return UCC_ERR_NO_RESOURCE; + } + p = glob_el.gl_pathv; + + if (glob_el.gl_pathc >= 1) { + for (i = 0; i < glob_el.gl_pathc; i++, p++) { + sprintf(port_file, "%s/dev_id", *p); + sprintf(net_file, "%s"SUFF, *p); + if(cmp_files(net_file, dev_file) && port != NULL && + port_from_file(port_file) == atoi(port) - 1) { + len = strlen(net_file) - strlen(PREF) - strlen(SUFF); + strncpy(if_name, net_file + strlen(PREF), len); + if_name[len] = 0; + + status = ucc_tl_mlx5_get_ipoib_ip(if_name, rdma_src_addr); + if (UCC_OK == status) { + break; + } + } + } + } + + globfree(&glob_el); + return status; +} + +ucc_status_t ucc_tl_mlx5_probe_ip_over_ib(const char* ib_dev, struct + sockaddr_storage *addr) +{ + char *ib = NULL; + char *ib_name = NULL; + char *port = NULL; + ucc_status_t status; + struct sockaddr_storage rdma_src_addr; + + if (NULL == ib_dev) { + status = UCC_ERR_NO_RESOURCE; + } else { + ib = strdup(ib_dev); + if (NULL == ib) { + status = UCC_ERR_NO_MEMORY; + } else { + ucc_string_split(ib, ":", 2, &ib_name, &port); + status = dev2if(ib_name, port, &rdma_src_addr); + ucc_free(ib); + } + } + + if (UCC_OK == status && addr) { + *addr = rdma_src_addr; + } + + return status; +} + diff --git a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h index 9ca529f7b9..3935c4c98c 100644 --- a/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h +++ b/src/components/tl/mlx5/mcast/tl_mlx5_mcast_helper.h @@ -352,7 +352,10 @@ static inline ucc_status_t ucc_tl_mlx5_mcast_reliable(ucc_tl_mlx5_mcast_coll_com return UCC_INPROGRESS; } -ucc_status_t ucc_tl_setup_mcast(ucc_tl_mlx5_mcast_coll_comm_t *comm); +ucc_status_t ucc_tl_mlx5_probe_ip_over_ib(const char* ib_dev_list, + struct sockaddr_storage *addr); + +ucc_status_t ucc_tl_mlx5_setup_mcast(ucc_tl_mlx5_mcast_coll_comm_t *comm); ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_tl_mlx5_mcast_coll_comm_t *comm); @@ -360,6 +363,6 @@ ucc_status_t ucc_tl_mlx5_mcast_init_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_status_t ucc_tl_mlx5_mcast_setup_qps(ucc_tl_mlx5_mcast_coll_context_t *ctx, ucc_tl_mlx5_mcast_coll_comm_t *comm); -ucc_status_t ucc_tl_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm); +ucc_status_t ucc_tl_mlx5_clean_mcast_comm(ucc_tl_mlx5_mcast_coll_comm_t *comm); #endif /* TL_MLX5_MCAST_HELPER_H_ */ diff --git a/src/components/tl/mlx5/tl_mlx5.c b/src/components/tl/mlx5/tl_mlx5.c index bab4808ece..0210f2302c 100644 --- a/src/components/tl/mlx5/tl_mlx5.c +++ b/src/components/tl/mlx5/tl_mlx5.c @@ -67,6 +67,27 @@ static ucc_config_field_t ucc_tl_mlx5_lib_config_table[] = { ucc_offsetof(ucc_tl_mlx5_lib_config_t, qp_conf.qp_max_atomic), UCC_CONFIG_TYPE_UINT}, + {"MCAST_SX_DEPTH", "512", "Send context depth of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.sx_depth), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_SX_INLINE", "128", "Minimal size for inline data send in Mcast", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.sx_inline), + UCC_CONFIG_TYPE_MEMUNITS}, + + {"MCAST_RX_DEPTH", "4096", "Recv context depth of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.rx_depth), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_POST_RECV_THRESH", "64", + "Threshold for posting recv into rx ctx of the Mcast comm", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.post_recv_thresh), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_WINDOW_SIZE", "64", "Reliability Mcast window size", + ucc_offsetof(ucc_tl_mlx5_lib_config_t, mcast_conf.wsize), + UCC_CONFIG_TYPE_INT}, + {NULL}}; static ucc_config_field_t ucc_tl_mlx5_context_config_table[] = { @@ -77,6 +98,14 @@ static ucc_config_field_t ucc_tl_mlx5_context_config_table[] = { ucc_offsetof(ucc_tl_mlx5_context_config_t, devices), UCC_CONFIG_TYPE_STRING_ARRAY}, + {"MCAST_TIMEOUT", "10000", "Timeout [usec] for the reliability NACK in Mcast", + ucc_offsetof(ucc_tl_mlx5_context_config_t, mcast_ctx_conf.timeout), + UCC_CONFIG_TYPE_INT}, + + {"MCAST_NET_DEVICE", "", "Specifies which network device to use for Mcast", + ucc_offsetof(ucc_tl_mlx5_context_config_t, mcast_ctx_conf.ib_dev_name), + UCC_CONFIG_TYPE_STRING}, + {NULL}}; UCC_CLASS_DEFINE_NEW_FUNC(ucc_tl_mlx5_lib_t, ucc_base_lib_t, diff --git a/src/utils/ucc_compiler_def.h b/src/utils/ucc_compiler_def.h index 41d13ecb78..b204df67f3 100644 --- a/src/utils/ucc_compiler_def.h +++ b/src/utils/ucc_compiler_def.h @@ -26,6 +26,7 @@ #define ucc_snprintf_safe snprintf #define ucc_likely ucs_likely #define ucc_unlikely ucs_unlikely +#define ucc_string_split ucs_string_split /** * Prevent compiler from reordering instructions