From 38e2d5b152ac3f85e189dc1fc60a58bb09da7feb Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Wed, 8 May 2024 15:32:14 -0400 Subject: [PATCH 1/9] lib/net: handle bonded interfaces as any other The original version of the bonding driver required applications to directly set many parameters of the members of a bonded interface. Since this is no longer the case, this commit simplifies the initialization code by leveraging the new interface. --- lib/net.c | 876 ++++++++++++++++++++++++------------------------------ 1 file changed, 396 insertions(+), 480 deletions(-) diff --git a/lib/net.c b/lib/net.c index bcfaf5a7..7d92f6e3 100644 --- a/lib/net.c +++ b/lib/net.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -435,11 +436,13 @@ find_num_numa_nodes(void) } static int -configure_queue(struct gatekeeper_if *iface, uint16_t port_id, - uint16_t queue_id, enum queue_type ty, struct rte_mempool *mp) +configure_queue(const struct gatekeeper_if *iface, uint16_t queue_id, + enum queue_type ty, struct rte_mempool *mp) { + uint16_t port_id = iface->id; + /* - * Function slave_configure() of the bond driver (see file + * The bonding driver (see file * dependencies/dpdk/drivers/net/bonding/rte_eth_bond_pmd.c) passes * rte_eth_dev_socket_id(port_id) for the parameter socket_id * of rte_eth_rx_queue_setup() and rte_eth_tx_queue_setup(). @@ -448,7 +451,7 @@ configure_queue(struct gatekeeper_if *iface, uint16_t port_id, * the function rte_eth_dma_zone_reserve() will fail when * when the driver of the NIC calls it. * - * Although this issue is only raised while using the bond driver, + * Although this issue is only raised while using the bonding driver, * it makes sense to have the RX and TX queues on the same * NUMA socket to which the underlying Ethernet device is connected. */ @@ -459,9 +462,9 @@ configure_queue(struct gatekeeper_if *iface, uint16_t port_id, case QUEUE_TYPE_RX: ret = rte_eth_rx_queue_setup(port_id, queue_id, iface->num_rx_desc, numa_node, NULL, mp); - if (ret < 0) { - G_LOG(ERR, "%s(): failed to configure RX queue %hu of port %hhu of interface %s (errno=%d): %s\n", - __func__, queue_id, port_id, iface->name, + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to configure RX queue %u (errno=%i): %s\n", + __func__, iface->name, queue_id, -ret, rte_strerror(-ret)); return ret; } @@ -469,17 +472,17 @@ configure_queue(struct gatekeeper_if *iface, uint16_t port_id, case QUEUE_TYPE_TX: ret = rte_eth_tx_queue_setup(port_id, queue_id, iface->num_tx_desc, numa_node, NULL); - if (ret < 0) { - G_LOG(ERR, "%s(): failed to configure TX queue %hu of port %hhu of interface %s (errno=%d): %s\n", - __func__, queue_id, port_id, iface->name, + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to configure TX queue %u (errno=%d): %s\n", + __func__, iface->name, queue_id, -ret, rte_strerror(-ret)); return ret; } break; default: - G_LOG(ERR, "%s(): unsupported queue type (%d)\n", - __func__, ty); - return -1; + G_LOG(CRIT, "%s(%s): bug: unsupported queue type (%d)\n", + __func__, iface->name, ty); + return -EINVAL; } return 0; @@ -500,72 +503,35 @@ int get_queue_id(struct gatekeeper_if *iface, enum queue_type ty, unsigned int lcore, struct rte_mempool *mp) { - int16_t *queues; + int16_t *queues, new_queue_id; int ret; - uint16_t port; - int16_t new_queue_id; - RTE_VERIFY(lcore < RTE_MAX_LCORE); - RTE_VERIFY(ty < QUEUE_TYPE_MAX); + if (unlikely(lcore >= RTE_MAX_LCORE || ty >= QUEUE_TYPE_MAX)) + return -EINVAL; queues = (ty == QUEUE_TYPE_RX) ? iface->rx_queues : iface->tx_queues; - if (queues[lcore] != GATEKEEPER_QUEUE_UNALLOCATED) goto queue; /* Get next queue identifier. */ new_queue_id = rte_atomic16_add_return(ty == QUEUE_TYPE_RX ? &iface->rx_queue_id : &iface->tx_queue_id, 1); - if (new_queue_id == GATEKEEPER_QUEUE_UNALLOCATED) { - G_LOG(ERR, "net: exhausted all %s queues for the %s interface; this is likely a bug\n", - (ty == QUEUE_TYPE_RX) ? "RX" : "TX", iface->name); - return -1; + if (unlikely(new_queue_id == GATEKEEPER_QUEUE_UNALLOCATED)) { + G_LOG(ERR, "%s(%s): exhausted all %s queues; this is likely a bug\n", + __func__, iface->name, + (ty == QUEUE_TYPE_RX) ? "RX" : "TX"); + return -ENOSPC; } queues[lcore] = new_queue_id; - /* - * Configure this queue on all ports of this interface. - * - * Note that if we are using a bonded port, it is not - * sufficient to only configure the queue on that bonded - * port. All slave ports must be configured and started - * before the bonded port can be started. - */ - for (port = 0; port < iface->num_ports; port++) { - ret = configure_queue(iface, iface->ports[port], - (uint16_t)new_queue_id, ty, mp); - if (ret < 0) - return ret; - } - - /* If there's a bonded port, configure it too. */ - if (iface_bonded(iface)) { - ret = configure_queue(iface, iface->id, - (uint16_t)new_queue_id, ty, mp); - if (ret < 0) - return ret; - } + ret = configure_queue(iface, new_queue_id, ty, mp); + if (unlikely(ret < 0)) + return ret; queue: return queues[lcore]; } -static void -stop_iface_ports(struct gatekeeper_if *iface, uint8_t nb_ports) -{ - uint8_t i; - for (i = 0; i < nb_ports; i++) - rte_eth_dev_stop(iface->ports[i]); -} - -static void -rm_slave_ports(struct gatekeeper_if *iface, uint8_t nb_slave_ports) -{ - uint8_t i; - for (i = 0; i < nb_slave_ports; i++) - rte_eth_bond_member_remove(iface->id, iface->ports[i]); -} - static void close_iface_ports(struct gatekeeper_if *iface, uint8_t nb_ports) { @@ -587,6 +553,33 @@ enum iface_destroy_cmd { IFACE_DESTROY_ALL, }; +static int +bonded_if_name(char *port_name, const struct gatekeeper_if *iface) +{ + /* + * The name of the bonded device must start with the name of + * the bonding driver. Otherwise, DPDK cannot identify + * the correct driver. + * + * The ID of the first port is used instead of the name of + * the interface (i.e. iface->name) because IF_NAMESIZE is + * small. + */ + int ret = snprintf(port_name, IF_NAMESIZE, "net_bonding%u", + iface->ports[0]); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): snprintf() failed (errno=%i): %s\n", + __func__, iface->name, -ret, strerror(-ret)); + return ret; + } + if (unlikely(ret >= IF_NAMESIZE)) { + G_LOG(ERR, "%s(%s): port name is too long (len=%i)\n", + __func__, iface->name, ret); + return -ENOSPC; + } + return 0; +} + static void destroy_iface(struct gatekeeper_if *iface, enum iface_destroy_cmd cmd) { @@ -602,18 +595,18 @@ destroy_iface(struct gatekeeper_if *iface, enum iface_destroy_cmd cmd) destroy_acls(&iface->ipv4_acls); /* FALLTHROUGH */ case IFACE_DESTROY_STOP: - /* Stop interface ports (bonded port is stopped below). */ - stop_iface_ports(iface, iface->num_ports); + rte_eth_dev_stop(iface->id); /* FALLTHROUGH */ case IFACE_DESTROY_INIT: - /* Remove any slave ports added to a bonded port. */ - if (iface_bonded(iface)) - rm_slave_ports(iface, iface->num_ports); /* FALLTHROUGH */ case IFACE_DESTROY_PORTS: /* Stop and close bonded port, if needed. */ - if (iface_bonded(iface)) - rte_eth_bond_free(iface->name); + if (iface_bonded(iface)) { + char if_name[IF_NAMESIZE]; + int ret = bonded_if_name(if_name, iface); + if (likely(ret == 0)) + rte_eth_bond_free(if_name); + } /* Close and free interface ports. */ close_iface_ports(iface, iface->num_ports); @@ -1002,6 +995,48 @@ i40e_disable_ipv6_tcp_udp_ports_from_inset(uint16_t port_id) RTE_DIM(pctypes)); } +static int +randomize_rss_key(struct gatekeeper_if *iface) +{ + uint16_t final_set_count; + unsigned int flags = iface->guarantee_random_entropy ? GRND_RANDOM : 0; + + /* + * To validate if the key generated is reasonable, the + * number of bits set to 1 in the key must be greater than + * 10% and less than 90% of the total bits in the key. + * min_num_set_bits and max_num_set_bits represent the lower + * and upper bound for the key. + */ + const uint16_t min_num_set_bits = iface->rss_key_len * 8 * 0.1; + const uint16_t max_num_set_bits = iface->rss_key_len * 8 * 0.9; + + do { + int number_of_bytes = 0; + uint8_t i; + + /* + * When the last parameter of the system call getrandom() + * (i.e flags) is zero, getrandom() uses the /dev/urandom pool. + */ + do { + int ret = getrandom(iface->rss_key + number_of_bytes, + iface->rss_key_len - number_of_bytes, flags); + if (ret < 0) + return ret; + number_of_bytes += ret; + } while (number_of_bytes < iface->rss_key_len); + + final_set_count = 0; + for (i = 0; i < iface->rss_key_len; i++) { + final_set_count += + __builtin_popcount(iface->rss_key[i]); + } + } while (final_set_count < min_num_set_bits || + final_set_count > max_num_set_bits); + return 0; +} + /* * Split up RTE_ETH_RSS_IP into IPv4-related and IPv6-related hash functions. * For each type of IP being used in Gatekeeper, check the supported @@ -1024,9 +1059,8 @@ i40e_disable_ipv6_tcp_udp_ports_from_inset(uint16_t port_id) RTE_ETH_RSS_IPV6_EX) static int -check_port_rss(struct gatekeeper_if *iface, unsigned int port_idx, - const struct rte_eth_dev_info *dev_info, - struct rte_eth_conf *port_conf) +check_if_rss(struct gatekeeper_if *iface, + const struct rte_eth_dev_info *dev_info, struct rte_eth_conf *port_conf) { uint8_t rss_hash_key[GATEKEEPER_RSS_MAX_KEY_LEN]; struct rte_eth_rss_conf __rss_conf = { @@ -1034,165 +1068,200 @@ check_port_rss(struct gatekeeper_if *iface, unsigned int port_idx, .rss_key_len = sizeof(rss_hash_key), }; uint64_t rss_off = dev_info->flow_type_rss_offloads; - uint16_t port_id = iface->ports[port_idx]; - int ret = rte_eth_dev_rss_hash_conf_get(port_id, &__rss_conf); - if (ret == -ENOTSUP) { - G_LOG(WARNING, "%s(%s): port %hu (%s) does not support to get RSS configuration, disable RSS\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx]); + int ret; + + RTE_BUILD_BUG_ON((GATEKEEPER_IPV4_RSS_HF | GATEKEEPER_IPV6_RSS_HF) != + RTE_ETH_RSS_IP); + + /* + * Set up device RSS. + * + * Assume all ports support RSS until shown otherwise. + * If not, RSS will be disabled and only one queue is used. + * + * Check each port for the RSS hash functions it supports, + * and configure each to use the intersection of supported + * hash functions. + */ + iface->rss = true; + port_conf->rx_adv_conf.rss_conf.rss_hf = 0; + if (ipv4_if_configured(iface)) { + port_conf->rx_adv_conf.rss_conf.rss_hf |= + GATEKEEPER_IPV4_RSS_HF; + if (iface->alternative_rss_hash) + port_conf->rx_adv_conf.rss_conf.rss_hf |= + RTE_ETH_RSS_NONFRAG_IPV4_TCP | + RTE_ETH_RSS_NONFRAG_IPV4_UDP; + } + if (ipv6_if_configured(iface)) { + port_conf->rx_adv_conf.rss_conf.rss_hf |= + GATEKEEPER_IPV6_RSS_HF; + if (iface->alternative_rss_hash) + port_conf->rx_adv_conf.rss_conf.rss_hf |= + RTE_ETH_RSS_NONFRAG_IPV6_TCP | + RTE_ETH_RSS_NONFRAG_IPV6_UDP; + } + + ret = rte_eth_dev_rss_hash_conf_get(iface->id, &__rss_conf); + if (unlikely(ret == -ENOTSUP)) { + G_LOG(WARNING, "%s(%s): interface did not return RSS configuration\n", + __func__, iface->name); goto disable_rss; } /* Do not use @__rss_conf from now on. See issue #624 for details. */ - if (ret < 0) { - G_LOG(ERR, "%s(%s): failed to get RSS hash configuration at port %hu (%s) (errno=%i): %s\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx], - -ret, rte_strerror(-ret)); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to get RSS hash configuration (errno=%i): %s\n", + __func__, iface->name, -ret, rte_strerror(-ret)); return ret; } RTE_VERIFY(ret == 0); - /* This port doesn't support RSS, so disable RSS. */ - if (rss_off == 0) { - G_LOG(WARNING, "%s(%s): port %hu (%s) does not support RSS\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx]); + /* This interface doesn't support RSS, so disable RSS. */ + if (unlikely(rss_off == 0)) { + G_LOG(WARNING, "%s(%s): interface does not support RSS\n", + __func__, iface->name); goto disable_rss; } /* Does Gatekeeper support the key length of @dev_info? */ - if (dev_info->hash_key_size < GATEKEEPER_RSS_MIN_KEY_LEN || + if (unlikely(dev_info->hash_key_size < GATEKEEPER_RSS_MIN_KEY_LEN || dev_info->hash_key_size > GATEKEEPER_RSS_MAX_KEY_LEN || - dev_info->hash_key_size % 4 != 0) { - G_LOG(WARNING, "%s(%s): port %hu (%s) requires a RSS hash key of %i bytes; Gatekeeper only supports keys of [%i, %i] bytes long that are multiple of 4\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx], - dev_info->hash_key_size, GATEKEEPER_RSS_MIN_KEY_LEN, - GATEKEEPER_RSS_MAX_KEY_LEN); + dev_info->hash_key_size % 4 != 0)) { + G_LOG(WARNING, "%s(%s): interface requires an RSS hash key of %i bytes; Gatekeeper only supports keys of [%i, %i] bytes long that are multiple of 4\n", + __func__, iface->name, dev_info->hash_key_size, + GATEKEEPER_RSS_MIN_KEY_LEN, GATEKEEPER_RSS_MAX_KEY_LEN); goto disable_rss; } + iface->rss_key_len = dev_info->hash_key_size; - /* - * Check that all RSS keys have the same length. - * - * @iface->rss_key_len > GATEKEEPER_RSS_MAX_KEY_LEN when it is - * the first time that check_port_rss() is being called. - */ - if (iface->rss_key_len <= GATEKEEPER_RSS_MAX_KEY_LEN && - iface->rss_key_len != dev_info->hash_key_size) { - G_LOG(WARNING, "%s(%s): port %hu (%s) requires a RSS hash key of %i bytes, but another port requires a key of %i bytes; all ports of the same interface must have the same key length\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx], - dev_info->hash_key_size, iface->rss_key_len); - goto disable_rss; + if (unlikely(iface->alternative_rss_hash && iface_bonded(iface))) { + G_LOG(ERR, "%s(%s): the parameter alternative_rss_hash cannot be true when the interface is bonded\n", + __func__, iface->name); + return -EINVAL; } - iface->rss_key_len = dev_info->hash_key_size; /* Check IPv4 RSS hashes. */ if (port_conf->rx_adv_conf.rss_conf.rss_hf & GATEKEEPER_IPV4_RSS_HF) { - /* No IPv4 hashes are supported, so disable RSS. */ - if ((rss_off & GATEKEEPER_IPV4_RSS_HF) == 0) { - G_LOG(WARNING, "%s(%s): port %hu (%s) does not support any IPv4 related RSS hashes\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx]); + if (unlikely((rss_off & GATEKEEPER_IPV4_RSS_HF) == 0)) { + G_LOG(WARNING, "%s(%s): interface does not support any IPv4 RSS hash\n", + __func__, iface->name); goto disable_rss; } if (iface->alternative_rss_hash) { ret = i40e_disable_ipv4_tcp_udp_ports_from_inset( - port_id); - if (ret < 0) + iface->id); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): i40e_disable_ipv4_tcp_udp_ports_from_inset() failed (errno=%i): %s\n", + __func__, iface->name, + -ret, rte_strerror(-ret)); goto disable_rss; - } else if ((rss_off & RTE_ETH_RSS_IPV4) == 0) { - /* - * The IPv4 hash that we think is typically - * used is not supported, so warn the user. - */ - G_LOG(WARNING, "%s(%s): port %hu (%s) does not support the ETH_RSS_IPV4 hash function. The device may not hash packets to the correct queues. You may try parameter alternative_rss_hash.\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx]); + } + } else if (unlikely((rss_off & RTE_ETH_RSS_IPV4) == 0)) { + G_LOG(WARNING, "%s(%s): interface does not support the ETH_RSS_IPV4 hash function. The device may not hash packets to the correct queues; you may try the parameter alternative_rss_hash\n", + __func__, iface->name); } } /* Check IPv6 RSS hashes. */ if (port_conf->rx_adv_conf.rss_conf.rss_hf & GATEKEEPER_IPV6_RSS_HF) { - /* No IPv6 hashes are supported, so disable RSS. */ - if ((rss_off & GATEKEEPER_IPV6_RSS_HF) == 0) { - G_LOG(WARNING, "%s(%s): port %hu (%s) does not support any IPv6 related RSS hashes\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx]); + if (unlikely((rss_off & GATEKEEPER_IPV6_RSS_HF) == 0)) { + G_LOG(WARNING, "%s(%s): interface does not support any IPv6 RSS hash\n", + __func__, iface->name); goto disable_rss; } if (iface->alternative_rss_hash) { ret = i40e_disable_ipv6_tcp_udp_ports_from_inset( - port_id); - if (ret < 0) + iface->id); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): i40e_disable_ipv6_tcp_udp_ports_from_inset() failed (errno=%i): %s\n", + __func__, iface->name, + -ret, rte_strerror(-ret)); goto disable_rss; - } else if ((rss_off & RTE_ETH_RSS_IPV6) == 0) { - /* - * The IPv6 hash that we think is typically - * used is not supported, so warn the user. - */ - G_LOG(WARNING, "%s(%s): port %hu (%s) does not support the ETH_RSS_IPV6 hash function. The device may not hash packets to the correct queues. You may try parameter alternative_rss_hash.\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx]); + } + } else if (unlikely((rss_off & RTE_ETH_RSS_IPV6) == 0)) { + G_LOG(WARNING, "%s(%s): interface does not support the ETH_RSS_IPV6 hash function. The device may not hash packets to the correct queues; you may try the parameter alternative_rss_hash\n", + __func__, iface->name); } } /* - * Any missing hashes that will cause RSS to definitely fail + * Any missing hash that will cause RSS to definitely fail * or are likely to cause RSS to fail are handled above. * Here, also log if the device doesn't support any of the requested * hashes, including the hashes considered non-essential. */ if ((rss_off & port_conf->rx_adv_conf.rss_conf.rss_hf) != port_conf->rx_adv_conf.rss_conf.rss_hf) { - G_LOG(WARNING, "%s(%s): port %hu (%s) only supports RSS hash functions 0x%"PRIx64", but Gatekeeper asks for 0x%"PRIx64"\n", - __func__, iface->name, port_id, - iface->pci_addrs[port_idx], rss_off, + G_LOG(WARNING, "%s(%s): interface only supports RSS hash functions 0x%"PRIx64", but Gatekeeper asks for 0x%"PRIx64"\n", + __func__, iface->name, rss_off, port_conf->rx_adv_conf.rss_conf.rss_hf); } - port_conf->rx_adv_conf.rss_conf.rss_hf &= rss_off; + + ret = randomize_rss_key(iface); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to initialize RSS key (errno=%i): %s\n", + __func__, iface->name, -ret, strerror(-ret)); + return ret; + } + + /* Convert RSS key. */ + RTE_VERIFY(iface->rss_key_len % 4 == 0); + rte_convert_rss_key((uint32_t *)iface->rss_key, + (uint32_t *)iface->rss_key_be, iface->rss_key_len); + + port_conf->rxmode.mq_mode = RTE_ETH_MQ_RX_RSS; + port_conf->rx_adv_conf.rss_conf.rss_key = iface->rss_key; + port_conf->rx_adv_conf.rss_conf.rss_key_len = iface->rss_key_len; return 0; disable_rss: iface->rss = false; port_conf->rx_adv_conf.rss_conf.rss_hf = 0; + iface->num_rx_queues = 1; + G_LOG(WARNING, "%s(%s): the interface does not have RSS capabilities; the GK or GT block will receive all packets and send them to the other blocks as needed. Gatekeeper or Grantor should only be run with one lcore dedicated to GK or GT in this mode; restart with only one GK or GT lcore if necessary\n", + __func__, iface->name); return 0; } static int -check_port_mtu(struct gatekeeper_if *iface, unsigned int port_idx, +check_if_mtu(struct gatekeeper_if *iface, const struct rte_eth_dev_info *dev_info, struct rte_eth_conf *port_conf) { - if (dev_info->min_mtu > port_conf->rxmode.mtu) { - G_LOG(ERR, "%s(%s): the minimum MTU %"PRIu32" of port %hu (%s) is larger than the configured MTU %"PRIu32"\n", + /* + * Set up device MTU. + * + * If greater than the size of the mbufs, then add the + * multi-segment buffer flag. + */ + port_conf->rxmode.mtu = iface->mtu; + if (iface->mtu > RTE_MBUF_DEFAULT_BUF_SIZE) + port_conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS; + + if (unlikely(dev_info->min_mtu > port_conf->rxmode.mtu)) { + G_LOG(ERR, "%s(%s): the minimum MTU %u is larger than the configured MTU %"PRIu32"\n", __func__, iface->name, - dev_info->min_mtu, - iface->ports[port_idx], iface->pci_addrs[port_idx], - port_conf->rxmode.mtu); - return -1; + dev_info->min_mtu, port_conf->rxmode.mtu); + return -EINVAL; } - if (dev_info->max_mtu < port_conf->rxmode.mtu) { - G_LOG(ERR, "%s(%s): the maximum MTU %"PRIu32" of port %hu (%s) is smaller than the configured MTU %"PRIu32"\n", + if (unlikely(dev_info->max_mtu < port_conf->rxmode.mtu)) { + G_LOG(ERR, "%s(%s): the maximum MTU %u is smaller than the configured MTU %"PRIu32"\n", __func__, iface->name, - dev_info->max_mtu, - iface->ports[port_idx], iface->pci_addrs[port_idx], - port_conf->rxmode.mtu); - return -1; + dev_info->max_mtu, port_conf->rxmode.mtu); + return -EINVAL; } - if ((port_conf->txmode.offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS) && + if (unlikely((port_conf->txmode.offloads & + RTE_ETH_TX_OFFLOAD_MULTI_SEGS) && !(dev_info->tx_offload_capa & - RTE_ETH_TX_OFFLOAD_MULTI_SEGS)) { - G_LOG(NOTICE, "%s(%s): port %hu (%s) doesn't support offloading multi-segment TX buffers\n", - __func__, iface->name, - iface->ports[port_idx], iface->pci_addrs[port_idx]); + RTE_ETH_TX_OFFLOAD_MULTI_SEGS))) { + G_LOG(NOTICE, "%s(%s): interface does not support offloading multi-segment TX buffers\n", + __func__, iface->name); port_conf->txmode.offloads &= ~RTE_ETH_TX_OFFLOAD_MULTI_SEGS; } @@ -1200,130 +1269,9 @@ check_port_mtu(struct gatekeeper_if *iface, unsigned int port_idx, } static int -check_port_cksum(struct gatekeeper_if *iface, unsigned int port_idx, +check_if_checksums(struct gatekeeper_if *iface, const struct rte_eth_dev_info *dev_info, struct rte_eth_conf *port_conf) { - if ((port_conf->txmode.offloads & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) && - !(dev_info->tx_offload_capa & - RTE_ETH_TX_OFFLOAD_IPV4_CKSUM)) { - G_LOG(NOTICE, "%s(%s): port %hu (%s) doesn't support offloading IPv4 checksumming; will use software IPv4 checksums\n", - __func__, iface->name, - iface->ports[port_idx], iface->pci_addrs[port_idx]); - port_conf->txmode.offloads &= ~RTE_ETH_TX_OFFLOAD_IPV4_CKSUM; - iface->ipv4_hw_cksum = false; - } - - if ((port_conf->txmode.offloads & RTE_ETH_TX_OFFLOAD_UDP_CKSUM) && - !(dev_info->tx_offload_capa & - RTE_ETH_TX_OFFLOAD_UDP_CKSUM)) { - G_LOG(NOTICE, "%s(%s): port %hu (%s) doesn't support offloading UDP checksumming; will use software UDP checksums\n", - __func__, iface->name, - iface->ports[port_idx], iface->pci_addrs[port_idx]); - port_conf->txmode.offloads &= ~RTE_ETH_TX_OFFLOAD_UDP_CKSUM; - iface->ipv4_hw_udp_cksum = false; - iface->ipv6_hw_udp_cksum = false; - } - - return 0; -} - -static int -randomize_rss_key(struct gatekeeper_if *iface) -{ - uint16_t final_set_count; - unsigned int flags = iface->guarantee_random_entropy ? GRND_RANDOM : 0; - - /* - * To validate if the key generated is reasonable, the - * number of bits set to 1 in the key must be greater than - * 10% and less than 90% of the total bits in the key. - * min_num_set_bits and max_num_set_bits represent the lower - * and upper bound for the key. - */ - const uint16_t min_num_set_bits = iface->rss_key_len * 8 * 0.1; - const uint16_t max_num_set_bits = iface->rss_key_len * 8 * 0.9; - - do { - int number_of_bytes = 0; - uint8_t i; - - /* - * When the last parameter of the system call getrandom() - * (i.e flags) is zero, getrandom() uses the /dev/urandom pool. - */ - do { - int ret = getrandom(iface->rss_key + number_of_bytes, - iface->rss_key_len - number_of_bytes, flags); - if (ret < 0) - return ret; - number_of_bytes += ret; - } while (number_of_bytes < iface->rss_key_len); - - final_set_count = 0; - for (i = 0; i < iface->rss_key_len; i++) { - final_set_count += - __builtin_popcount(iface->rss_key[i]); - } - } while (final_set_count < min_num_set_bits || - final_set_count > max_num_set_bits); - return 0; -} - -static int -check_port_offloads(struct gatekeeper_if *iface, - struct rte_eth_conf *port_conf) -{ - unsigned int i; - int ret; - - RTE_BUILD_BUG_ON((GATEKEEPER_IPV4_RSS_HF | GATEKEEPER_IPV6_RSS_HF) != - RTE_ETH_RSS_IP); - - /* - * Set up device RSS. - * - * Assume all ports support RSS until shown otherwise. - * If not, RSS will be disabled and only one queue is used. - * - * Check each port for the RSS hash functions it supports, - * and configure each to use the intersection of supported - * hash functions. - */ - iface->rss = true; - /* - * The +1 makes @iface->rss_key_len invalid and helps check_port_rss() - * to identify the first RSS key length. - */ - iface->rss_key_len = GATEKEEPER_RSS_MAX_KEY_LEN + 1; - port_conf->rx_adv_conf.rss_conf.rss_hf = 0; - if (ipv4_if_configured(iface)) { - port_conf->rx_adv_conf.rss_conf.rss_hf |= - GATEKEEPER_IPV4_RSS_HF; - if (iface->alternative_rss_hash) - port_conf->rx_adv_conf.rss_conf.rss_hf |= - RTE_ETH_RSS_NONFRAG_IPV4_TCP | - RTE_ETH_RSS_NONFRAG_IPV4_UDP; - } - if (ipv6_if_configured(iface)) { - port_conf->rx_adv_conf.rss_conf.rss_hf |= - GATEKEEPER_IPV6_RSS_HF; - if (iface->alternative_rss_hash) - port_conf->rx_adv_conf.rss_conf.rss_hf |= - RTE_ETH_RSS_NONFRAG_IPV6_TCP | - RTE_ETH_RSS_NONFRAG_IPV6_UDP; - } - - /* - * Set up device MTU. - * - * If greater than the size of the mbufs, then add the - * multi-segment buffer flag. This is optional and - * if any ports don't support it, it will be removed. - */ - port_conf->rxmode.mtu = iface->mtu; - if (iface->mtu > RTE_MBUF_DEFAULT_BUF_SIZE) - port_conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_MULTI_SEGS; - /* * Set up checksumming. * @@ -1334,7 +1282,7 @@ check_port_offloads(struct gatekeeper_if *iface, * * In both cases, we set up the devices to assume that * IPv4 and UDP checksumming are supported unless querying - * the device in check_port_cksum() shows otherwise. + * the device shows otherwise. * * Note that the IPv4 checksum field is only computed over * the IPv4 header and the UDP checksum is computed over an IPv4 @@ -1350,58 +1298,52 @@ check_port_offloads(struct gatekeeper_if *iface, (iface->ipv4_hw_udp_cksum || iface->ipv6_hw_udp_cksum)) port_conf->txmode.offloads |= RTE_ETH_TX_OFFLOAD_UDP_CKSUM; - for (i = 0; i < iface->num_ports; i++) { - struct rte_eth_dev_info dev_info; - uint16_t port_id = iface->ports[i]; - - ret = rte_eth_dev_info_get(port_id, &dev_info); - if (ret < 0) { - G_LOG(ERR, "%s(%s): cannot obtain information on port %hu (%s) (errno=%i): %s\n", - __func__, iface->name, - port_id, iface->pci_addrs[i], - -ret, rte_strerror(-ret)); - return ret; - } + if (unlikely((port_conf->txmode.offloads & + RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) && + !(dev_info->tx_offload_capa & + RTE_ETH_TX_OFFLOAD_IPV4_CKSUM))) { + G_LOG(NOTICE, "%s(%s): interface does not support offloading IPv4 checksumming; using software IPv4 checksums\n", + __func__, iface->name); + port_conf->txmode.offloads &= ~RTE_ETH_TX_OFFLOAD_IPV4_CKSUM; + iface->ipv4_hw_cksum = false; + } - /* Check for RSS capabilities and offloads. */ - ret = check_port_rss(iface, i, &dev_info, port_conf); - if (ret < 0) - return ret; + if (unlikely((port_conf->txmode.offloads & + RTE_ETH_TX_OFFLOAD_UDP_CKSUM) && + !(dev_info->tx_offload_capa & + RTE_ETH_TX_OFFLOAD_UDP_CKSUM))) { + G_LOG(NOTICE, "%s(%s): interface does not support offloading UDP checksumming; using software UDP checksums\n", + __func__, iface->name); + port_conf->txmode.offloads &= ~RTE_ETH_TX_OFFLOAD_UDP_CKSUM; + iface->ipv4_hw_udp_cksum = false; + iface->ipv6_hw_udp_cksum = false; + } - /* Check for MTU capability and offloads. */ - ret = check_port_mtu(iface, i, &dev_info, port_conf); - if (ret < 0) - return ret; + return 0; +} - /* Check for checksum capability and offloads. */ - ret = check_port_cksum(iface, i, &dev_info, port_conf); - if (ret < 0) - return ret; +static int +check_if_offloads(struct gatekeeper_if *iface, struct rte_eth_conf *port_conf) +{ + struct rte_eth_dev_info dev_info; + int ret = rte_eth_dev_info_get(iface->id, &dev_info); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot obtain interface information (errno=%i): %s\n", + __func__, iface->name, -ret, rte_strerror(-ret)); + return ret; } - if (iface->rss) { - ret = randomize_rss_key(iface); - if (ret < 0) { - G_LOG(ERR, "%s(%s): failed to initialize RSS key (errno=%i): %s\n", - __func__, iface->name, -ret, strerror(-ret)); - return ret; - } + ret = check_if_rss(iface, &dev_info, port_conf); + if (unlikely(ret < 0)) + return ret; - /* Convert RSS key. */ - RTE_VERIFY(iface->rss_key_len % 4 == 0); - rte_convert_rss_key((uint32_t *)iface->rss_key, - (uint32_t *)iface->rss_key_be, iface->rss_key_len); + ret = check_if_mtu(iface, &dev_info, port_conf); + if (unlikely(ret < 0)) + return ret; - port_conf->rxmode.mq_mode = RTE_ETH_MQ_RX_RSS; - port_conf->rx_adv_conf.rss_conf.rss_key = iface->rss_key; - port_conf->rx_adv_conf.rss_conf.rss_key_len = - iface->rss_key_len; - } else { - /* Configured hash functions are not supported. */ - G_LOG(WARNING, "%s(%s): the interface does not have RSS capabilities; the GK or GT block will receive all packets and send them to the other blocks as needed. Gatekeeper or Grantor should only be run with one lcore dedicated to GK or GT in this mode; restart with only one GK or GT lcore if necessary\n", - __func__, iface->name); - iface->num_rx_queues = 1; - } + ret = check_if_checksums(iface, &dev_info, port_conf); + if (unlikely(ret < 0)) + return ret; return 0; } @@ -1557,24 +1499,64 @@ gatekeeper_setup_user(struct net_config *net_conf, const char *user) return 0; } -/* - * @port_idx represents index of port in iface->ports when it is >= 0. - * When it is -1, @port_id is a bonded port and has no entry in iface->ports. - */ static int -init_port(struct gatekeeper_if *iface, uint16_t port_id, int port_idx, - const struct rte_eth_conf *port_conf) +create_bond(struct gatekeeper_if *iface) { - int ret = rte_eth_dev_configure(port_id, iface->num_rx_queues, - iface->num_tx_queues, port_conf); - if (ret < 0) { - G_LOG(ERR, "net: failed to configure port %hu (%s) on the %s interface (err=%d)\n", - port_id, - port_idx >= 0 ? iface->pci_addrs[port_idx] : "bonded", - iface->name, ret); + char dev_name[IF_NAMESIZE]; + unsigned int i; + int ret2, ret = bonded_if_name(dev_name, iface); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot name bonded port (errno=%i): %s\n", + __func__, iface->name, -ret, strerror(-ret)); return ret; } + + ret = rte_eth_bond_create(dev_name, iface->bonding_mode, + rte_eth_dev_socket_id(iface->ports[0])); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to create bonded port (errno=%i): %s\n", + __func__, iface->name, -ret, rte_strerror(-ret)); + return ret; + } + iface->id = ret; + + if (__lacp_enabled(iface)) { + /* + * If LACP is enabled, enable multicast addresses. + * Otherwise, rx_burst_8023ad() of DPDK's bonding driver + * (see rte_eth_bond_pmd.c) is going to discard + * multicast Ethernet packets such as ARP and + * ND packets. + */ + ret = rte_eth_allmulticast_enable(iface->id); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot enable multicast on bond device (errno=%i): %s\n", + __func__, iface->name, + -ret, rte_strerror(-ret)); + goto close_bond; + } + } + + /* Add members to bond. */ + for (i = 0; i < iface->num_ports; i++) { + ret = rte_eth_bond_member_add(iface->id, iface->ports[i]); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to add member %u (errno=%i): %s\n", + __func__, iface->name, iface->ports[i], + -ret, rte_strerror(-ret)); + goto close_bond; + } + } + return 0; + +close_bond: + ret2 = rte_eth_bond_free(dev_name); + if (unlikely(ret2 < 0)) { + G_LOG(WARNING, "%s(%s): rte_eth_bond_free() failed (errno=%i): %s\n", + __func__, iface->name, -ret2, rte_strerror(-ret2)); + } + return ret; } static int @@ -1586,9 +1568,8 @@ init_iface(struct gatekeeper_if *iface) }, /* Other offloads configured below. */ }; + unsigned int i; int ret; - uint8_t i; - uint8_t num_succ_ports; iface->alive = true; @@ -1602,7 +1583,7 @@ init_iface(struct gatekeeper_if *iface) iface->ports = rte_calloc("ports", iface->num_ports, sizeof(*iface->ports), 0); - if (iface->ports == NULL) { + if (unlikely(iface->ports == NULL)) { G_LOG(ERR, "%s(%s): out of memory for ports\n", __func__, iface->name); destroy_iface(iface, IFACE_DESTROY_LUA); @@ -1613,7 +1594,7 @@ init_iface(struct gatekeeper_if *iface) for (i = 0; i < iface->num_ports; i++) { ret = rte_eth_dev_get_port_by_name(iface->pci_addrs[i], &iface->ports[i]); - if (ret < 0) { + if (unlikely(ret < 0)) { G_LOG(ERR, "%s(%s): failed to map PCI %s to a port (errno=%i): %s\n", __func__, iface->name, iface->pci_addrs[i], -ret, rte_strerror(-ret)); @@ -1621,79 +1602,30 @@ init_iface(struct gatekeeper_if *iface) } } - /* Make sure the ports support hardware offloads. */ - ret = check_port_offloads(iface, &port_conf); - if (ret < 0) { - G_LOG(ERR, "%s(%s): interface doesn't support a critical hardware capability\n", - __func__, iface->name); - goto free_ports; - } - - num_succ_ports = 0; - for (i = 0; i < iface->num_ports; i++) { - ret = init_port(iface, iface->ports[i], i, &port_conf); - if (ret < 0) - goto close_partial; - num_succ_ports++; - } - /* Initialize bonded port, if needed. */ - if (!iface_bonded(iface)) + if (!iface_bonded(iface)) { + RTE_VERIFY(iface->num_ports == 1); iface->id = iface->ports[0]; - else { - uint8_t num_slaves_added; - char dev_name[64]; - ret = snprintf(dev_name, sizeof(dev_name), "net_bonding%s", - iface->name); - RTE_VERIFY(ret > 0 && ret < (int)sizeof(dev_name)); - ret = rte_eth_bond_create(dev_name, iface->bonding_mode, 0); - if (ret < 0) { - G_LOG(ERR, "%s(%s): failed to create bonded port (errno=%i): %s\n", - __func__, iface->name, - -ret, rte_strerror(-ret)); - goto close_partial; - } - - iface->id = (uint8_t)ret; - - /* - * If LACP is enabled, enable multicast addresses. - * Otherwise, rx_burst_8023ad() of DPDK's bonding driver - * (see rte_eth_bond_pmd.c) is going to discard - * multicast Ethernet packets such as ARP and ND packets. - */ - if (__lacp_enabled(iface)) { - ret = rte_eth_allmulticast_enable(iface->id); - if (ret < 0) { - G_LOG(ERR, "%s(%s): cannot enable multicast on bond device (errno=%i): %s\n", - __func__, iface->name, - -ret, rte_strerror(-ret)); - goto close_ports; - } - } + } else { + ret = create_bond(iface); + if (unlikely(ret < 0)) + goto free_ports; + } - /* - * Bonded port inherits RSS and offload settings - * from the slave ports added to it. - */ - num_slaves_added = 0; - for (i = 0; i < iface->num_ports; i++) { - ret = rte_eth_bond_member_add(iface->id, - iface->ports[i]); - if (ret < 0) { - G_LOG(ERR, "%s(%s): failed to add slave port %hhu to bonded port %hhu (errno=%i): %s\n", - __func__, iface->name, - iface->ports[i], iface->id, - -ret, rte_strerror(-ret)); - rm_slave_ports(iface, num_slaves_added); - goto close_ports; - } - num_slaves_added++; - } + /* Make sure the interface supports hardware offloads. */ + ret = check_if_offloads(iface, &port_conf); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): interface doesn't support a critical hardware capability (errno=%i): %s\n", + __func__, iface->name, -ret, strerror(-ret)); + goto close_ports; + } - ret = init_port(iface, iface->id, -1, &port_conf); - if (ret < 0) - goto close_ports; + ret = rte_eth_dev_configure(iface->id, iface->num_rx_queues, + iface->num_tx_queues, &port_conf); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to configure interface (errno=%i): %s\n", + __func__, iface->name, -ret, rte_strerror(-ret)); + goto close_ports; } return 0; @@ -1701,8 +1633,6 @@ init_iface(struct gatekeeper_if *iface) close_ports: destroy_iface(iface, IFACE_DESTROY_PORTS); return ret; -close_partial: - close_iface_ports(iface, num_succ_ports); free_ports: rte_free(iface->ports); iface->ports = NULL; @@ -1845,7 +1775,7 @@ setup_ipv6_addrs(struct gatekeeper_if *iface) } static int -check_port_rss_key_update(struct gatekeeper_if *iface, uint16_t port_id) +check_if_rss_key_update(const struct gatekeeper_if *iface) { struct rte_eth_dev_info dev_info; uint8_t rss_hash_key[GATEKEEPER_RSS_MAX_KEY_LEN]; @@ -1858,34 +1788,32 @@ check_port_rss_key_update(struct gatekeeper_if *iface, uint16_t port_id) if (!iface->rss) return 0; - ret = rte_eth_dev_info_get(port_id, &dev_info); - if (ret < 0) { - G_LOG(ERR, "%s(%s): cannot obtain information on port %hu (errno=%i): %s\n", - __func__, iface->name, port_id, - -ret, rte_strerror(-ret)); + ret = rte_eth_dev_info_get(iface->id, &dev_info); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot obtain interface information (errno=%i): %s\n", + __func__, iface->name, -ret, rte_strerror(-ret)); return ret; } - ret = rte_eth_dev_rss_hash_conf_get(port_id, &rss_conf); + ret = rte_eth_dev_rss_hash_conf_get(iface->id, &rss_conf); switch (ret) { case 0: break; case -ENODEV: - G_LOG(WARNING, "%s(%s): failed to get RSS hash configuration at port %d: port identifier is invalid\n", - __func__, iface->name, port_id); + G_LOG(WARNING, "%s(%s): failed to get RSS hash configuration: interface identifier is invalid\n", + __func__, iface->name); return ret; case -EIO: - G_LOG(WARNING, "%s(%s): failed to get RSS hash configuration at port %d: device is removed\n", - __func__, iface->name, port_id); + G_LOG(WARNING, "%s(%s): failed to get RSS hash configuration: device is removed\n", + __func__, iface->name); return ret; case -ENOTSUP: - G_LOG(WARNING, "%s(%s): failed to get RSS hash configuration at port %d: hardware does not support RSS\n", - __func__, iface->name, port_id); + G_LOG(WARNING, "%s(%s): failed to get RSS hash configuration: hardware does not support RSS\n", + __func__, iface->name); return ret; default: - G_LOG(WARNING, "%s(%s): failed to get RSS hash configuration at port %d (errno=%i): %s\n", - __func__, iface->name, port_id, - -ret, rte_strerror(-ret)); + G_LOG(WARNING, "%s(%s): failed to get RSS hash configuration (errno=%i): %s\n", + __func__, iface->name, -ret, rte_strerror(-ret)); return ret; } @@ -1896,8 +1824,8 @@ check_port_rss_key_update(struct gatekeeper_if *iface, uint16_t port_id) if (unlikely(dev_info.hash_key_size != iface->rss_key_len || memcmp(rss_conf.rss_key, iface->rss_key, iface->rss_key_len) != 0)) { - G_LOG(WARNING, "%s(%s): the RSS hash configuration obtained at port %d does not match the expected RSS configuration\n", - __func__, iface->name, port_id); + G_LOG(WARNING, "%s(%s): the obtained RSS hash configuration does not match the expected RSS configuration\n", + __func__, iface->name); return -EINVAL; } @@ -1907,55 +1835,45 @@ check_port_rss_key_update(struct gatekeeper_if *iface, uint16_t port_id) static int start_iface(struct gatekeeper_if *iface, unsigned int num_attempts_link_get) { - int ret; - uint8_t i; - uint8_t num_succ_ports; - - num_succ_ports = 0; - for (i = 0; i < iface->num_ports; i++) { - ret = start_port(iface->ports[i], - &num_succ_ports, num_attempts_link_get); - if (ret < 0) - goto stop_partial; - - /* - * If we try to update/get the RSS hash configuration before - * the start of the NICs, no meaningful operations will be - * done even the return values indicate no errors. - * - * After checking the source code of DPDK library, - * it turns out that RSS is disabled in the MRQC register - * before we start the NICs. - * - * Only after the NICs start, we can check whether the RSS hash - * is configured correctly or not. - */ - if (check_port_rss_key_update(iface, iface->ports[i]) != 0) { - G_LOG(ERR, "%s(%s): port %hu (%s) does not have the correct RSS hash key\n", - __func__, iface->name, - iface->ports[i], iface->pci_addrs[i]); - ret = -1; - goto stop_partial; - } + int ret = start_port(iface->id, NULL, num_attempts_link_get); + if (unlikely(ret < 0)) { + destroy_iface(iface, IFACE_DESTROY_INIT); + return ret; } - /* Bonding port(s). */ - if (iface_bonded(iface)) { - ret = start_port(iface->id, NULL, num_attempts_link_get); - if (ret < 0) - goto stop_partial; + /* + * If we try to update/get the RSS hash configuration before + * the start of the NICs, no meaningful operations will be + * done; even the return values indicate no errors. + * + * After checking the source code of DPDK library, + * it turns out that RSS is disabled in the MRQC register + * before we start the NICs. + * + * Only after the NICs start, we can check whether the RSS hash + * is configured correctly or not. + */ + ret = check_if_rss_key_update(iface); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): port does not have the correct RSS hash key (errno=%i): %s\n", + __func__, iface->name, -ret, strerror(-ret)); + goto stop; } - rte_eth_macaddr_get(iface->id, &iface->eth_addr); + ret = rte_eth_macaddr_get(iface->id, &iface->eth_addr); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot get MAC address (errno=%i): %s\n", + __func__, iface->name, -ret, rte_strerror(-ret)); + goto stop; + } if (ipv6_if_configured(iface)) setup_ipv6_addrs(iface); return 0; -stop_partial: - stop_iface_ports(iface, num_succ_ports); - destroy_iface(iface, IFACE_DESTROY_INIT); +stop: + destroy_iface(iface, IFACE_DESTROY_STOP); return ret; } @@ -2043,15 +1961,13 @@ static int start_network_stage2(void *arg) { struct net_config *net = arg; - int ret; - - ret = start_iface(&net->front, net->num_attempts_link_get); - if (ret < 0) + int ret = start_iface(&net->front, net->num_attempts_link_get); + if (unlikely(ret < 0)) goto fail; if (net->back_iface_enabled) { ret = start_iface(&net->back, net->num_attempts_link_get); - if (ret < 0) + if (unlikely(ret < 0)) goto destroy_front; } @@ -2060,7 +1976,7 @@ start_network_stage2(void *arg) destroy_front: destroy_iface(&net->front, IFACE_DESTROY_STOP); fail: - G_LOG(ERR, "net: failed to start Gatekeeper network\n"); + G_LOG(ERR, "%s(): failed to start Gatekeeper network\n", __func__); return ret; } From 591626ddad39d66f85a98c6f801107f46277802d Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 30 Apr 2024 11:34:16 -0400 Subject: [PATCH 2/9] lib/net: fix LACP bonds with multiple ports This commit implements the workaround discussed in issue #686. --- lib/net.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lib/net.c b/lib/net.c index 7d92f6e3..ecf1477e 100644 --- a/lib/net.c +++ b/lib/net.c @@ -1548,6 +1548,31 @@ create_bond(struct gatekeeper_if *iface) } } + if (__lacp_enabled(iface) && iface->num_ports > 1) { + /* + * XXX #686 Ensure that all members can receive packets + * destined to the MAC address of the bond. + * + * This must come after adding members. Otherwise, + * rte_eth_dev_mac_addr_add() unfortunately does nothing. + */ + struct rte_ether_addr if_macaddr; + ret = rte_eth_macaddr_get(iface->id, &if_macaddr); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot get MAC address (errno=%i): %s\n", + __func__, iface->name, + -ret, rte_strerror(-ret)); + goto close_bond; + } + ret = rte_eth_dev_mac_addr_add(iface->id, &if_macaddr, 0); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot add interface MAC address (errno=%i): %s\n", + __func__, iface->name, + -ret, rte_strerror(-ret)); + goto close_bond; + } + } + return 0; close_bond: From 1d68eacde1bc9528a45947aa1d456a6ee77c1b8b Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 30 Apr 2024 11:36:15 -0400 Subject: [PATCH 3/9] lib/net: improve transmission capacity of bonds By default, bonded interfaces load balance the outgoing packets by hashing only the MAC addresses. This commit changes this hash to include the source and destination addresses of IPv4 and IPv6 headers as well. --- lib/net.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/net.c b/lib/net.c index ecf1477e..833dec7c 100644 --- a/lib/net.c +++ b/lib/net.c @@ -1520,6 +1520,27 @@ create_bond(struct gatekeeper_if *iface) } iface->id = ret; + if (iface->num_ports > 1) { + /* + * The default balancing policy is BALANCE_XMIT_POLICY_LAYER2; + * see bond_alloc() in file + * dependencies/dpdk/drivers/net/bonding/rte_eth_bond_pmd.c + * This mode does not fit Gatekeeper since the next hops for + * Gatekeeper typically are only a couple routers. + * + * Use BALANCE_XMIT_POLICY_LAYER23 instead of + * BALANCE_XMIT_POLICY_LAYER34 to lower the cost per packet. + */ + ret = rte_eth_bond_xmit_policy_set(iface->id, + BALANCE_XMIT_POLICY_LAYER23); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to set transmission policy (errno=%i): %s\n", + __func__, iface->name, + -ret, rte_strerror(-ret)); + goto close_bond; + } + } + if (__lacp_enabled(iface)) { /* * If LACP is enabled, enable multicast addresses. From 50194c53408185f6cc7c884a71d8e656aee3de33 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 30 Apr 2024 13:58:01 -0400 Subject: [PATCH 4/9] lib/net: report the MAC addresses of all interfaces Knowing the MAC addresses of all interfaces is useful to diagnose issues in production; especially issues related to bonded interfaces. Not only are the primary MAC addresses of the interfaces reported, but also the secondary MAC addresses of the interfaces. --- include/gatekeeper_net.h | 2 +- lib/net.c | 119 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 2 deletions(-) diff --git a/include/gatekeeper_net.h b/include/gatekeeper_net.h index e7b036ac..a1f84b8a 100644 --- a/include/gatekeeper_net.h +++ b/include/gatekeeper_net.h @@ -481,7 +481,7 @@ struct net_config { /* Call lacp_enabled() instead this function wherever possible. */ static inline int -__lacp_enabled(struct gatekeeper_if *iface) +__lacp_enabled(const struct gatekeeper_if *iface) { return iface->bonding_mode == BONDING_MODE_8023AD; } diff --git a/lib/net.c b/lib/net.c index 833dec7c..dc349054 100644 --- a/lib/net.c +++ b/lib/net.c @@ -489,7 +489,7 @@ configure_queue(const struct gatekeeper_if *iface, uint16_t queue_id, } static inline int -iface_bonded(struct gatekeeper_if *iface) +iface_bonded(const struct gatekeeper_if *iface) { return iface->num_ports > 1 || iface->bonding_mode == BONDING_MODE_8023AD; @@ -1605,6 +1605,43 @@ create_bond(struct gatekeeper_if *iface) return ret; } +#define MAX_LOG_IF_NAME (IF_NAMESIZE + 16) +static void +log_if_name(char *if_name, size_t len, const struct gatekeeper_if *iface, + uint16_t port_id) +{ + if (unlikely(len == 0)) { + G_LOG(CRIT, "%s(%s/%u): bug: len == 0\n", + __func__, iface->name, port_id); + return; + } + + if (iface->id != port_id) { + int ret = snprintf(if_name, len, "%s/%u", iface->name, port_id); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s/%u): snprintf() failed (errno=%i): %s\n", + __func__, iface->name, port_id, + -ret, strerror(-ret)); + /* Fall back to interface name. */ + } else if (unlikely((typeof(len))ret >= len)) { + G_LOG(CRIT, "%s(%s/%u): bug: len = %lu <= %i\n", + __func__, iface->name, port_id, len, ret); + /* Fall back to interface name. */ + } else { + /* Success. */ + return; + } + } + + strncpy(if_name, iface->name, len); + if (unlikely(if_name[len - 1] != '\0')) { + G_LOG(CRIT, "%s(%s/%u): bug: len = %lu < strlen(iface->name) = %lu\n", + __func__, iface->name, port_id, + len, strlen(iface->name)); + if_name[len - 1] = '\0'; + } +} + static int init_iface(struct gatekeeper_if *iface) { @@ -1878,6 +1915,85 @@ check_if_rss_key_update(const struct gatekeeper_if *iface) return 0; } +static uint32_t +count_macs(const char *if_name, const struct rte_ether_addr *macaddrs, + uint32_t max_mac_addrs) +{ + bool ended = false; + uint32_t i, count = 0; + + for (i = 0; i < max_mac_addrs; i++) { + if (rte_is_zero_ether_addr(&macaddrs[i])) { + ended = true; + continue; + } + if (unlikely(ended)) { + G_LOG(ERR, "%s(%s): MAC " RTE_ETHER_ADDR_PRT_FMT + " at index %" PRIu32 + " comes after the last index; count = %" + PRIu32 "\n", + __func__, if_name, + RTE_ETHER_ADDR_BYTES(&macaddrs[i]), i, count); + break; + } + count++; + } + + return count; +} + +static void +report_macs(const char *if_name, uint16_t port_id, uint32_t max_mac_addrs) +{ + struct rte_ether_addr macaddrs[max_mac_addrs]; + uint32_t i, count; + int ret = rte_eth_macaddrs_get(port_id, macaddrs, max_mac_addrs); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot get MAC addresses (errno=%i): %s\n", + __func__, if_name, -ret, rte_strerror(-ret)); + return; + } + + count = count_macs(if_name, macaddrs, max_mac_addrs); + + for (i = 0; i < count; i++) { + G_LOG(INFO, "%s(%s): MAC [%u/%u] " RTE_ETHER_ADDR_PRT_FMT "\n", + __func__, if_name, i + 1, count, + RTE_ETHER_ADDR_BYTES(&macaddrs[i])); + } +} + +static void +report_port_macs(const struct gatekeeper_if *iface, uint16_t port_id) +{ + char if_name[MAX_LOG_IF_NAME]; + struct rte_eth_dev_info dev_info; + int ret; + + log_if_name(if_name, sizeof(if_name), iface, port_id); + + ret = rte_eth_dev_info_get(port_id, &dev_info); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot obtain interface information (errno=%i): %s\n", + __func__, if_name, -ret, rte_strerror(-ret)); + return; + } + + report_macs(if_name, port_id, dev_info.max_mac_addrs); +} + +static void +report_if_macs(const struct gatekeeper_if *iface) +{ + unsigned int i; + + if (iface_bonded(iface)) + report_port_macs(iface, iface->id); + + for (i = 0; i < iface->num_ports; i++) + report_port_macs(iface, iface->ports[i]); +} + static int start_iface(struct gatekeeper_if *iface, unsigned int num_attempts_link_get) { @@ -1916,6 +2032,7 @@ start_iface(struct gatekeeper_if *iface, unsigned int num_attempts_link_get) if (ipv6_if_configured(iface)) setup_ipv6_addrs(iface); + report_if_macs(iface); return 0; stop: From 71c2c8708364c48e85e7cd6e7d56c204f05d78ab Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 30 Apr 2024 14:40:11 -0400 Subject: [PATCH 5/9] lib/net: do not wait for links to come up Waiting for links to come up is no longer helpful in the current version of DPDK. --- include/gatekeeper_net.h | 6 ---- lib/net.c | 68 ++++-------------------------------- lua/gatekeeper/staticlib.lua | 1 - lua/net.lua | 2 -- 4 files changed, 6 insertions(+), 71 deletions(-) diff --git a/include/gatekeeper_net.h b/include/gatekeeper_net.h index a1f84b8a..b48ab4ba 100644 --- a/include/gatekeeper_net.h +++ b/include/gatekeeper_net.h @@ -430,12 +430,6 @@ struct net_config { */ int back_iface_enabled; - /* - * Number of attempts to wait for Gatekeeper links to - * come up during initialization. - */ - unsigned int num_attempts_link_get; - /* * The NUMA nodes used in the host. Element i is true * if NUMA node i is being used; otherwise it is false. diff --git a/lib/net.c b/lib/net.c index dc349054..2974e5ba 100644 --- a/lib/net.c +++ b/lib/net.c @@ -1723,64 +1723,6 @@ init_iface(struct gatekeeper_if *iface) return ret; } -static int -start_port(uint8_t port_id, uint8_t *pnum_succ_ports, - unsigned int num_attempts_link_get) -{ - struct rte_eth_link link; - uint8_t attempts = 0; - - /* Start device. */ - int ret = rte_eth_dev_start(port_id); - if (ret < 0) { - G_LOG(ERR, "net: failed to start port %hhu (err=%d)\n", - port_id, ret); - return ret; - } - if (pnum_succ_ports != NULL) - (*pnum_succ_ports)++; - - /* - * The following code ensures that the device is ready for - * full speed RX/TX. - * - * When the initialization is done without this, - * the initial packet transmission may be blocked. - * - * Optionally, we can wait for the link to come up before - * continuing. This is useful for bonded ports where the - * slaves must be activated after starting the bonded - * device in order for the link to come up. The slaves - * are activated on a timer, so this can take some time. - */ - do { - ret = rte_eth_link_get(port_id, &link); - if (ret < 0) { - G_LOG(ERR, "net: querying port %hhu failed with err - %s\n", - port_id, rte_strerror(-ret)); - return ret; - } - RTE_VERIFY(ret == 0); - - /* Link is up. */ - if (link.link_status) - break; - - G_LOG(ERR, "net: querying port %hhu, and link is down\n", - port_id); - - if (attempts > num_attempts_link_get) { - G_LOG(ERR, "net: giving up on port %hhu\n", port_id); - return -1; - } - - attempts++; - sleep(1); - } while (true); - - return 0; -} - static inline void gen_ipv6_link_local(struct gatekeeper_if *iface) { @@ -1995,10 +1937,12 @@ report_if_macs(const struct gatekeeper_if *iface) } static int -start_iface(struct gatekeeper_if *iface, unsigned int num_attempts_link_get) +start_iface(struct gatekeeper_if *iface) { - int ret = start_port(iface->id, NULL, num_attempts_link_get); + int ret = rte_eth_dev_start(iface->id); if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): failed to start interface (errno=%i): %s\n", + __func__, iface->name, -ret, rte_strerror(-ret)); destroy_iface(iface, IFACE_DESTROY_INIT); return ret; } @@ -2124,12 +2068,12 @@ static int start_network_stage2(void *arg) { struct net_config *net = arg; - int ret = start_iface(&net->front, net->num_attempts_link_get); + int ret = start_iface(&net->front); if (unlikely(ret < 0)) goto fail; if (net->back_iface_enabled) { - ret = start_iface(&net->back, net->num_attempts_link_get); + ret = start_iface(&net->back); if (unlikely(ret < 0)) goto destroy_front; } diff --git a/lua/gatekeeper/staticlib.lua b/lua/gatekeeper/staticlib.lua index 11e1eb4f..68a56006 100644 --- a/lua/gatekeeper/staticlib.lua +++ b/lua/gatekeeper/staticlib.lua @@ -290,7 +290,6 @@ struct gatekeeper_if { struct net_config { int back_iface_enabled; - unsigned int num_attempts_link_get; bool *numa_used; uint32_t log_level; uint32_t rotate_log_interval_sec; diff --git a/lua/net.lua b/lua/net.lua index f6b47813..3153d38d 100644 --- a/lua/net.lua +++ b/lua/net.lua @@ -37,7 +37,6 @@ return function (gatekeeper_server) -- These variables are unlikely to need to be changed. local guarantee_random_entropy = false - local num_attempts_link_get = 5 local front_ipv6_default_hop_limits = 255 local back_ipv6_default_hop_limits = 255 local rotate_log_interval_sec = 60 * 60 -- (1 hour) @@ -55,7 +54,6 @@ return function (gatekeeper_server) -- local net_conf = staticlib.c.get_net_conf() - net_conf.num_attempts_link_get = num_attempts_link_get net_conf.log_level = log_level net_conf.rotate_log_interval_sec = rotate_log_interval_sec From c1e7a786d9502c7a602c85246d069ca8660c43cf Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 7 May 2024 11:09:31 -0400 Subject: [PATCH 6/9] lib/net: monitor state of the links Having log entries reporting changes in the state of the links eases diagnoses in production. This commit adds these log entries through the Link State Change (LSC) interuption that most NICs support. The information that a given NIC does not support the LSC interuption is logged, but its link state is not monitored. Since the callbacks of interuptions run in threads that are not logical cores, this commit introduces the macro MAIN_LOG() to Gatekeeper's log library to be used in these contexts. --- include/gatekeeper_log_ratelimit.h | 36 +++- include/gatekeeper_main.h | 27 ++- lib/log_ratelimit.c | 54 +++--- lib/net.c | 260 ++++++++++++++++++++++++++++- 4 files changed, 333 insertions(+), 44 deletions(-) diff --git a/include/gatekeeper_log_ratelimit.h b/include/gatekeeper_log_ratelimit.h index 14ba1393..96cb1d0c 100644 --- a/include/gatekeeper_log_ratelimit.h +++ b/include/gatekeeper_log_ratelimit.h @@ -74,7 +74,17 @@ void log_ratelimit_state_init(unsigned int lcore_id, uint32_t interval, * - 0: Success. * - Negative on error. */ -int rte_log_ratelimit(uint32_t level, uint32_t logtype, const char *format, ...) +int gatekeeper_log_ratelimit(uint32_t level, uint32_t logtype, + const char *format, ...) +#ifdef __GNUC__ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) + __attribute__((cold)) +#endif +#endif + __attribute__((format(printf, 3, 4))); + +int gatekeeper_log_main(uint32_t level, uint32_t logtype, + const char *format, ...) #ifdef __GNUC__ #if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 2)) __attribute__((cold)) @@ -86,9 +96,6 @@ int rte_log_ratelimit(uint32_t level, uint32_t logtype, const char *format, ...) int set_log_level_per_block(const char *block_name, uint32_t log_level); int set_log_level_per_lcore(unsigned int lcore_id, uint32_t log_level); -/* Get the block name for the corresponding lcore. */ -const char *get_block_name(unsigned int lcore_id); - struct log_ratelimit_state { uint64_t interval_cycles; uint32_t burst; @@ -97,12 +104,27 @@ struct log_ratelimit_state { uint64_t end; rte_atomic32_t log_level; char block_name[16]; - char str_date_time[32]; - uint64_t update_time_at; } __rte_cache_aligned; -/* Only use these variables in file lib/log_ratelimit.c and in macro G_LOG(). */ +struct log_thread_time { + char str_date_time[32]; + uint64_t update_time_at; +}; + +/* + * Only use these variables in file lib/log_ratelimit.c and in macros + * G_LOG() and MAIN_LOG(). + */ +RTE_DECLARE_PER_LCORE(struct log_thread_time, _log_thread_time); extern struct log_ratelimit_state log_ratelimit_states[RTE_MAX_LCORE]; extern bool log_ratelimit_enabled; +/* Get the block name for the corresponding lcore. */ +static inline const char *get_block_name(unsigned int lcore_id) +{ + return lcore_id < RTE_MAX_LCORE + ? log_ratelimit_states[lcore_id].block_name + : "NO-lcore"; +} + #endif /* _GATEKEEPER_LOG_RATELIMIT_H_ */ diff --git a/include/gatekeeper_main.h b/include/gatekeeper_main.h index 1cc1baed..69aa322f 100644 --- a/include/gatekeeper_main.h +++ b/include/gatekeeper_main.h @@ -35,25 +35,38 @@ #define BLOCK_LOGTYPE RTE_LOGTYPE_USER1 #define G_LOG_PREFIX "%s/%u %s %s " +#define G_LOG_MAIN "Main" #define G_LOG(level, fmt, ...) \ do { \ unsigned int __g_log_lcore_id = rte_lcore_id(); \ - const struct log_ratelimit_state *__g_log_lrs = \ - &log_ratelimit_states[__g_log_lcore_id]; \ - rte_log_ratelimit(RTE_LOG_ ## level, BLOCK_LOGTYPE, \ - G_LOG_PREFIX fmt, \ + gatekeeper_log_ratelimit(RTE_LOG_ ## level, \ + BLOCK_LOGTYPE, G_LOG_PREFIX fmt, \ likely(log_ratelimit_enabled) \ - ? __g_log_lrs->block_name \ - : "Main", \ + ? log_ratelimit_states[__g_log_lcore_id]\ + .block_name \ + : G_LOG_MAIN, \ __g_log_lcore_id, \ - __g_log_lrs->str_date_time, \ + RTE_PER_LCORE(_log_thread_time).str_date_time, \ #level \ __VA_OPT__(,) __VA_ARGS__); \ } while (0) #define G_LOG_CHECK(level) check_log_allowed(RTE_LOG_ ## level) +/* + * This macro should only be called in contexts other than logical cores + * because it is independent of functional blocks and is not rate limited. + * + * From logical cores, call G_LOG(). + */ +#define MAIN_LOG(level, fmt, ...) \ + gatekeeper_log_main(RTE_LOG_ ## level, BLOCK_LOGTYPE, \ + G_LOG_PREFIX fmt, G_LOG_MAIN, rte_gettid(), \ + RTE_PER_LCORE(_log_thread_time).str_date_time, \ + #level \ + __VA_OPT__(,) __VA_ARGS__) + extern volatile int exiting; #define ONE_SEC_IN_NANO_SEC (1000000000L) diff --git a/lib/log_ratelimit.c b/lib/log_ratelimit.c index 53a06577..fff5f9a9 100644 --- a/lib/log_ratelimit.c +++ b/lib/log_ratelimit.c @@ -27,8 +27,8 @@ #include "gatekeeper_main.h" #include "gatekeeper_log_ratelimit.h" +RTE_DEFINE_PER_LCORE(struct log_thread_time, _log_thread_time); struct log_ratelimit_state log_ratelimit_states[RTE_MAX_LCORE]; - bool log_ratelimit_enabled; void @@ -47,16 +47,17 @@ check_log_allowed(uint32_t level) #define NO_TIME_STR "NO TIME" static void -update_str_date_time(struct log_ratelimit_state *lrs, uint64_t now) +update_str_date_time(uint64_t now) { + struct log_thread_time *ttime = &RTE_PER_LCORE(_log_thread_time); struct timespec tp; struct tm *p_tm, time_info; uint64_t diff_ns; int ret; - RTE_BUILD_BUG_ON(sizeof(NO_TIME_STR) > sizeof(lrs->str_date_time)); + RTE_BUILD_BUG_ON(sizeof(NO_TIME_STR) > sizeof(ttime->str_date_time)); - if (likely(now < lrs->update_time_at)) { + if (likely(now < ttime->update_time_at)) { /* Fast path, that is, high log rate. */ return; } @@ -73,7 +74,7 @@ update_str_date_time(struct log_ratelimit_state *lrs, uint64_t now) if (unlikely(p_tm != &time_info)) goto no_time; - ret = strftime(lrs->str_date_time, sizeof(lrs->str_date_time), + ret = strftime(ttime->str_date_time, sizeof(ttime->str_date_time), "%Y-%m-%d %H:%M:%S", &time_info); if (unlikely(ret == 0)) goto no_time; @@ -83,12 +84,13 @@ update_str_date_time(struct log_ratelimit_state *lrs, uint64_t now) no_tp: tp.tv_nsec = 0; no_time: - strcpy(lrs->str_date_time, NO_TIME_STR); + strcpy(ttime->str_date_time, NO_TIME_STR); next_update: diff_ns = likely(tp.tv_nsec >= 0 && tp.tv_nsec < ONE_SEC_IN_NANO_SEC) ? (ONE_SEC_IN_NANO_SEC - tp.tv_nsec) : ONE_SEC_IN_NANO_SEC; /* C library bug! */ - lrs->update_time_at = now + (typeof(now))round(diff_ns * cycles_per_ns); + ttime->update_time_at = + now + (typeof(now))round(diff_ns * cycles_per_ns); } static void @@ -96,10 +98,11 @@ log_ratelimit_reset(struct log_ratelimit_state *lrs, uint64_t now) { lrs->printed = 0; if (lrs->suppressed > 0) { - update_str_date_time(lrs, now); + update_str_date_time(now); rte_log(RTE_LOG_NOTICE, BLOCK_LOGTYPE, G_LOG_PREFIX "%u log entries were suppressed during the last ratelimit interval\n", - lrs->block_name, rte_lcore_id(), lrs->str_date_time, + lrs->block_name, rte_lcore_id(), + RTE_PER_LCORE(_log_thread_time).str_date_time, "NOTICE", lrs->suppressed); } lrs->suppressed = 0; @@ -111,7 +114,6 @@ log_ratelimit_state_init(unsigned int lcore_id, uint32_t interval, uint32_t burst, uint32_t log_level, const char *block_name) { struct log_ratelimit_state *lrs; - uint64_t now; RTE_VERIFY(lcore_id < RTE_MAX_LCORE); @@ -124,10 +126,7 @@ log_ratelimit_state_init(unsigned int lcore_id, uint32_t interval, lrs->suppressed = 0; rte_atomic32_set(&lrs->log_level, log_level); strcpy(lrs->block_name, block_name); - lrs->str_date_time[0] = '\0'; - now = rte_rdtsc(); - lrs->update_time_at = now; - log_ratelimit_reset(lrs, now); + log_ratelimit_reset(lrs, rte_rdtsc()); } /* @@ -162,7 +161,8 @@ log_ratelimit_allow(struct log_ratelimit_state *lrs, uint64_t now) } int -rte_log_ratelimit(uint32_t level, uint32_t logtype, const char *format, ...) +gatekeeper_log_ratelimit(uint32_t level, uint32_t logtype, + const char *format, ...) { uint64_t now = rte_rdtsc(); /* Freeze current time. */ struct log_ratelimit_state *lrs = &log_ratelimit_states[rte_lcore_id()]; @@ -183,7 +183,20 @@ rte_log_ratelimit(uint32_t level, uint32_t logtype, const char *format, ...) return 0; log: - update_str_date_time(lrs, now); + update_str_date_time(now); + va_start(ap, format); + ret = rte_vlog(level, logtype, format, ap); + va_end(ap); + return ret; +} + +int +gatekeeper_log_main(uint32_t level, uint32_t logtype, const char *format, ...) +{ + va_list ap; + int ret; + + update_str_date_time(rte_rdtsc()); va_start(ap, format); ret = rte_vlog(level, logtype, format, ap); va_end(ap); @@ -214,12 +227,3 @@ set_log_level_per_lcore(unsigned int lcore_id, uint32_t log_level) rte_atomic32_set(&log_ratelimit_states[lcore_id].log_level, log_level); return 0; } - -const char * -get_block_name(unsigned int lcore_id) -{ - if (lcore_id >= RTE_MAX_LCORE) { - return "invalid lcore"; - } - return log_ratelimit_states[lcore_id].block_name; -} diff --git a/lib/net.c b/lib/net.c index 2974e5ba..ee5037b2 100644 --- a/lib/net.c +++ b/lib/net.c @@ -532,12 +532,44 @@ get_queue_id(struct gatekeeper_if *iface, enum queue_type ty, return queues[lcore]; } +/* + * Guarantee that rte_eth_dev_callback_register() and + * rte_eth_dev_callback_unregister() are called with the exact same parameters. + */ + +static int +lsc_event_callback(uint16_t port_id, enum rte_eth_event_type event, + void *cb_arg, void *ret_param); + +static inline int +register_callback_for_lsc(struct gatekeeper_if *iface, uint16_t port_id) +{ + return rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_LSC, + lsc_event_callback, iface); +} + +static inline int +unregister_callback_for_lsc(struct gatekeeper_if *iface, uint16_t port_id) +{ + return rte_eth_dev_callback_unregister(port_id, RTE_ETH_EVENT_INTR_LSC, + lsc_event_callback, iface); +} + static void -close_iface_ports(struct gatekeeper_if *iface, uint8_t nb_ports) +close_iface_ports(struct gatekeeper_if *iface) { uint8_t i; - for (i = 0; i < nb_ports; i++) - rte_eth_dev_close(iface->ports[i]); + for (i = 0; i < iface->num_ports; i++) { + uint16_t port_id = iface->ports[i]; + + /* + * It's safe to unregister a callback that hasn't been + * registered before. + */ + unregister_callback_for_lsc(iface, port_id); + + rte_eth_dev_close(port_id); + } } enum iface_destroy_cmd { @@ -603,13 +635,21 @@ destroy_iface(struct gatekeeper_if *iface, enum iface_destroy_cmd cmd) /* Stop and close bonded port, if needed. */ if (iface_bonded(iface)) { char if_name[IF_NAMESIZE]; - int ret = bonded_if_name(if_name, iface); + int ret; + + /* + * It's safe to unregister a callback that hasn't been + * registered before. + */ + unregister_callback_for_lsc(iface, iface->id); + + ret = bonded_if_name(if_name, iface); if (likely(ret == 0)) rte_eth_bond_free(if_name); } /* Close and free interface ports. */ - close_iface_ports(iface, iface->num_ports); + close_iface_ports(iface); rte_free(iface->ports); iface->ports = NULL; /* FALLTHROUGH */ @@ -1322,6 +1362,20 @@ check_if_checksums(struct gatekeeper_if *iface, return 0; } +static int +check_if_interruption(struct gatekeeper_if *iface, + const struct rte_eth_dev_info *dev_info, struct rte_eth_conf *port_conf) +{ + RTE_SET_USED(iface); + /* + * Do not log the fact that an interface does not support the LSC + * interruption since this is already logged in monitor_port(). + */ + port_conf->intr_conf.lsc = + !!(*dev_info->dev_flags & RTE_ETH_DEV_INTR_LSC); + return 0; +} + static int check_if_offloads(struct gatekeeper_if *iface, struct rte_eth_conf *port_conf) { @@ -1345,6 +1399,10 @@ check_if_offloads(struct gatekeeper_if *iface, struct rte_eth_conf *port_conf) if (unlikely(ret < 0)) return ret; + ret = check_if_interruption(iface, &dev_info, port_conf); + if (unlikely(ret < 0)) + return ret; + return 0; } @@ -1642,6 +1700,191 @@ log_if_name(char *if_name, size_t len, const struct gatekeeper_if *iface, } } +/* + * ATTENTION: This function is called in the interrupt host thread, + * which is not associated to an lcore, therefore it must call MAIN_LOG() + * instead of G_LOG(). + */ +static void +get_str_members(char *str_members, size_t size, uint16_t *members, + uint16_t count) +{ + unsigned int i, total = 0; + + if (unlikely(size <= 0)) + return; + str_members[0] = '\0'; + + for (i = 0; i < count; i++) { + size_t remainder = size - total; + int ret = snprintf(str_members + total, remainder, + "%u%s", members[i], i + 1 < count ? ", " : ""); + if (unlikely(ret < 0)) { + MAIN_LOG(ERR, "%s(): snprintf() failed (errno=%i): %s\n", + __func__, errno, strerror(errno)); + return; + } + total += ret; + if (unlikely((size_t)ret >= remainder)) { + MAIN_LOG(CRIT, "%s(): bug: str_members' size must be more than %u bytes\n", + __func__, total + 1 /* Accounting for '\0'. */); + str_members[size - 1] = '\0'; + return; + } + } +} + +#define STR_ERROR_MEMBERS "ERROR" +/* + * ATTENTION: This function is called in the interrupt host thread, + * which is not associated to an lcore, therefore it must call MAIN_LOG() + * instead of G_LOG(). + */ +static int +lsc_event_callback(uint16_t port_id, enum rte_eth_event_type event, + void *cb_arg, void *ret_param) +{ + const struct gatekeeper_if *iface = cb_arg; + char if_name[MAX_LOG_IF_NAME]; + struct rte_eth_link link; + char link_status_text[RTE_ETH_LINK_MAX_STR_LEN]; + int ret; + + RTE_SET_USED(ret_param); + + log_if_name(if_name, sizeof(if_name), iface, port_id); + + if (unlikely(event != RTE_ETH_EVENT_INTR_LSC)) { + MAIN_LOG(CRIT, "%s(%s): bug: unexpected event %i\n", + __func__, if_name, event); + return -EFAULT; + } + + ret = rte_eth_link_get_nowait(port_id, &link); + if (unlikely(ret < 0)) { + MAIN_LOG(ERR, "%s(%s): cannot get link status (errno=%i): %s\n", + __func__, if_name, -ret, rte_strerror(-ret)); + return ret; + } + + ret = rte_eth_link_to_str(link_status_text, sizeof(link_status_text), + &link); + if (unlikely(ret < 0)) { + MAIN_LOG(ERR, "%s(%s): cannot get status string (errno=%i): %s\n", + __func__, if_name, -ret, rte_strerror(-ret)); + return ret; + } + + if (iface_bonded(iface) && iface->id == port_id) { + uint16_t members[RTE_MAX_ETHPORTS]; + char str_members[16 * RTE_DIM(members)]; + ret = rte_eth_bond_active_members_get(iface->id, members, + RTE_DIM(members)); + if (unlikely(ret < 0)) { + RTE_BUILD_BUG_ON(sizeof(STR_ERROR_MEMBERS) > + sizeof(str_members)); + MAIN_LOG(ERR, "%s(%s): cannot get active members (errno=%i): %s\n", + __func__, if_name, -ret, rte_strerror(-ret)); + strcpy(str_members, STR_ERROR_MEMBERS); + } else { + get_str_members(str_members, sizeof(str_members), + members, ret); + } + MAIN_LOG(NOTICE, "%s(%s): active members: %s; %s\n", + __func__, if_name, str_members, link_status_text); + return 0; + } + + MAIN_LOG(NOTICE, "%s(%s): %s\n", __func__, if_name, link_status_text); + + if (iface_bonded(iface)) { + /* + * A member's link changed, but the link of the bond may not + * change. Thus, force an "event" for the bonded interface. + * + * The following call works because this callback is added + * _after_ the bonded interface is created, so + * the bonded interface receives this event _before_ + * this callback and updates its state. + */ + lsc_event_callback(iface->id, event, cb_arg, NULL); + } + + return 0; +} + +/* + * RETURN + * false @port_id does not support the LSC interruption. + * No monitoring is added. + * true @port_id is being monitored from now on. + * <0 An errror happened. + */ +static int +monitor_port(struct gatekeeper_if *iface, uint16_t port_id) +{ + char if_name[MAX_LOG_IF_NAME]; + struct rte_eth_dev_info dev_info; + int ret; + + log_if_name(if_name, sizeof(if_name), iface, port_id); + + ret = rte_eth_dev_info_get(port_id, &dev_info); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot obtain interface information (errno=%i): %s\n", + __func__, if_name, -ret, rte_strerror(-ret)); + return ret; + } + + if (unlikely(!(*dev_info.dev_flags & RTE_ETH_DEV_INTR_LSC))) { + G_LOG(WARNING, "%s(%s): no support for LSC interruption\n", + __func__, if_name); + return false; + } + + ret = register_callback_for_lsc(iface, port_id); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot register LSC callback (errno=%i): %s\n", + __func__, if_name, -ret, rte_strerror(-ret)); + return ret; + } + + return true; +} + +static int +monitor_links(struct gatekeeper_if *iface) +{ + bool monitoring_all_members = true; + unsigned int i; + int ret; + + for (i = 0; i < iface->num_ports; i++) { + ret = monitor_port(iface, iface->ports[i]); + if (unlikely(ret < 0)) + goto error; + monitoring_all_members = monitoring_all_members && ret; + } + + /* + * There is no need to monitor a bonded interface when all of + * its members are already being monitored because each member reports + * on the bonded interface. + */ + if (iface_bonded(iface) && !monitoring_all_members) { + ret = monitor_port(iface, iface->id); + if (unlikely(ret < 0)) + goto error; + } + + return 0; + +error: + while (i > 0) + unregister_callback_for_lsc(iface, iface->ports[--i]); + return ret; +} + static int init_iface(struct gatekeeper_if *iface) { @@ -1711,6 +1954,13 @@ init_iface(struct gatekeeper_if *iface) goto close_ports; } + ret = monitor_links(iface); + if (unlikely(ret < 0)) { + G_LOG(ERR, "%s(%s): cannot monitor links (errno=%i): %s\n", + __func__, iface->name, -ret, strerror(-ret)); + goto close_ports; + } + return 0; close_ports: From 12244dd8e0496a50b948e1b85bc517b3000acfe7 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Wed, 8 May 2024 11:30:12 -0400 Subject: [PATCH 7/9] sol: avoid segmentation fault at boot time If req_queue_init() fails before _all_ request queues are initiated, sol_stage2() calls cleanup_sol(), which, in turn, triggers a segmentation fault. This commit solves this problem by adding list_initiated() to include/list.h, and using it to test if the request queues are initiated in cleanup_sol() before freeing the associated resources. --- include/list.h | 6 ++++++ sol/main.c | 28 ++++++++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/include/list.h b/include/list.h index 8db3095c..d948fa9c 100644 --- a/include/list.h +++ b/include/list.h @@ -41,6 +41,12 @@ INIT_LIST_HEAD(struct list_head *list) list->prev = list; } +static inline int +list_initiated(const struct list_head *head) +{ + return head->next != NULL && head->prev != NULL; +} + /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. diff --git a/sol/main.c b/sol/main.c index 39fd14a2..97734465 100644 --- a/sol/main.c +++ b/sol/main.c @@ -507,23 +507,31 @@ cleanup_sol(struct sol_config *sol_conf) goto free_sol_conf; for (i = 0; i < sol_conf->num_lcores; i++) { - struct req_queue *req_queue = &sol_conf->instances[i].req_queue; + struct sol_instance *inst = &sol_conf->instances[i]; + struct req_queue *req_queue = &inst->req_queue; struct rte_mbuf *entry, *next; - list_for_each_entry_safe_m(entry, next, &req_queue->head) { - list_del(mbuf_to_list(entry)); - rte_pktmbuf_free(entry); - req_queue->len--; + if (list_initiated(&req_queue->head)) { + list_for_each_entry_safe_m(entry, next, + &req_queue->head) { + list_del(mbuf_to_list(entry)); + rte_pktmbuf_free(entry); + req_queue->len--; + } + + if (unlikely(req_queue->len > 0)) { + G_LOG(CRIT, "%s(): bug: removing all requests from the priority queue on cleanup leaves the queue length at %"PRIu32" at lcore %u\n", + __func__, req_queue->len, + sol_conf->lcores[i]); + } } - if (req_queue->len > 0) - G_LOG(NOTICE, "Bug: removing all requests from the priority queue on cleanup leaves the queue length at %"PRIu32" at lcore %u\n", - req_queue->len, sol_conf->lcores[i]); - - rte_ring_free(sol_conf->instances[i].ring); + rte_ring_free(inst->ring); + inst->ring = NULL; } rte_free(sol_conf->instances); + sol_conf->instances = NULL; free_sol_conf: rte_free(sol_conf); From ce3864d0867d1215fda5555d09610742b7decbf7 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Mon, 13 May 2024 18:50:00 -0400 Subject: [PATCH 8/9] sol: require parameter req_channel_bw_mbps The original configuration of SOL blocks assumes that the bandwidth of the back interface of a Gatekeeper server is representative of the bandwidth of the destination network. This assumption does not hold in production deployments of Gatekeeper. For example, Gatekeeper servers typically operate in networks with higher speeds (e.g., 40Gbps) than the protected destination (e.g., 10Gbps). This commit introduces the parameter destination_bw_gbps to address this reality. --- include/gatekeeper_sol.h | 7 -- lua/gatekeeper/staticlib.lua | 1 - lua/sol.lua | 5 +- sol/main.c | 137 ++++++++++------------------------- 4 files changed, 42 insertions(+), 108 deletions(-) diff --git a/include/gatekeeper_sol.h b/include/gatekeeper_sol.h index 16454b8e..95479daf 100644 --- a/include/gatekeeper_sol.h +++ b/include/gatekeeper_sol.h @@ -102,13 +102,6 @@ struct sol_config { /* Maximum number of requests to store in priority queue at once. */ unsigned int pri_req_max_len; - /* - * Bandwidth limit for the priority queue of requests, - * as a percentage of the capacity of the link. Must - * be > 0 and < 1. - */ - double req_bw_rate; - /* Maximum request enqueue/dequeue size. */ unsigned int enq_burst_size; unsigned int deq_burst_size; diff --git a/lua/gatekeeper/staticlib.lua b/lua/gatekeeper/staticlib.lua index 68a56006..73dd3101 100644 --- a/lua/gatekeeper/staticlib.lua +++ b/lua/gatekeeper/staticlib.lua @@ -414,7 +414,6 @@ struct dynamic_config { struct sol_config { unsigned int pri_req_max_len; - double req_bw_rate; unsigned int enq_burst_size; unsigned int deq_burst_size; double tb_rate_approx_err; diff --git a/lua/sol.lua b/lua/sol.lua index 6a766341..453c19c2 100644 --- a/lua/sol.lua +++ b/lua/sol.lua @@ -6,6 +6,7 @@ return function (net_conf, sol_lcores) -- These parameters should likely be initially changed. local log_level = staticlib.c.RTE_LOG_DEBUG + local destination_bw_gbps = 1 -- XXX #155 These parameters should only be changed for performance reasons. local log_ratelimit_interval_ms = 5000 @@ -17,7 +18,6 @@ return function (net_conf, sol_lcores) -- These variables are unlikely to need to be changed. local tb_rate_approx_err = 1e-7 - local req_channel_bw_mbps = 0.0 -- -- End configuration of SOL block. @@ -35,12 +35,11 @@ return function (net_conf, sol_lcores) sol_conf.log_ratelimit_interval_ms = log_ratelimit_interval_ms sol_conf.log_ratelimit_burst = log_ratelimit_burst sol_conf.pri_req_max_len = pri_req_max_len - sol_conf.req_bw_rate = req_bw_rate sol_conf.enq_burst_size = enq_burst_size sol_conf.deq_burst_size = deq_burst_size sol_conf.tb_rate_approx_err = tb_rate_approx_err - sol_conf.req_channel_bw_mbps = req_channel_bw_mbps + sol_conf.req_channel_bw_mbps = destination_bw_gbps * 1000 * req_bw_rate local ret = staticlib.c.run_sol(net_conf, sol_conf) if ret < 0 then diff --git a/sol/main.c b/sol/main.c index 97734465..88f41e10 100644 --- a/sol/main.c +++ b/sol/main.c @@ -367,46 +367,6 @@ mbits_to_bytes(double mbps) return mbps * (1000 * 1000 / 8); } -/* - * Retrieve the link speed of a Gatekeeper interface. If it - * is a bonded interface, the link speeds are summed. - */ -static int -iface_speed_bytes(struct gatekeeper_if *iface, uint64_t *link_speed_bytes) -{ - uint64_t link_speed_mbits = 0; - uint8_t i; - int ret; - - RTE_VERIFY(link_speed_bytes != NULL); - - for (i = 0; i < iface->num_ports; i++) { - struct rte_eth_link link; - ret = rte_eth_link_get(iface->ports[i], &link); - if (ret < 0) { - G_LOG(ERR, "net: querying port %hhu failed with err - %s\n", - iface->ports[i], rte_strerror(-ret)); - goto err; - } - - if (link.link_speed == RTE_ETH_SPEED_NUM_NONE || - link.link_speed == RTE_ETH_SPEED_NUM_UNKNOWN) { - ret = -ENOTSUP; - goto err; - } - - link_speed_mbits += link.link_speed; - } - - /* Convert to bytes per second. */ - *link_speed_bytes = mbits_to_bytes(link_speed_mbits); - return 0; - -err: - *link_speed_bytes = 0; - return ret; -} - /* * @sol_conf is allocated using rte_calloc_socket(), so initializations * to 0 are not strictly necessary in this function. @@ -414,7 +374,6 @@ iface_speed_bytes(struct gatekeeper_if *iface, uint64_t *link_speed_bytes) static int req_queue_init(struct sol_config *sol_conf) { - uint64_t link_speed_bytes; double max_credit_bytes_precise; double cycles_per_byte_precise; uint64_t cycles_per_byte_floor; @@ -422,24 +381,9 @@ req_queue_init(struct sol_config *sol_conf) uint32_t a, b; int ret, i; - /* Find link speed in bytes, even for a bonded interface. */ - ret = iface_speed_bytes(&sol_conf->net->back, &link_speed_bytes); - if (ret == 0) { - G_LOG(NOTICE, - "Back interface link speed: %"PRIu64" bytes per second\n", - link_speed_bytes); - /* Keep max number of bytes a float for later calculations. */ - max_credit_bytes_precise = - sol_conf->req_bw_rate * link_speed_bytes; - } else { - G_LOG(NOTICE, "Back interface link speed: undefined\n"); - if (sol_conf->req_channel_bw_mbps <= 0) { - G_LOG(ERR, "When link speed on back interface is undefined, parameter req_channel_bw_mbps must be calculated and defined\n"); - return -1; - } - max_credit_bytes_precise = - mbits_to_bytes(sol_conf->req_channel_bw_mbps); - } + /* Convert to bytes per second. */ + max_credit_bytes_precise = + mbits_to_bytes(sol_conf->req_channel_bw_mbps); max_credit_bytes_precise /= sol_conf->num_lcores; @@ -459,15 +403,16 @@ req_queue_init(struct sol_config *sol_conf) cycles_per_byte_precise - cycles_per_byte_floor, sol_conf->tb_rate_approx_err, &a, &b); if (ret < 0) { - G_LOG(ERR, "Could not approximate the request queue's allocated bandwidth\n"); + G_LOG(ERR, "%s(): could not approximate the request queue's allocated bandwidth\n", + __func__); return ret; } /* Add integer number of cycles per byte to numerator. */ a += cycles_per_byte_floor * b; - G_LOG(NOTICE, "Cycles per byte (%f) represented as a rational: %u / %u\n", - cycles_per_byte_precise, a, b); + G_LOG(NOTICE, "%s(): cycles per byte (%f) represented as a rational: %u / %u\n", + __func__, cycles_per_byte_precise, a, b); now = rte_rdtsc(); @@ -665,8 +610,10 @@ run_sol(struct net_config *net_conf, struct sol_config *sol_conf) int ret, i; uint16_t front_inc; - if (net_conf == NULL || sol_conf == NULL) { - ret = -1; + if (unlikely(net_conf == NULL || sol_conf == NULL)) { + G_LOG(ERR, "%s(): net_conf = %p or sol_conf = %p cannot be NULL\n", + __func__, net_conf, sol_conf); + ret = -EINVAL; goto out; } @@ -677,51 +624,51 @@ run_sol(struct net_config *net_conf, struct sol_config *sol_conf) sol_conf->log_level, "SOL"); } - if (!net_conf->back_iface_enabled) { - G_LOG(ERR, "Back interface is required\n"); - ret = -1; + if (unlikely(!net_conf->back_iface_enabled)) { + G_LOG(ERR, "%s(): back interface is required\n", __func__); + ret = -EINVAL; goto out; } - if (sol_conf->pri_req_max_len == 0) { - G_LOG(ERR, - "Priority queue max len must be greater than 0\n"); - ret = -1; + if (unlikely(sol_conf->pri_req_max_len == 0)) { + G_LOG(ERR, "%s(): the parameter pri_req_max_len = %u must be greater than 0\n", + __func__, sol_conf->pri_req_max_len); + ret = -EINVAL; goto out; } - if (sol_conf->enq_burst_size == 0 || sol_conf->deq_burst_size == 0) { - G_LOG(ERR, "Priority queue enqueue and dequeue sizes must both be greater than 0\n"); - ret = -1; + if (unlikely(sol_conf->enq_burst_size == 0 || + sol_conf->deq_burst_size == 0)) { + G_LOG(ERR, "%s(): the paramters enq_burst_size = %u and deq_burst_size = %u must both be greater than 0\n", + __func__, sol_conf->enq_burst_size, + sol_conf->deq_burst_size); + ret = -EINVAL; goto out; } - if (sol_conf->deq_burst_size > sol_conf->pri_req_max_len || - sol_conf->enq_burst_size > sol_conf->pri_req_max_len) { - G_LOG(ERR, "Request queue enqueue and dequeue sizes must be less than the max length of the request queue\n"); - ret = -1; + if (unlikely(sol_conf->enq_burst_size > sol_conf->pri_req_max_len || + sol_conf->deq_burst_size > sol_conf->pri_req_max_len)) { + G_LOG(ERR, "%s(): the paramters enq_burst_size = %u and deq_burst_size = %u must both be less than or equal to the parameter pri_req_max_len = %u\n", + __func__, sol_conf->enq_burst_size, + sol_conf->deq_burst_size, sol_conf->pri_req_max_len); + ret = -EINVAL; goto out; } - if (sol_conf->req_bw_rate <= 0 || sol_conf->req_bw_rate >= 1) { - G_LOG(ERR, - "Request queue bandwidth must be in range (0, 1), but it has been specified as %f\n", - sol_conf->req_bw_rate); - ret = -1; + if (unlikely(sol_conf->req_channel_bw_mbps <= 0)) { + G_LOG(ERR, "%s(): the parameter req_channel_bw_mbps = %f must be greater than 0\n", + __func__, sol_conf->req_channel_bw_mbps); + ret = -EINVAL; goto out; } - if (sol_conf->req_channel_bw_mbps < 0) { - G_LOG(ERR, - "Request channel bandwidth in Mbps must be greater than 0 when the NIC doesn't supply guaranteed bandwidth, but is %f\n", - sol_conf->req_channel_bw_mbps); - ret = -1; + if (unlikely(sol_conf->num_lcores <= 0)) { + G_LOG(ERR, "%s(): the parameter num_lcores = %i must be greater than 0\n", + __func__, sol_conf->num_lcores); + ret = -EINVAL; goto out; } - if (sol_conf->num_lcores <= 0) - goto success; - /* * Need to account for the packets in the following scenarios: * @@ -758,8 +705,8 @@ run_sol(struct net_config *net_conf, struct sol_config *sol_conf) } sol_conf->net = net_conf; - - goto success; + rte_atomic32_init(&sol_conf->ref_cnt); + return 0; stage2: pop_n_at_stage2(1); @@ -769,10 +716,6 @@ run_sol(struct net_config *net_conf, struct sol_config *sol_conf) net_conf->front.total_pkt_burst -= front_inc; out: return ret; - -success: - rte_atomic32_init(&sol_conf->ref_cnt); - return 0; } /* From df1d2431bcebba7566497e5d3a1aa819c60453d3 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 21 May 2024 16:13:20 -0400 Subject: [PATCH 9/9] lib/net: review the size of mbuf pools This commit fixes two miscalculations in the size of mbuf pools: 1. calculate_mempool_config_para() was not accounting for multiple members in bonded interfaces; and 2. Although unlikely, a GK or GT instance might receive all the packets, so the instances must account for this worst case. --- gk/main.c | 8 ++++++-- gt/main.c | 8 ++++++-- lib/net.c | 43 ++++++++++++++++++++++++++++++------------- 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/gk/main.c b/gk/main.c index a176187b..eadd3598 100644 --- a/gk/main.c +++ b/gk/main.c @@ -2541,9 +2541,13 @@ gk_stage1(void *arg) num_mbuf = calculate_mempool_config_para("gk", gk_conf->net, gk_conf->front_max_pkt_burst + gk_conf->back_max_pkt_burst + + /* + * One cannot divide the sum below per gk_conf->num_lcores + * because, though unlikely, it might happen that + * all packets go to a single instance. + */ (gk_conf->net->front.total_pkt_burst + - gk_conf->net->back.total_pkt_burst + gk_conf->num_lcores - 1) / - gk_conf->num_lcores); + gk_conf->net->back.total_pkt_burst)); sol_conf = gk_conf->sol_conf; for (i = 0; i < gk_conf->num_lcores; i++) { diff --git a/gt/main.c b/gt/main.c index cceb7d14..335a1198 100644 --- a/gt/main.c +++ b/gt/main.c @@ -2067,8 +2067,12 @@ init_gt_instances(struct gt_config *gt_conf) gt_conf->net, gt_conf->max_pkt_burst + gt_conf->frag_max_entries + gt_conf->max_pkt_burst + gt_conf->max_ggu_notify_pkts + - (gt_conf->net->front.total_pkt_burst + - gt_conf->num_lcores - 1) / gt_conf->num_lcores); + /* + * One cannot divide the sum below per gt_conf->num_lcores + * because, though unlikely, it might happen that + * all packets go to a single instance. + */ + gt_conf->net->front.total_pkt_burst); /* Set up queue identifiers now for RSS, before instances start. */ for (i = 0; i < gt_conf->num_lcores; i++) { diff --git a/lib/net.c b/lib/net.c index ee5037b2..eeaa9bd4 100644 --- a/lib/net.c +++ b/lib/net.c @@ -2234,25 +2234,42 @@ start_iface(struct gatekeeper_if *iface) return ret; } +static inline uint32_t +if_rx_desc(const struct gatekeeper_if *iface) +{ + return (uint32_t)iface->num_ports * iface->num_rx_desc; +} + +static inline uint32_t +if_tx_desc(const struct gatekeeper_if *iface) +{ + return (uint32_t)iface->num_ports * iface->num_tx_desc; +} + unsigned int calculate_mempool_config_para(const char *block_name, struct net_config *net_conf, unsigned int total_pkt_burst) { - unsigned int num_mbuf; - /* - * The total number of receive descriptors to - * allocate per lcore for the receive ring over all interfaces. + * The total number of receive descriptors to allocate per lcore + * for the receive ring of the front interface. */ - uint16_t total_rx_desc = net_conf->front.num_rx_desc + - (net_conf->back_iface_enabled ? net_conf->back.num_rx_desc : 0); + uint32_t total_rx_desc = if_rx_desc(&net_conf->front); /* - * The total number of transmit descriptors to - * allocate per lcore for the transmit ring over all interfaces. + * The total number of transmit descriptors to allocate per lcore + * for the transmit ring of the front interface. */ - uint16_t total_tx_desc = net_conf->front.num_tx_desc + - (net_conf->back_iface_enabled ? net_conf->back.num_tx_desc : 0); + uint32_t total_tx_desc = if_tx_desc(&net_conf->front); + + uint32_t max_num_pkt; + unsigned int num_mbuf; + + if (net_conf->back_iface_enabled) { + /* Account for the back interface. */ + total_rx_desc += if_rx_desc(&net_conf->back); + total_tx_desc += if_tx_desc(&net_conf->back); + } /* * The number of elements in the mbuf pool. @@ -2261,7 +2278,7 @@ calculate_mempool_config_para(const char *block_name, * It's the number of RX descriptors, the number of TX descriptors, * and the number of packet burst buffers. */ - uint32_t max_num_pkt = total_rx_desc + total_tx_desc + total_pkt_burst; + max_num_pkt = total_rx_desc + total_tx_desc + total_pkt_burst; /* * The optimum size (in terms of memory usage) for a mempool is when @@ -2269,8 +2286,8 @@ calculate_mempool_config_para(const char *block_name, */ num_mbuf = rte_align32pow2(max_num_pkt) - 1; - G_LOG(NOTICE, "%s: %s: total_pkt_burst = %hu packets, total_rx_desc = %hu descriptors, total_tx_desc = %hu descriptors, max_num_pkt = %u packets, num_mbuf = %u packets.\n", - block_name, __func__, total_pkt_burst, total_rx_desc, + G_LOG(NOTICE, "%s(%s): total_pkt_burst = %u packets, total_rx_desc = %u descriptors, total_tx_desc = %u descriptors, max_num_pkt = %u packets, num_mbuf = %u packets\n", + __func__, block_name, total_pkt_burst, total_rx_desc, total_tx_desc, max_num_pkt, num_mbuf); return num_mbuf;