Skip to content

Commit

Permalink
unix: support SO_REUSEPORT with load balancing for UDP (libuv#4419)
Browse files Browse the repository at this point in the history
Signed-off-by: Andy Pan <[email protected]>
  • Loading branch information
panjf2000 authored Jun 20, 2024
1 parent eb5af8e commit ba24986
Show file tree
Hide file tree
Showing 13 changed files with 467 additions and 110 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,7 @@ if(LIBUV_BUILD_TESTS)
test/test-udp-send-unreachable.c
test/test-udp-try-send.c
test/test-udp-recv-in-a-row.c
test/test-udp-reuseport.c
test/test-uname.c
test/test-walk-handles.c
test/test-watcher-cross-stop.c)
Expand Down
1 change: 1 addition & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ test_run_tests_SOURCES = test/blackhole-server.c \
test/test-udp-send-unreachable.c \
test/test-udp-try-send.c \
test/test-udp-recv-in-a-row.c \
test/test-udp-reuseport.c \
test/test-uname.c \
test/test-walk-handles.c \
test/test-watcher-cross-stop.c
Expand Down
3 changes: 2 additions & 1 deletion docs/src/tcp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,8 @@ API
.. note::
``UV_TCP_REUSEPORT`` flag is available only on Linux 3.9+, DragonFlyBSD 3.6+,
FreeBSD 12.0+, Solaris 11.4, and AIX 7.2.5+ at the moment.
FreeBSD 12.0+, Solaris 11.4, and AIX 7.2.5+ at the moment. On other platforms
this function will return a UV_ENOTSUP error.
.. c:function:: int uv_tcp_getsockname(const uv_tcp_t* handle, struct sockaddr* name, int* namelen)
Expand Down
57 changes: 42 additions & 15 deletions docs/src/udp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,21 @@ Data types
/* Disables dual stack mode. */
UV_UDP_IPV6ONLY = 1,
/*
* Indicates message was truncated because read buffer was too small. The
* remainder was discarded by the OS. Used in uv_udp_recv_cb.
*/
* Indicates message was truncated because read buffer was too small. The
* remainder was discarded by the OS. Used in uv_udp_recv_cb.
*/
UV_UDP_PARTIAL = 2,
/*
* Indicates if SO_REUSEADDR will be set when binding the handle in
* uv_udp_bind.
* This sets the SO_REUSEPORT socket flag on the BSDs and OS X. On other
* Unix platforms, it sets the SO_REUSEADDR flag. What that means is that
* multiple threads or processes can bind to the same address without error
* (provided they all set the flag) but only the last one to bind will receive
* any traffic, in effect "stealing" the port from the previous listener.
*/
* Indicates if SO_REUSEADDR will be set when binding the handle.
* This sets the SO_REUSEPORT socket flag on the BSDs (except for
* DragonFlyBSD), OS X, and other platforms where SO_REUSEPORT does
* not support load balancing, which is the opposite of what
* UV_UDP_REUSEPORT provides. On other Unix platforms, it sets the
* SO_REUSEADDR flag. What that means is that multiple threads or
* processes can bind to the same address without error (provided
* they all set the flag) but only the last one to bind will receive
* any traffic, in effect "stealing" the port from the previous listener.
*/
UV_UDP_REUSEADDR = 4,
/*
* Indicates that the message was received by recvmmsg, so the buffer provided
Expand All @@ -62,8 +64,20 @@ Data types
*/
UV_UDP_LINUX_RECVERR = 32,
/*
* Indicates that recvmmsg should be used, if available.
*/
* Indicates if SO_REUSEPORT will be set when binding the handle.
* This sets the SO_REUSEPORT socket option on supported platforms.
* Unlike UV_UDP_REUSEADDR, this flag will make multiple threads or
* processes that are binding to the same address and port "share"
* the port, which means incoming datagrams are distributed across
* the receiving sockets among threads or processes.
*
* This flag is available only on Linux 3.9+, DragonFlyBSD 3.6+,
* FreeBSD 12.0+, Solaris 11.4, and AIX 7.2.5+ for now.
*/
UV_UDP_REUSEPORT = 64,
/*
* Indicates that recvmmsg should be used, if available.
*/
UV_UDP_RECVMMSG = 256
};

Expand Down Expand Up @@ -186,11 +200,24 @@ API
with the address and port to bind to.
:param flags: Indicate how the socket will be bound,
``UV_UDP_IPV6ONLY``, ``UV_UDP_REUSEADDR``, and ``UV_UDP_RECVERR``
are supported.
``UV_UDP_IPV6ONLY``, ``UV_UDP_REUSEADDR``, ``UV_UDP_REUSEPORT``,
and ``UV_UDP_RECVERR`` are supported.
:returns: 0 on success, or an error code < 0 on failure.
.. versionchanged:: 1.49.0 added the ``UV_UDP_REUSEPORT`` flag.
.. note::
The ``UV_UDP_REUSEPORT`` flag is available only on Linux 3.9+, DragonFlyBSD 3.6+,
FreeBSD 12.0+, Solaris 11.4, and AIX 7.2.5+ at the moment. On other platforms
this function will return a UV_ENOTSUP error.
On platforms where ``SO_REUSEPORT`` supports load balancing, specifying both
``UV_UDP_REUSEADDR`` and ``UV_UDP_REUSEPORT`` in flags is allowed, and
``SO_REUSEPORT`` will always override the behavior of ``SO_REUSEADDR``.
On platforms where ``SO_REUSEPORT`` does not support load balancing,
specifying both ``UV_UDP_REUSEADDR`` and ``UV_UDP_REUSEPORT`` in flags will fail,
returning a UV_ENOTSUP error.
.. c:function:: int uv_udp_connect(uv_udp_t* handle, const struct sockaddr* addr)
Associate the UDP handle to a remote address and port, so every
Expand Down
23 changes: 19 additions & 4 deletions include/uv.h
Original file line number Diff line number Diff line change
Expand Up @@ -656,10 +656,13 @@ enum uv_udp_flags {
UV_UDP_PARTIAL = 2,
/*
* Indicates if SO_REUSEADDR will be set when binding the handle.
* This sets the SO_REUSEPORT socket flag on the BSDs and OS X. On other
* Unix platforms, it sets the SO_REUSEADDR flag. What that means is that
* multiple threads or processes can bind to the same address without error
* (provided they all set the flag) but only the last one to bind will receive
* This sets the SO_REUSEPORT socket flag on the BSDs (except for
* DragonFlyBSD), OS X, and other platforms where SO_REUSEPORT does
* not support load balancing, which is the opposite of what
* UV_UDP_REUSEPORT provides. On other Unix platforms, it sets the
* SO_REUSEADDR flag. What that means is that multiple threads or
* processes can bind to the same address without error (provided
* they all set the flag) but only the last one to bind will receive
* any traffic, in effect "stealing" the port from the previous listener.
*/
UV_UDP_REUSEADDR = 4,
Expand All @@ -682,6 +685,18 @@ enum uv_udp_flags {
* This flag is no-op on platforms other than Linux.
*/
UV_UDP_LINUX_RECVERR = 32,
/*
* Indicates if SO_REUSEPORT will be set when binding the handle.
* This sets the SO_REUSEPORT socket option on supported platforms.
* Unlike UV_UDP_REUSEADDR, this flag will make multiple threads or
* processes that are binding to the same address and port "share"
* the port, which means incoming datagrams are distributed across
* the receiving sockets among threads or processes.
*
* This flag is available only on Linux 3.9+, DragonFlyBSD 3.6+,
* FreeBSD 12.0+, Solaris 11.4, and AIX 7.2.5+ for now.
*/
UV_UDP_REUSEPORT = 64,
/*
* Indicates that recvmmsg should be used, if available.
*/
Expand Down
45 changes: 45 additions & 0 deletions src/unix/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -1910,3 +1910,48 @@ unsigned int uv_available_parallelism(void) {
return (unsigned) rc;
#endif /* __linux__ */
}

/* Turn on the load-balancing flavor of SO_REUSEPORT for socket `fd`.
 *
 * Returns 0 when the option was set, the result of UV__ERR(errno) when
 * setsockopt() fails, and UV_ENOTSUP on platforms that offer no
 * load-balancing variant of the option.
 */
int uv__sock_reuseport(int fd) {
#if defined(__FreeBSD__) && __FreeBSD__ >= 12 && defined(SO_REUSEPORT_LB)
  /* FreeBSD 12 ships a dedicated option, SO_REUSEPORT_LB, that provides
   * load balancing. It plays the role that plain SO_REUSEPORT plays on
   * Linux and DragonFlyBSD. */
  int yes = 1;

  if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT_LB, &yes, sizeof(yes)) != 0)
    return UV__ERR(errno);

  return 0;
#elif (defined(__linux__) || \
       defined(_AIX73) || \
       (defined(__DragonFly__) && __DragonFly_version >= 300600) || \
       (defined(__sun) && defined(SO_FLOW_NAME))) && \
      defined(SO_REUSEPORT)
  /* Platforms whose plain SO_REUSEPORT balances load:
   *
   * - Linux 3.9+: connections are spread evenly over all threads (or
   *   processes) blocked in accept() on the same port, and datagrams are
   *   likewise spread evenly over all receiving threads (or processes).
   *
   * - DragonFlyBSD 3.6.0: SO_REUSEPORT was extended to distribute workload
   *   to the available sockets, making it equivalent to Linux's option.
   *
   * - AIX 7.2.5: SO_REUSEPORT gained the ability to distribute incoming
   *   connections or datagrams across all listening ports.
   *
   * - Solaris 11.4: earlier Solaris 11 releases only allowed binding to the
   *   same address and port, without load balancing. The 11.4 release cannot
   *   be detected through OS macros, so the presence of SO_FLOW_NAME, a
   *   socket option first introduced in 11.4, is used as the marker. */
  int yes = 1;

  if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &yes, sizeof(yes)) != 0)
    return UV__ERR(errno);

  return 0;
#else
  /* Everywhere else SO_REUSEPORT, if present, has completely different
   * semantics and no load balancing. Do not enable it; fail instead so the
   * caller learns that UV_[TCP/UDP]_REUSEPORT is unsupported here. */
  (void) fd;

  return UV_ENOTSUP;
#endif
}
1 change: 1 addition & 0 deletions src/unix/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ int uv__close(int fd); /* preserves errno */
int uv__close_nocheckstdio(int fd);
int uv__close_nocancel(int fd);
int uv__socket(int domain, int type, int protocol);
int uv__sock_reuseport(int fd);
ssize_t uv__recvmsg(int fd, struct msghdr *msg, int flags);
void uv__make_close_pending(uv_handle_t* handle);
int uv__getiovmax(void);
Expand Down
46 changes: 1 addition & 45 deletions src/unix/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -148,50 +148,6 @@ int uv_tcp_init(uv_loop_t* loop, uv_tcp_t* tcp) {
}


/* Request load-balanced port sharing for the TCP socket `fd`.
 *
 * Yields 0 on success, the result of UV__ERR(errno) if setsockopt() fails,
 * or UV_ENOTSUP when the platform has no load-balancing SO_REUSEPORT.
 */
static int uv__tcp_reuseport(int fd) {
  int rc = 0;
#if defined(__FreeBSD__) && __FreeBSD__ >= 12 && defined(SO_REUSEPORT_LB)
  /* FreeBSD 12 added SO_REUSEPORT_LB, a separate option with load
   * balancing that stands in for the SO_REUSEPORT of Linux and
   * DragonFlyBSD. */
  int flag = 1;

  if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT_LB, &flag, sizeof(flag)) != 0)
    rc = UV__ERR(errno);
#elif (defined(__linux__) || \
       defined(_AIX73) || \
       (defined(__DragonFly__) && __DragonFly_version >= 300600) || \
       (defined(__sun) && defined(SO_FLOW_NAME))) && \
      defined(SO_REUSEPORT)
  /* Linux 3.9+ spreads connections evenly over every thread (or process)
   * blocked in accept() on the same port.
   *
   * DragonFlyBSD 3.6.0 extended SO_REUSEPORT to distribute workload to the
   * available sockets, which makes it equivalent to the Linux option.
   *
   * AIX 7.2.5 added the ability to distribute incoming connections across
   * all listening ports for SO_REUSEPORT.
   *
   * Solaris 11 had SO_REUSEPORT only for binding to the same address and
   * port; load balancing arrived with Solaris 11.4. That release cannot be
   * detected through OS macros, so the socket option SO_FLOW_NAME, first
   * introduced in 11.4, is checked instead. */
  int flag = 1;

  if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &flag, sizeof(flag)) != 0)
    rc = UV__ERR(errno);
#else
  /* On every other platform SO_REUSEPORT lacks load balancing and means
   * something completely different, so it is left disabled and the call
   * fails to signal that UV_TCP_REUSEPORT is unsupported. */
  (void) fd;
  rc = UV_ENOTSUP;
#endif

  return rc;
}

int uv__tcp_bind(uv_tcp_t* tcp,
const struct sockaddr* addr,
unsigned int addrlen,
Expand All @@ -212,7 +168,7 @@ int uv__tcp_bind(uv_tcp_t* tcp,
return UV__ERR(errno);

if (flags & UV_TCP_REUSEPORT) {
err = uv__tcp_reuseport(tcp->io_watcher.fd);
err = uv__sock_reuseport(tcp->io_watcher.fd);
if (err)
return err;
}
Expand Down
30 changes: 20 additions & 10 deletions src/unix/udp.c
Original file line number Diff line number Diff line change
Expand Up @@ -434,17 +434,20 @@ static void uv__udp_sendmsg(uv_udp_t* handle) {
}

/* On the BSDs, SO_REUSEPORT implies SO_REUSEADDR but with some additional
* refinements for programs that use multicast.
* refinements for programs that use multicast. Therefore we preferentially
* set SO_REUSEPORT over SO_REUSEADDR here, but we set SO_REUSEPORT only
* when that socket option doesn't have the capability of load balancing.
* Otherwise, we fall back to SO_REUSEADDR.
*
* Linux as of 3.9 and DragonflyBSD 3.6 have the SO_REUSEPORT socket option but
* with semantics that are different from the BSDs: it _shares_ the port rather
* than steals it from the current listener. While useful, it's not something we
* can emulate on other platforms so we don't enable it.
* Linux as of 3.9, DragonflyBSD 3.6, AIX 7.2.5 have the SO_REUSEPORT socket
* option but with semantics that are different from the BSDs: it _shares_
* the port rather than steals it from the current listener. While useful,
* it's not something we can emulate on other platforms so we don't enable it.
*
* zOS does not support getsockname with SO_REUSEPORT option when using
* AF_UNIX.
*/
static int uv__set_reuse(int fd) {
static int uv__sock_reuseaddr(int fd) {
int yes;
yes = 1;

Expand All @@ -461,7 +464,7 @@ static int uv__set_reuse(int fd) {
return UV__ERR(errno);
}
#elif defined(SO_REUSEPORT) && !defined(__linux__) && !defined(__GNU__) && \
!defined(__sun__) && !defined(__DragonFly__)
!defined(__sun__) && !defined(__DragonFly__) && !defined(_AIX73)
if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &yes, sizeof(yes)))
return UV__ERR(errno);
#else
Expand Down Expand Up @@ -504,7 +507,8 @@ int uv__udp_bind(uv_udp_t* handle,
int fd;

/* Check for bad flags. */
if (flags & ~(UV_UDP_IPV6ONLY | UV_UDP_REUSEADDR | UV_UDP_LINUX_RECVERR))
if (flags & ~(UV_UDP_IPV6ONLY | UV_UDP_REUSEADDR |
UV_UDP_REUSEPORT | UV_UDP_LINUX_RECVERR))
return UV_EINVAL;

/* Cannot set IPv6-only mode on non-IPv6 socket. */
Expand All @@ -527,7 +531,13 @@ int uv__udp_bind(uv_udp_t* handle,
}

if (flags & UV_UDP_REUSEADDR) {
err = uv__set_reuse(fd);
err = uv__sock_reuseaddr(fd);
if (err)
return err;
}

if (flags & UV_UDP_REUSEPORT) {
err = uv__sock_reuseport(fd);
if (err)
return err;
}
Expand Down Expand Up @@ -1049,7 +1059,7 @@ int uv_udp_open(uv_udp_t* handle, uv_os_sock_t sock) {
if (err)
return err;

err = uv__set_reuse(sock);
err = uv__sock_reuseaddr(sock);
if (err)
return err;

Expand Down
6 changes: 6 additions & 0 deletions src/win/udp.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,12 @@ static int uv__udp_maybe_bind(uv_udp_t* handle,
if (handle->flags & UV_HANDLE_BOUND)
return 0;

/* There is no SO_REUSEPORT on Windows; Windows only knows SO_REUSEADDR,
* so we just return an error directly when UV_UDP_REUSEPORT is requested
* for binding the socket. */
if (flags & UV_UDP_REUSEPORT)
return ERROR_NOT_SUPPORTED;

if ((flags & UV_UDP_IPV6ONLY) && addr->sa_family != AF_INET6) {
/* UV_UDP_IPV6ONLY is supported only for IPV6 sockets */
return ERROR_INVALID_PARAMETER;
Expand Down
2 changes: 2 additions & 0 deletions test/test-list.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ TEST_DECLARE (udp_open_twice)
TEST_DECLARE (udp_open_bound)
TEST_DECLARE (udp_open_connect)
TEST_DECLARE (udp_recv_in_a_row)
TEST_DECLARE (udp_reuseport)
#ifndef _WIN32
TEST_DECLARE (udp_send_unix)
#endif
Expand Down Expand Up @@ -804,6 +805,7 @@ TASK_LIST_START
TEST_ENTRY (udp_sendmmsg_error)
TEST_ENTRY (udp_try_send)
TEST_ENTRY (udp_recv_in_a_row)
TEST_ENTRY (udp_reuseport)

TEST_ENTRY (udp_open)
TEST_ENTRY (udp_open_twice)
Expand Down
Loading

0 comments on commit ba24986

Please sign in to comment.