Skip to content

Commit

Permalink
Add antreaProxy.disableServiceHealthCheckServer config (#6939)
Browse files Browse the repository at this point in the history
Running both kube-proxy and Antrea Proxy with `proxyAll` can trigger
some error logs for Services of type LoadBalancer with
`externalTrafficPolicy` set to `Local`. For such Services, the proxy is
in charge of running a health check server on each Node, in order to
report the number of local Endpoints which implement the Service.
Because both kube-proxy and Antrea Proxy (with `proxyAll`) will try to
run the health check servers on the same ports, one of them will fail to
bind to the desired address. In practice, we typically observe that
Antrea Proxy tries to bind to the address first (and succeeds), while
kube-proxy fails and logs an error message, but there is no guarantee
that it will be the case.

To avoid kube-proxy error logs, users can now set
`antreaProxy.disableServiceHealthCheckServer` to true in the Antrea
Agent's config, which will instruct Antrea Proxy to stop running health
check servers for LoadBalancer Services. This is not a perfect solution
as ideally the component responsible for the proxy implementation
(Antrea Proxy in this case) should also be responsible for providing
health check information.

Signed-off-by: Antonin Bas <[email protected]>
  • Loading branch information
antoninbas authored Jan 23, 2025
1 parent 201395b commit 3a46081
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 23 deletions.
1 change: 1 addition & 0 deletions build/charts/antrea/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ Kubernetes: `>= 1.19.0-0`
| agent.updateStrategy | object | `{"type":"RollingUpdate"}` | Update strategy for the antrea-agent DaemonSet. |
| agentImage | object | `{"pullPolicy":"IfNotPresent","repository":"antrea/antrea-agent-ubuntu","tag":""}` | Container image to use for the antrea-agent component. |
| antreaProxy.defaultLoadBalancerMode | string | `"nat"` | Determines how external traffic is processed when it's load balanced across Nodes by default. It must be one of "nat" or "dsr". |
| antreaProxy.disableServiceHealthCheckServer | bool | `false` | Disables the health check server that Antrea Proxy runs when proxyAll is enabled, which provides health information about Services of type LoadBalancer with externalTrafficPolicy set to Local. This avoids race conditions between kube-proxy and Antrea Proxy, with both trying to bind to the same addresses, when proxyAll is enabled while kube-proxy has not been removed. |
| antreaProxy.enable | bool | `true` | To disable AntreaProxy, set this to false. |
| antreaProxy.nodePortAddresses | list | `[]` | String array of values which specifies the host IPv4/IPv6 addresses for NodePort. By default, all host addresses are used. |
| antreaProxy.proxyAll | bool | `false` | Proxy all Service traffic, for all Service types, regardless of where it comes from. |
Expand Down
5 changes: 5 additions & 0 deletions build/charts/antrea/conf/antrea-agent.conf
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,11 @@ antreaProxy:
# can reply to clients directly, bypassing the ingress Node.
# A Service's load balancer mode can be overridden by annotating it with `service.antrea.io/load-balancer-mode`.
defaultLoadBalancerMode: {{ .defaultLoadBalancerMode | quote }}
# Disables the health check server run by Antrea Proxy, which provides health information about
# Services of type LoadBalancer with externalTrafficPolicy set to Local, when proxyAll is
# enabled. This avoids race conditions between kube-proxy and Antrea proxy, with both trying to
# bind to the same addresses, when proxyAll is enabled while kube-proxy has not been removed.
disableServiceHealthCheckServer: {{ .disableServiceHealthCheckServer }}
{{- end }}

# IPsec tunnel related configurations.
Expand Down
6 changes: 6 additions & 0 deletions build/charts/antrea/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ antreaProxy:
# -- Determines how external traffic is processed when it's load balanced
# across Nodes by default. It must be one of "nat" or "dsr".
defaultLoadBalancerMode: "nat"
# -- Disables the health check server that Antrea Proxy runs when proxyAll is
# enabled, which provides health information about Services of type LoadBalancer
# with externalTrafficPolicy set to Local. This avoids race conditions between
# kube-proxy and Antrea Proxy, with both trying to bind to the same addresses,
# when proxyAll is enabled while kube-proxy has not been removed.
disableServiceHealthCheckServer: false

nodeIPAM:
# -- Enable Node IPAM in Antrea
Expand Down
9 changes: 7 additions & 2 deletions build/yamls/antrea-aks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4345,6 +4345,11 @@ data:
# can reply to clients directly, bypassing the ingress Node.
# A Service's load balancer mode can be overridden by annotating it with `service.antrea.io/load-balancer-mode`.
defaultLoadBalancerMode: "nat"
# Disables the health check server run by Antrea Proxy, which provides health information about
# Services of type LoadBalancer with externalTrafficPolicy set to Local, when proxyAll is
# enabled. This avoids race conditions between kube-proxy and Antrea proxy, with both trying to
# bind to the same addresses, when proxyAll is enabled while kube-proxy has not been removed.
disableServiceHealthCheckServer: false
# IPsec tunnel related configurations.
ipsec:
Expand Down Expand Up @@ -5421,7 +5426,7 @@ spec:
kubectl.kubernetes.io/default-container: antrea-agent
# Automatically restart Pods with a RollingUpdate if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: f7ac1903ae9edfd45361cb67b991cb23f708f15cb5cb862bffd70e95dcd776fb
checksum/config: e9ed628a60f731498979612c9d28080dc89b4f54b1dcbb5e86fce29df7c482f1
labels:
app: antrea
component: antrea-agent
Expand Down Expand Up @@ -5665,7 +5670,7 @@ spec:
annotations:
# Automatically restart Pod if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: f7ac1903ae9edfd45361cb67b991cb23f708f15cb5cb862bffd70e95dcd776fb
checksum/config: e9ed628a60f731498979612c9d28080dc89b4f54b1dcbb5e86fce29df7c482f1
labels:
app: antrea
component: antrea-controller
Expand Down
9 changes: 7 additions & 2 deletions build/yamls/antrea-eks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4345,6 +4345,11 @@ data:
# can reply to clients directly, bypassing the ingress Node.
# A Service's load balancer mode can be overridden by annotating it with `service.antrea.io/load-balancer-mode`.
defaultLoadBalancerMode: "nat"
# Disables the health check server run by Antrea Proxy, which provides health information about
# Services of type LoadBalancer with externalTrafficPolicy set to Local, when proxyAll is
# enabled. This avoids race conditions between kube-proxy and Antrea proxy, with both trying to
# bind to the same addresses, when proxyAll is enabled while kube-proxy has not been removed.
disableServiceHealthCheckServer: false
# IPsec tunnel related configurations.
ipsec:
Expand Down Expand Up @@ -5421,7 +5426,7 @@ spec:
kubectl.kubernetes.io/default-container: antrea-agent
# Automatically restart Pods with a RollingUpdate if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: f7ac1903ae9edfd45361cb67b991cb23f708f15cb5cb862bffd70e95dcd776fb
checksum/config: e9ed628a60f731498979612c9d28080dc89b4f54b1dcbb5e86fce29df7c482f1
labels:
app: antrea
component: antrea-agent
Expand Down Expand Up @@ -5666,7 +5671,7 @@ spec:
annotations:
# Automatically restart Pod if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: f7ac1903ae9edfd45361cb67b991cb23f708f15cb5cb862bffd70e95dcd776fb
checksum/config: e9ed628a60f731498979612c9d28080dc89b4f54b1dcbb5e86fce29df7c482f1
labels:
app: antrea
component: antrea-controller
Expand Down
9 changes: 7 additions & 2 deletions build/yamls/antrea-gke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4345,6 +4345,11 @@ data:
# can reply to clients directly, bypassing the ingress Node.
# A Service's load balancer mode can be overridden by annotating it with `service.antrea.io/load-balancer-mode`.
defaultLoadBalancerMode: "nat"
# Disables the health check server run by Antrea Proxy, which provides health information about
# Services of type LoadBalancer with externalTrafficPolicy set to Local, when proxyAll is
# enabled. This avoids race conditions between kube-proxy and Antrea proxy, with both trying to
# bind to the same addresses, when proxyAll is enabled while kube-proxy has not been removed.
disableServiceHealthCheckServer: false
# IPsec tunnel related configurations.
ipsec:
Expand Down Expand Up @@ -5421,7 +5426,7 @@ spec:
kubectl.kubernetes.io/default-container: antrea-agent
# Automatically restart Pods with a RollingUpdate if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: 00ba3a60f132691721ba2e84c5c8f0a9eddc32593b38798de8f59d52fff54169
checksum/config: adf1e0f238974d7f83bd321a403f1613ae7e695f06b5366cee645a39141872db
labels:
app: antrea
component: antrea-agent
Expand Down Expand Up @@ -5663,7 +5668,7 @@ spec:
annotations:
# Automatically restart Pod if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: 00ba3a60f132691721ba2e84c5c8f0a9eddc32593b38798de8f59d52fff54169
checksum/config: adf1e0f238974d7f83bd321a403f1613ae7e695f06b5366cee645a39141872db
labels:
app: antrea
component: antrea-controller
Expand Down
9 changes: 7 additions & 2 deletions build/yamls/antrea-ipsec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4358,6 +4358,11 @@ data:
# can reply to clients directly, bypassing the ingress Node.
# A Service's load balancer mode can be overridden by annotating it with `service.antrea.io/load-balancer-mode`.
defaultLoadBalancerMode: "nat"
# Disables the health check server run by Antrea Proxy, which provides health information about
# Services of type LoadBalancer with externalTrafficPolicy set to Local, when proxyAll is
# enabled. This avoids race conditions between kube-proxy and Antrea proxy, with both trying to
# bind to the same addresses, when proxyAll is enabled while kube-proxy has not been removed.
disableServiceHealthCheckServer: false
# IPsec tunnel related configurations.
ipsec:
Expand Down Expand Up @@ -5434,7 +5439,7 @@ spec:
kubectl.kubernetes.io/default-container: antrea-agent
# Automatically restart Pods with a RollingUpdate if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: 4b9bbfbbda1ab405ade14e797ea88fbd6f3795bb6aae9df0496409d542799145
checksum/config: 9b14e08a59181e975a2326f4ef4a7c55a1640027bda93ad0ee09fe2ef18b7491
checksum/ipsec-secret: d0eb9c52d0cd4311b6d252a951126bf9bea27ec05590bed8a394f0f792dcb2a4
labels:
app: antrea
Expand Down Expand Up @@ -5722,7 +5727,7 @@ spec:
annotations:
# Automatically restart Pod if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: 4b9bbfbbda1ab405ade14e797ea88fbd6f3795bb6aae9df0496409d542799145
checksum/config: 9b14e08a59181e975a2326f4ef4a7c55a1640027bda93ad0ee09fe2ef18b7491
labels:
app: antrea
component: antrea-controller
Expand Down
9 changes: 7 additions & 2 deletions build/yamls/antrea.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4345,6 +4345,11 @@ data:
# can reply to clients directly, bypassing the ingress Node.
# A Service's load balancer mode can be overridden by annotating it with `service.antrea.io/load-balancer-mode`.
defaultLoadBalancerMode: "nat"
# Disables the health check server run by Antrea Proxy, which provides health information about
# Services of type LoadBalancer with externalTrafficPolicy set to Local, when proxyAll is
# enabled. This avoids race conditions between kube-proxy and Antrea proxy, with both trying to
# bind to the same addresses, when proxyAll is enabled while kube-proxy has not been removed.
disableServiceHealthCheckServer: false
# IPsec tunnel related configurations.
ipsec:
Expand Down Expand Up @@ -5421,7 +5426,7 @@ spec:
kubectl.kubernetes.io/default-container: antrea-agent
# Automatically restart Pods with a RollingUpdate if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: e4e94ba89524d8fdc7eb3ad6e0f6948767f3d92ef767f17c47da348f08b5c2e0
checksum/config: afc566f7a719f6dd3ff30e3b495df2e4f5991e5a8d0696f891dc9c77ce795e2f
labels:
app: antrea
component: antrea-agent
Expand Down Expand Up @@ -5663,7 +5668,7 @@ spec:
annotations:
# Automatically restart Pod if the ConfigMap changes
# See https://helm.sh/docs/howto/charts_tips_and_tricks/#automatically-roll-deployments
checksum/config: e4e94ba89524d8fdc7eb3ad6e0f6948767f3d92ef767f17c47da348f08b5c2e0
checksum/config: afc566f7a719f6dd3ff30e3b495df2e4f5991e5a8d0696f891dc9c77ce795e2f
labels:
app: antrea
component: antrea-controller
Expand Down
26 changes: 26 additions & 0 deletions docs/antrea-proxy.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,32 @@ prioritizing the rules installed by Antrea Proxy over those installed by
kube-proxy, thus it works only with kube-proxy iptables mode. Support for other
kube-proxy modes may be added in the future.

Note that running both kube-proxy and Antrea Proxy with `proxyAll` can trigger
some error logs for Services of type LoadBalancer with `externalTrafficPolicy`
set to `Local`. For such Services, the proxy is in charge of running a health
check server on each Node, in order to report the number of local Endpoints
which implement the Service. The server port is determined by the value of
[`.spec.healthCheckNodePort`](https://kubernetes.io/docs/tasks/access-application-cluster/create-external-load-balancer/#preserving-the-client-source-ip).
Because both kube-proxy and Antrea Proxy (with `proxyAll`) will try to run the
health check servers on the same ports, one of them will fail to bind to the
desired address. In practice, we typically observe that Antrea Proxy tries to
bind to the address first (and succeeds), while kube-proxy fails and logs the
following error message:

```text
E0117 19:38:17.586328 1 service_health.go:145] "Failed to start healthcheck" err="listen tcp 0.0.0.0:31653: bind: address already in use" node="kind-worker" service="default/nginx" port=31653
```

These log messages will keep repeating periodically, as kube-proxy handles
Service updates. While the messages are harmless, they can create a lot of
unnecessary noise. You may want to set `antreaProxy.disableServiceHealthCheckServer: true`
in the `antrea-config` ConfigMap to avoid such logs. It will instruct Antrea Proxy
to stop running health check servers and shift this responsibility to
kube-proxy. This is not a perfect solution as ideally the component responsible
for the proxy implementation (Antrea Proxy) should also be responsible for
providing health check information. We still recommend removing kube-proxy
whenever possible.

### Removing kube-proxy

In this section, we will provide steps to run a K8s cluster without kube-proxy,
Expand Down
42 changes: 30 additions & 12 deletions pkg/agent/proxy/proxier.go
Original file line number Diff line number Diff line change
Expand Up @@ -1353,7 +1353,9 @@ func newProxier(
proxyLoadBalancerIPs bool,
defaultLoadBalancerMode agentconfig.LoadBalancerMode,
groupCounter types.GroupCounter,
supportNestedService bool) (*proxier, error) {
supportNestedService bool,
serviceHealthServerDisabled bool,
) (*proxier, error) {
recorder := record.NewBroadcaster().NewRecorder(
runtime.NewScheme(),
corev1.EventSource{Component: componentName, Host: hostname},
Expand Down Expand Up @@ -1381,11 +1383,15 @@ func newProxier(

var serviceHealthServer healthcheck.ServiceHealthServer
if proxyAllEnabled {
nodePortAddressesString := make([]string, len(nodePortAddresses))
for i, address := range nodePortAddresses {
nodePortAddressesString[i] = address.String()
if serviceHealthServerDisabled {
klog.V(2).InfoS("Service health check server will not be run")
} else {
nodePortAddressesString := make([]string, len(nodePortAddresses))
for i, address := range nodePortAddresses {
nodePortAddressesString[i] = address.String()
}
serviceHealthServer = healthcheck.NewServiceHealthServer(hostname, nil, nodePortAddressesString)
}
serviceHealthServer = healthcheck.NewServiceHealthServer(hostname, nil, nodePortAddressesString)
}

// TODO: The label selector nonHeadlessServiceSelector was added to pass the Kubernetes e2e test
Expand Down Expand Up @@ -1496,8 +1502,9 @@ func newDualStackProxier(
defaultLoadBalancerMode agentconfig.LoadBalancerMode,
v4groupCounter types.GroupCounter,
v6groupCounter types.GroupCounter,
nestedServiceSupport bool) (*metaProxierWrapper, error) {

nestedServiceSupport bool,
serviceHealthServerDisabled bool,
) (*metaProxierWrapper, error) {
// Create an IPv4 instance of the single-stack proxier.
ipv4Proxier, err := newProxier(hostname,
serviceProxyName,
Expand All @@ -1516,7 +1523,9 @@ func newDualStackProxier(
proxyLoadBalancerIPs,
defaultLoadBalancerMode,
v4groupCounter,
nestedServiceSupport)
nestedServiceSupport,
serviceHealthServerDisabled,
)
if err != nil {
return nil, fmt.Errorf("error when creating IPv4 proxier: %v", err)
}
Expand All @@ -1538,7 +1547,9 @@ func newDualStackProxier(
proxyLoadBalancerIPs,
defaultLoadBalancerMode,
v6groupCounter,
nestedServiceSupport)
nestedServiceSupport,
serviceHealthServerDisabled,
)
if err != nil {
return nil, fmt.Errorf("error when creating IPv6 proxier: %v", err)
}
Expand Down Expand Up @@ -1571,6 +1582,7 @@ func NewProxier(hostname string,
skipServices := proxyConfig.SkipServices
proxyLoadBalancerIPs := *proxyConfig.ProxyLoadBalancerIPs
serviceProxyName := proxyConfig.ServiceProxyName
serviceHealthServerDisabled := proxyConfig.DisableServiceHealthCheckServer

var proxier Proxier
var err error
Expand All @@ -1594,7 +1606,9 @@ func NewProxier(hostname string,
defaultLoadBalancerMode,
v4GroupCounter,
v6GroupCounter,
nestedServiceSupport)
nestedServiceSupport,
serviceHealthServerDisabled,
)
if err != nil {
return nil, fmt.Errorf("error when creating dual-stack proxier: %v", err)
}
Expand All @@ -1616,7 +1630,9 @@ func NewProxier(hostname string,
proxyLoadBalancerIPs,
defaultLoadBalancerMode,
v4GroupCounter,
nestedServiceSupport)
nestedServiceSupport,
serviceHealthServerDisabled,
)
if err != nil {
return nil, fmt.Errorf("error when creating IPv4 proxier: %v", err)
}
Expand All @@ -1638,7 +1654,9 @@ func NewProxier(hostname string,
proxyLoadBalancerIPs,
defaultLoadBalancerMode,
v6GroupCounter,
nestedServiceSupport)
nestedServiceSupport,
serviceHealthServerDisabled,
)
if err != nil {
return nil, fmt.Errorf("error when creating IPv6 proxier: %v", err)
}
Expand Down
25 changes: 24 additions & 1 deletion pkg/agent/proxy/proxier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,7 @@ type proxyOptions struct {
serviceProxyNameSet bool
cleanupStaleUDPSvcConntrack bool
defaultLoadBalancerMode agentconfig.LoadBalancerMode
serviceHealthServerDisabled bool
}

type proxyOptionsFn func(*proxyOptions)
Expand Down Expand Up @@ -452,6 +453,10 @@ func withCleanupStaleUDPSvcConntrack(o *proxyOptions) {
o.cleanupStaleUDPSvcConntrack = true
}

// withoutServiceHealthServer is a proxy option for tests: it sets
// serviceHealthServerDisabled to true on the given proxyOptions.
func withoutServiceHealthServer(o *proxyOptions) {
o.serviceHealthServerDisabled = true
}

func getMockClients(ctrl *gomock.Controller) (*ofmock.MockClient, *routemock.MockInterface) {
mockOFClient := ofmock.NewMockClient(ctrl)
mockRouteClient := routemock.NewMockInterface(ctrl)
Expand Down Expand Up @@ -495,7 +500,10 @@ func newFakeProxier(routeClient route.Interface, ofClient openflow.Client, nodeP
[]string{skippedServiceNN, skippedClusterIP},
o.proxyLoadBalancerIPs,
o.defaultLoadBalancerMode,
types.NewGroupCounter(groupIDAllocator, make(chan string, 100)), o.supportNestedService)
types.NewGroupCounter(groupIDAllocator, make(chan string, 100)),
o.supportNestedService,
o.serviceHealthServerDisabled,
)
p.runner = k8sproxy.NewBoundedFrequencyRunner(componentName, p.syncProxyRules, time.Second, 30*time.Second, 2)
p.endpointsChanges = newEndpointsChangesTracker(hostname, o.endpointSliceEnabled, isIPv6)
p.cleanupStaleUDPSvcConntrack = o.cleanupStaleUDPSvcConntrack
Expand Down Expand Up @@ -3893,3 +3901,18 @@ func TestServiceLabelSelector(t *testing.T) {
assert.Contains(t, fp.serviceInstalledMap, svcPortName1)
})
}

// TestServiceHealthServer checks whether the proxier returned by newFakeProxier
// has its serviceHealthServer field set, depending on the options passed in:
// it is expected to be non-nil only when withProxyAll is used without
// withoutServiceHealthServer.
func TestServiceHealthServer(t *testing.T) {
// No options: serviceHealthServer is expected to be nil.
t.Run("proxyAll disabled", func(t *testing.T) {
fp := newFakeProxier(nil, nil, nil, nil, false)
assert.Nil(t, fp.serviceHealthServer)
})
// withProxyAll only: serviceHealthServer is expected to be set.
t.Run("enabled", func(t *testing.T) {
fp := newFakeProxier(nil, nil, nil, nil, false, withProxyAll)
assert.NotNil(t, fp.serviceHealthServer)
})
// withProxyAll plus withoutServiceHealthServer: the explicit opt-out wins
// and serviceHealthServer is expected to be nil.
t.Run("force disabled", func(t *testing.T) {
fp := newFakeProxier(nil, nil, nil, nil, false, withProxyAll, withoutServiceHealthServer)
assert.Nil(t, fp.serviceHealthServer)
})
}
5 changes: 5 additions & 0 deletions pkg/config/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,11 @@ type AntreaProxyConfig struct {
// can reply to clients directly, bypassing the ingress Node.
// A Service's load balancer mode can be overridden by annotating it with `service.antrea.io/load-balancer-mode`.
DefaultLoadBalancerMode string `yaml:"defaultLoadBalancerMode,omitempty"`
// Disables the health check server that Antrea Proxy runs when proxyAll is enabled, which provides health
// information about Services of type LoadBalancer with externalTrafficPolicy set to Local. This avoids race
// conditions between kube-proxy and Antrea Proxy, with both trying to bind to the same addresses, when proxyAll
// is enabled while kube-proxy has not been removed.
DisableServiceHealthCheckServer bool `yaml:"disableServiceHealthCheckServer,omitempty"`
}

type WireGuardConfig struct {
Expand Down

0 comments on commit 3a46081

Please sign in to comment.