diff --git a/README.md b/README.md index 0a0a212..921b937 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # Network Node Manager -network-node-manager is the kubernetes controller that solves External-IP (Load Balancer IP) issue with IPVS proxy mode. IPVS proxy mode has various problems, and one of them is that the External-IP assigned through the LoadBalancer type service with externalTrafficPolicy=Local option cannot access inside the cluster. More details on this issue can be found at [here](https://github.com/kubernetes/kubernetes/issues/75262). network-node-manager solves this issue. network-node-manager is based on [kubebuilder](https://github.com/kubernetes-sigs/kubebuilder). +network-node-manager is a kubernetes controller that controls the network configuration of a node to resolve network issues of kubernetes. By simply deploying and configuring network-node-manager, you can solve kubernetes network issues that cannot be resolved by kubernetes or resolved by the higher kubernetes Version. Below is a list of kubernetes's issues to be resolved by network-node-manager. network-node-manager is based on [kubebuilder](https://github.com/kubernetes-sigs/kubebuilder). + +* [External-IP access issue with IPVS proxy mode](issues/external_IP_access_issue_IPVS_proxy_mode.md) +* [Connection reset issue between pod and out of cluster](issues/connection_reset_issue_pod_out_cluster.md) ## Deploy @@ -9,55 +12,56 @@ network-node-manager now supports below CPU architectures. * amd64 * arm64 -Deploy network-node-managers through below command. +Deploy network-node-managers through below command according to kube-proxy mode. ``` -kubectl apply -f https://raw.githubusercontent.com/kakao/network-node-manager/master/deploy/network-node-manager.yml +iptables proxy mode : kubectl apply -f https://raw.githubusercontent.com/kakao/network-node-manager/master/deploy/network-node-manager_iptables.yml +IPVS proxy mode : kubectl apply -f https://raw.githubusercontent.com/kakao/network-node-manager/master/deploy/network-node-manager_ipvs.yml ``` ## Configuration -### IPv6 +### Network Stack (IPv4, IPv6) -network-node-manager supports IPv6. However, IPv6 is not enabled by default. To use IPv6, set "NET_STACK" environment in the DaemonSet manifests of network-node-manager as follows. +* Default : "ipv4" +* iptables proxy mode manifest : "ipv4" +* IPVS proxy mode manifest : "ipv4" ``` -... -env: -- name: NET_STACK - value: ipv4,ipv6 -- name: NODE_NAME - valueFrom: -... +IPv4 : kubectl -n kube-system set env daemonset/network-node-manager NET_STACK=ipv4 +IPv6 : kubectl -n kube-system set env daemonset/network-node-manager NET_STACK=ipv6 +IPv4,IPv6 : kubectl -n kube-system set env daemonset/network-node-manager NET_STACK=ipv4,ipv6 ``` -## How it works? +### External-IP to Cluster-IP DNAT Rule -network-node-manager works on all worker nodes and adds the DNAT rules that converts destination IP of a packet from External-IP to Cluster-IP to iptables. network-node-manager adds two DNAT rules for each LoadBalancer type service. One is added to the prerouting chain and the other is added to the output chain. The DNAT rule in the prerouting chain is for the pod that uses pod-only network namespace. On the other hand, The DNAT rule in the output chain is for the pod that uses host network namespace. All DNAT rules only target packets from pods on the host. Below is an example. +* Related issue : [External-IP access issue with IPVS proxy mode](issues/external_IP_access_issue_IPVS_proxy_mode.md) +* Default : false +* iptables proxy mode manifest : false +* IPVS proxy mode manifest : true ``` -$ kubectl -n default get service -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -lb-service-1 LoadBalancer 10.231.42.164 10.19.20.201 80:31751/TCP,443:30126/TCP 16d -lb-service-2 LoadBalancer 10.231.2.62 10.19.22.57 80:32352/TCP,443:31549/TCP 16d - -$ iptables -nvL -t nat -... -Chain NMANAGER_IPVS_LB_OUTPUT (1 references) - pkts bytes target prot opt in out source destination - 0 0 KUBE-MARK-MASQ all -- * * 0.0.0.0/0 10.19.20.201 /* default/lb-service-1 */ ADDRTYPE match src-type LOCAL - 0 0 DNAT all -- * * 0.0.0.0/0 10.19.20.201 /* default/lb-service-1 */ ADDRTYPE match src-type LOCAL to:10.231.42.164 - 2 120 KUBE-MARK-MASQ all -- * * 0.0.0.0/0 10.19.22.57 /* default/lb-service-2 */ ADDRTYPE match src-type LOCAL - 2 120 DNAT all -- * * 0.0.0.0/0 10.19.22.57 /* default/lb-service-2 */ ADDRTYPE match src-type LOCAL to:10.231.2.62 - -Chain NMANAGER_IPVS_LB_PREROUTING (1 references) - pkts bytes target prot opt in out source destination - 0 0 KUBE-MARK-MASQ all -- * * 10.240.2.128/25 10.19.20.201 /* default/lb-service-1 */ - 0 0 DNAT all -- * * 10.240.2.128/25 10.19.20.201 /* default/lb-service-1 */ to:10.231.42.164 - 1 60 KUBE-MARK-MASQ all -- * * 10.240.2.128/25 10.19.22.57 /* default/lb-service-2 */ - 1 60 DNAT all -- * * 10.240.2.128/25 10.19.22.57 /* default/lb-service-2 */ to:10.231.2.62 -... +On : kubectl -n kube-system set env daemonset/network-node-manager RULE_EXTERNAL_CLUSTER=true +Off : kubectl -n kube-system set env daemonset/network-node-manager RULE_EXTERNAL_CLUSTER=false +``` + +### Drop Invalid Packet Rule in INPUT chain + +* Related issue : [Connection reset issue between pod and out of cluster](issues/connection_reset_issue_pod_out_cluster.md) +* Default : true +* iptables proxy mode manifest : true +* IPVS proxy mode manifest : true + ``` +On : kubectl -n kube-system set env daemonset/network-node-manager RULE_DROP_INVALID_INPUT=true +Off : kubectl -n kube-system set env daemonset/network-node-manager RULE_DROP_INVALID_INPUT=false +``` + +## How it works? + +![kpexec Architecture](img/network-node-manager_Architecture.PNG) + +network-node-manager runs on all kubernetes cluster nodes in host network namespace with network privileges and manage the node network configuration. network-node-manager watches the kubernetes object through kubenetes API server like a general kubernetes controller and manage the node network configuration. Now network-node-manager only watches service object. ## License diff --git a/controllers/rule_common.go b/controllers/rule_common.go new file mode 100644 index 0000000..a868815 --- /dev/null +++ b/controllers/rule_common.go @@ -0,0 +1,116 @@ +package controllers + +import ( + "github.com/go-logr/logr" + + "github.com/kakao/network-node-manager/pkg/configs" + "github.com/kakao/network-node-manager/pkg/iptables" +) + +// Constants +const ( + ChainFilterInput = "INPUT" + ChainNATPrerouting = "PREROUTING" + ChainNATOutput = "OUTPUT" + + ChainNATKubeMarkMasq = "KUBE-MARK-MASQ" + + ChainFilterBaseInput = "NMANAGER_INPUT" + ChainNATBasePrerouting = "NMANAGER_PREROUTING" + ChainNATBaseOutput = "NMANAGER_OUTPUT" + + ChainFilterDropInvalidInput = "NMANAGER_DROP_INVALID_INPUT" + ChainNATExternalClusterPrerouting = "NMANAGER_EX_CLUS_PREROUTING" + ChainNATExternalClusterOutput = "NMANAGER_EX_CLUS_OUTPUT" +) + +func initBaseChains(logger logr.Logger) error { + // Get network stack config + configIPv4Enabled, configIPv6Enabled, err := configs.GetConfigNetStack() + if err != nil { + return err + } + + // IPv4 + if configIPv4Enabled { + // Create base chain in tables + out, err := iptables.CreateChainIPv4(iptables.TableFilter, ChainFilterBaseInput) + if err != nil { + logger.Error(err, out) + return err + } + out, err = iptables.CreateChainIPv4(iptables.TableNAT, ChainNATBasePrerouting) + if err != nil { + logger.Error(err, out) + return err + } + out, err = iptables.CreateChainIPv4(iptables.TableNAT, ChainNATBaseOutput) + if err != nil { + logger.Error(err, out) + return err + } + + // Create jump rule to each chain in tables + ruleJumpFilterInput := []string{"-j", ChainFilterBaseInput} + out, err = iptables.CreateRuleFirstIPv4(iptables.TableFilter, ChainFilterInput, "", ruleJumpFilterInput...) + if err != nil { + logger.Error(err, out) + return err + } + ruleJumpNATPre := []string{"-j", ChainNATBasePrerouting} + out, err = iptables.CreateRuleFirstIPv4(iptables.TableNAT, ChainNATPrerouting, "", ruleJumpNATPre...) + if err != nil { + logger.Error(err, out) + return err + } + ruleJumpNATOut := []string{"-j", ChainNATBaseOutput} + out, err = iptables.CreateRuleFirstIPv4(iptables.TableNAT, ChainNATOutput, "", ruleJumpNATOut...) + if err != nil { + logger.Error(err, out) + return err + } + + } + + // IPv6 + if configIPv6Enabled { + // Create base chain in nat table + out, err := iptables.CreateChainIPv6(iptables.TableFilter, ChainFilterBaseInput) + if err != nil { + logger.Error(err, out) + return err + } + out, err = iptables.CreateChainIPv6(iptables.TableNAT, ChainNATBasePrerouting) + if err != nil { + logger.Error(err, out) + return err + } + out, err = iptables.CreateChainIPv6(iptables.TableNAT, ChainNATBaseOutput) + if err != nil { + logger.Error(err, out) + return err + } + + // Create jump rule to each chain in nat table + ruleJumpFilterInput := []string{"-j", ChainFilterBaseInput} + out, err = iptables.CreateRuleFirstIPv6(iptables.TableFilter, ChainFilterInput, "", ruleJumpFilterInput...) + if err != nil { + logger.Error(err, out) + return err + } + ruleJumpNATPre := []string{"-j", ChainNATBasePrerouting} + out, err = iptables.CreateRuleFirstIPv6(iptables.TableNAT, ChainNATPrerouting, "", ruleJumpNATPre...) + if err != nil { + logger.Error(err, out) + return err + } + ruleJumpNATOut := []string{"-j", ChainNATBaseOutput} + out, err = iptables.CreateRuleFirstIPv6(iptables.TableNAT, ChainNATOutput, "", ruleJumpNATOut...) + if err != nil { + logger.Error(err, out) + return err + } + } + + return nil +} diff --git a/controllers/rule_drop_invalid_input.go b/controllers/rule_drop_invalid_input.go new file mode 100644 index 0000000..d2d4b21 --- /dev/null +++ b/controllers/rule_drop_invalid_input.go @@ -0,0 +1,108 @@ +package controllers + +import ( + "github.com/go-logr/logr" + + "github.com/kakao/network-node-manager/pkg/iptables" +) + +func createRulesDropInvalidInput(logger logr.Logger) error { + if err := initBaseChains(logger); err != nil { + logger.Error(err, "failed to init base chain for externalIP to clusterIP Rules") + return err + } + + // IPv4 + if configIPv4Enabled { + // Create chain + out, err := iptables.CreateChainIPv4(iptables.TableFilter, ChainFilterDropInvalidInput) + if err != nil { + logger.Error(err, out) + return err + } + + // Set drop rule + ruleDrop := []string{"-m", "conntrack", "--ctstate", "INVALID", "-j", "DROP"} + out, err = iptables.CreateRuleFirstIPv4(iptables.TableFilter, ChainFilterDropInvalidInput, "", ruleDrop...) + if err != nil { + logger.Error(err, out) + return err + } + + // Set jump rule + ruleJump := []string{"-j", ChainFilterDropInvalidInput} + out, err = iptables.CreateRuleFirstIPv4(iptables.TableFilter, ChainFilterBaseInput, "", ruleJump...) + if err != nil { + logger.Error(err, out) + return err + } + } + + // IPv6 + if configIPv6Enabled { + // Create chain + out, err := iptables.CreateChainIPv6(iptables.TableFilter, ChainFilterDropInvalidInput) + if err != nil { + logger.Error(err, out) + return err + } + + // Set drop rule + ruleDrop := []string{"-m", "conntrack", "--ctstate", "INVALID", "-j", "DROP"} + out, err = iptables.CreateRuleFirstIPv6(iptables.TableFilter, ChainFilterDropInvalidInput, "", ruleDrop...) + if err != nil { + logger.Error(err, out) + return err + } + + // Set jump rule + ruleJump := []string{"-j", ChainFilterDropInvalidInput} + out, err = iptables.CreateRuleFirstIPv6(iptables.TableFilter, ChainFilterBaseInput, "", ruleJump...) + if err != nil { + logger.Error(err, out) + return err + } + } + + return nil +} + +func deleteRulesDropInvalidInput(logger logr.Logger) error { + // IPv4 + if configIPv4Enabled { + // Delete jump rule + ruleJump := []string{"-j", ChainFilterDropInvalidInput} + out, err := iptables.DeleteRuleIPv4(iptables.TableFilter, ChainFilterBaseInput, "", ruleJump...) + if err != nil { + logger.Error(err, out) + return err + } + + // Delete chain + out, err = iptables.DeleteChainIPv4(iptables.TableFilter, ChainFilterDropInvalidInput) + if err != nil { + logger.Error(err, out) + return err + } + } + + // IPv6 + if configIPv6Enabled { + // Delete jump rule + ruleJump := []string{"-j", ChainFilterDropInvalidInput} + out, err := iptables.DeleteRuleIPv6(iptables.TableFilter, ChainFilterBaseInput, "", ruleJump...) + if err != nil { + logger.Error(err, out) + return err + } + + // Delete chain + out, err = iptables.DeleteChainIPv6(iptables.TableFilter, ChainFilterDropInvalidInput) + if err != nil { + logger.Error(err, out) + return err + } + } + + return nil +} diff --git a/controllers/rule_external_cluster.go b/controllers/rule_external_cluster.go new file mode 100644 index 0000000..2c367f8 --- /dev/null +++ b/controllers/rule_external_cluster.go @@ -0,0 +1,465 @@ +package controllers + +import ( + "errors" + "strings" + + corev1 "k8s.io/api/core/v1" + ctrl "sigs.k8s.io/controller-runtime" + + "github.com/go-logr/logr" + "github.com/kakao/network-node-manager/pkg/ip" + "github.com/kakao/network-node-manager/pkg/iptables" +) + +func initRulesExternalCluster(logger logr.Logger) error { + if err := initBaseChains(logger); err != nil { + logger.Error(err, "failed to init base chain for externalIP to clusterIP Rules") + return err + } + + // IPv4 + if configIPv4Enabled { + // Create chain in nat table + out, err := iptables.CreateChainIPv4(iptables.TableNAT, ChainNATExternalClusterPrerouting) + if err != nil { + logger.Error(err, out) + return err + } + out, err = iptables.CreateChainIPv4(iptables.TableNAT, ChainNATExternalClusterOutput) + if err != nil { + logger.Error(err, out) + return err + } + + // Set jump rule to each chain in nat table + ruleJumpPre := []string{"-j", ChainNATExternalClusterPrerouting} + out, err = iptables.CreateRuleFirstIPv4(iptables.TableNAT, ChainNATBasePrerouting, "", ruleJumpPre...) + if err != nil { + logger.Error(err, out) + return err + } + ruleJumpOut := []string{"-j", ChainNATExternalClusterOutput} + out, err = iptables.CreateRuleFirstIPv4(iptables.TableNAT, ChainNATBaseOutput, "", ruleJumpOut...) + if err != nil { + logger.Error(err, out) + return err + } + } + // IPv6 + if configIPv6Enabled { + // Create chain in nat table + out, err := iptables.CreateChainIPv6(iptables.TableNAT, ChainNATExternalClusterPrerouting) + if err != nil { + logger.Error(err, out) + return err + } + out, err = iptables.CreateChainIPv6(iptables.TableNAT, ChainNATExternalClusterOutput) + if err != nil { + logger.Error(err, out) + return err + } + + // Set jump rule to each chain in nat table + ruleJumpPre := []string{"-j", ChainNATExternalClusterPrerouting} + out, err = iptables.CreateRuleFirstIPv6(iptables.TableNAT, ChainNATBasePrerouting, "", ruleJumpPre...) + if err != nil { + logger.Error(err, out) + return err + } + ruleJumpOut := []string{"-j", ChainNATExternalClusterOutput} + out, err = iptables.CreateRuleFirstIPv6(iptables.TableNAT, ChainNATBaseOutput, "", ruleJumpOut...) + if err != nil { + logger.Error(err, out) + return err + } + } + + return nil +} + +func destoryRulesExternalCluster(logger logr.Logger) error { + // IPv4 + if configIPv4Enabled { + // Delete jump rule to each chain in nat table + ruleJumpPre := []string{"-j", ChainNATExternalClusterPrerouting} + out, err := iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATBasePrerouting, "", ruleJumpPre...) + if err != nil { + logger.Error(err, out) + return err + } + ruleJumpOut := []string{"-j", ChainNATExternalClusterOutput} + out, err = iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATBaseOutput, "", ruleJumpOut...) + if err != nil { + logger.Error(err, out) + return err + } + + // Delete chain in nat table + out, err = iptables.DeleteChainIPv4(iptables.TableNAT, ChainNATExternalClusterPrerouting) + if err != nil { + logger.Error(err, out) + return err + } + out, err = iptables.DeleteChainIPv4(iptables.TableNAT, ChainNATExternalClusterOutput) + if err != nil { + logger.Error(err, out) + return err + } + } + // IPv6 + if configIPv6Enabled { + // Delete jump rule to each chain in nat table + ruleJumpPre := []string{"-j", ChainNATExternalClusterPrerouting} + out, err := iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATBasePrerouting, "", ruleJumpPre...) + if err != nil { + logger.Error(err, out) + return err + } + ruleJumpOut := []string{"-j", ChainNATExternalClusterOutput} + out, err = iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATBaseOutput, "", ruleJumpOut...) + if err != nil { + logger.Error(err, out) + return err + } + + // Delete chain in nat table + out, err = iptables.DeleteChainIPv6(iptables.TableNAT, ChainNATExternalClusterPrerouting) + if err != nil { + logger.Error(err, out) + return err + } + out, err = iptables.DeleteChainIPv6(iptables.TableNAT, ChainNATExternalClusterOutput) + if err != nil { + logger.Error(err, out) + return err + } + } + + return nil +} + +func cleanupRulesExternalCluster(logger logr.Logger, svcs *corev1.ServiceList, podCIDRIPv4, podCIDRIPv6 string) error { + // IPv4 + if configIPv4Enabled { + // Make up service map + svcMap := make(map[string]*corev1.Service) + for _, svc := range svcs.Items { + if ip.IsIPv4Addr(svc.Spec.ClusterIP) && svc.Spec.Type == corev1.ServiceTypeLoadBalancer { + svcMap[svc.Namespace+"/"+svc.Name] = svc.DeepCopy() + } + } + + // Cleanup prerouting chain + preRules, err := iptables.GetRulesIPv4(iptables.TableNAT, ChainNATExternalClusterPrerouting) + if err != nil { + return err + } + for _, rule := range preRules { + // Get service info from rule and k8s, and delete iptables rules + nsName, src, dest, jump, dnatDest := getSvcInfoFromRule(rule) + svc, ok := svcMap[nsName] + if !ok { + logger.WithValues("rule", rule).Info("there is no service info in k8s. cleanup prerouting chain IPv4 rule") + out, err := iptables.DeleteRuleRawIPv4(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) + if err != nil { + logger.Error(err, out) + return err + } + continue + } + + // Compare service info and delete iptables rules + for _, ingress := range svc.Status.LoadBalancer.Ingress { + externalIP := ingress.IP + "/32" + if (jump == ChainNATKubeMarkMasq && (src == podCIDRIPv4 && dest == externalIP)) || + (jump == "DNAT" && (src == podCIDRIPv4 && dest == externalIP && dnatDest == svc.Spec.ClusterIP)) { + continue + } + logger.WithValues("rule", rule).Info("service info is diff. cleanup prerouting chain IPv4 rule") + out, err := iptables.DeleteRuleRawIPv4(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) + if err != nil { + logger.Error(err, out) + return err + } + } + } + + // Cleanup output chain + outRules, err := iptables.GetRulesIPv4(iptables.TableNAT, ChainNATExternalClusterOutput) + if err != nil { + return err + } + for _, rule := range outRules { + // Get service info from rule and k8s, and delete iptables rules + nsName, _, dest, jump, dnatDest := getSvcInfoFromRule(rule) + svc, ok := svcMap[nsName] + if !ok { + logger.WithValues("rule", rule).Info("there is no service info in k8s. cleanup output chain IPv4 rule") + out, err := iptables.DeleteRuleRawIPv4(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) + if err != nil { + logger.Error(err, out) + return err + } + continue + } + + // Compare service info and delete diff iptables rules + for _, ingress := range svc.Status.LoadBalancer.Ingress { + externalIP := ingress.IP + "/32" + if (jump == ChainNATKubeMarkMasq && dest == externalIP) || + (jump == "DNAT" && (dest == externalIP && dnatDest == svc.Spec.ClusterIP)) { + continue + } + logger.WithValues("rule", rule).Info("service info is diff. cleanup output chain IPv4 rule") + out, err := iptables.DeleteRuleRawIPv4(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) + if err != nil { + logger.Error(err, out) + return err + } + } + } + } + // IPv6 + if configIPv6Enabled { + // Make up service map + svcMap := make(map[string]*corev1.Service) + for _, svc := range svcs.Items { + if ip.IsIPv6Addr(svc.Spec.ClusterIP) && svc.Spec.Type == corev1.ServiceTypeLoadBalancer { + svcMap[svc.Namespace+"/"+svc.Name] = svc.DeepCopy() + } + } + + // Cleanup prerouting chain + preRules, err := iptables.GetRulesIPv6(iptables.TableNAT, ChainNATExternalClusterPrerouting) + if err != nil { + return err + } + for _, rule := range preRules { + // Get service info from rule and k8s, and delete iptables rules + nsName, src, dest, jump, dnatDest := getSvcInfoFromRule(rule) + svc, ok := svcMap[nsName] + if !ok { + logger.WithValues("rule", rule).Info("there is no service info in k8s. cleanup prerouting chain IPv6 rule") + out, err := iptables.DeleteRuleRawIPv6(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) + if err != nil { + logger.Error(err, out) + return err + } + continue + } + + // Compare service info and delete iptables rules + for _, ingress := range svc.Status.LoadBalancer.Ingress { + externalIP := ingress.IP + "/128" + if (jump == ChainNATKubeMarkMasq && (src == podCIDRIPv6 && dest == externalIP)) || + (jump == "DNAT" && (src == podCIDRIPv6 && dest == externalIP && dnatDest == svc.Spec.ClusterIP)) { + continue + } + logger.WithValues("rule", rule).Info("service info is diff. cleanup prerouting chain IPv6 rule") + out, err := iptables.DeleteRuleRawIPv6(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) + if err != nil { + logger.Error(err, out) + return err + } + } + } + + // Cleanup output + outRules, err := iptables.GetRulesIPv6(iptables.TableNAT, ChainNATExternalClusterOutput) + if err != nil { + return err + } + for _, rule := range outRules { + // Get service info from rule and k8s, and delete iptables rules + nsName, _, dest, jump, dnatDest := getSvcInfoFromRule(rule) + svc, ok := svcMap[nsName] + if !ok { + logger.WithValues("rule", rule).Info("there is no service info in k8s. cleanup output chain IPv6 rule") + out, err := iptables.DeleteRuleRawIPv6(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) + if err != nil { + logger.Error(err, out) + return err + } + continue + } + + // Compare service info and delete diff iptables rules + for _, ingress := range svc.Status.LoadBalancer.Ingress { + externalIP := ingress.IP + "/128" + if (jump == ChainNATKubeMarkMasq && dest == externalIP) || + (jump == "DNAT" && (dest == externalIP && dnatDest == svc.Spec.ClusterIP)) { + continue + } + logger.WithValues("rule", rule).Info("service info is diff. cleanup output chain IPv6 rule") + out, err := iptables.DeleteRuleRawIPv6(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) + if err != nil { + logger.Error(err, out) + return err + } + } + } + } + + return nil +} + +func createRulesExternalCluster(logger logr.Logger, req *ctrl.Request, clusterIP, externalIP, podCIDRIPv4, podCIDRIPv6 string) error { + // Don't use spec.ipFamily to distingush between IPv4 and IPv6 Address + // for kubernetes version that dosen't support IPv6 dualstack + if configIPv4Enabled && ip.IsIPv4Addr(clusterIP) { + // IPv4 + // Set prerouting + rulePreMasq := []string{"-s", podCIDRIPv4, "-d", externalIP, "-j", ChainNATKubeMarkMasq} + out, err := iptables.CreateRuleLastIPv4(iptables.TableNAT, ChainNATExternalClusterPrerouting, req.String(), rulePreMasq...) + if err != nil { + logger.Error(err, out) + return err + } + rulePreDNAT := []string{"-s", podCIDRIPv4, "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} + out, err = iptables.CreateRuleLastIPv4(iptables.TableNAT, ChainNATExternalClusterPrerouting, req.String(), rulePreDNAT...) + if err != nil { + logger.Error(err, out) + return err + } + + // Set output + ruleOutMasq := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", ChainNATKubeMarkMasq} + out, err = iptables.CreateRuleLastIPv4(iptables.TableNAT, ChainNATExternalClusterOutput, req.String(), ruleOutMasq...) + if err != nil { + logger.Error(err, out) + return err + } + ruleOutDNAT := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} + out, err = iptables.CreateRuleLastIPv4(iptables.TableNAT, ChainNATExternalClusterOutput, req.String(), ruleOutDNAT...) + if err != nil { + logger.Error(err, out) + return err + } + } else if configIPv6Enabled && ip.IsIPv6Addr(clusterIP) { + // IPv6 + // Set prerouting + rulePreMasq := []string{"-s", podCIDRIPv6, "-d", externalIP, "-j", ChainNATKubeMarkMasq} + out, err := iptables.CreateRuleLastIPv6(iptables.TableNAT, ChainNATExternalClusterPrerouting, req.String(), rulePreMasq...) + if err != nil { + logger.Error(err, out) + return err + } + rulePreDNAT := []string{"-s", podCIDRIPv6, "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} + out, err = iptables.CreateRuleLastIPv6(iptables.TableNAT, ChainNATExternalClusterPrerouting, req.String(), rulePreDNAT...) + if err != nil { + logger.Error(err, out) + return err + } + + // Set output + ruleOutMasq := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", ChainNATKubeMarkMasq} + out, err = iptables.CreateRuleLastIPv6(iptables.TableNAT, ChainNATExternalClusterOutput, req.String(), ruleOutMasq...) + if err != nil { + logger.Error(err, out) + return err + } + ruleOutDNAT := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} + out, err = iptables.CreateRuleLastIPv6(iptables.TableNAT, ChainNATExternalClusterOutput, req.String(), ruleOutDNAT...) + if err != nil { + logger.Error(err, out) + return err + } + } else { + if ip.IsVaildIP(clusterIP) { + logger.WithValues("clusterIP", clusterIP).Error(errors.New("invalid IP"), "invaild IP") + } + } + + return nil +} + +func deleteRulesExternalCluster(logger logr.Logger, req *ctrl.Request, clusterIP, externalIP, podCIDRIPv4, podCIDRIPv6 string) error { + // Don't use spec.ipFamily to distingush between IPv4 and IPv6 Address + // for kubernetes version that dosen't support IPv6 dualstack + if configIPv4Enabled && ip.IsIPv4Addr(clusterIP) { + // IPv4 + // Unset prerouting + rulePreMasq := []string{"-s", podCIDRIPv4, "-d", externalIP, "-j", ChainNATKubeMarkMasq} + out, err := iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATExternalClusterPrerouting, req.String(), rulePreMasq...) + if err != nil { + logger.Error(err, out) + return err + } + rulePreDNAT := []string{"-s", podCIDRIPv4, "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} + out, err = iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATExternalClusterPrerouting, req.String(), rulePreDNAT...) + if err != nil { + logger.Error(err, out) + return err + } + + // Unset output + ruleOutMasq := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", ChainNATKubeMarkMasq} + out, err = iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATExternalClusterOutput, req.String(), ruleOutMasq...) + if err != nil { + logger.Error(err, out) + return err + } + ruleOutDNAT := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} + out, err = iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATExternalClusterOutput, req.String(), ruleOutDNAT...) + if err != nil { + logger.Error(err, out) + return err + } + } else if configIPv6Enabled && ip.IsIPv6Addr(clusterIP) { + // IPv6 + // Unset prerouting + rulePreMasq := []string{"-s", podCIDRIPv6, "-d", externalIP, "-j", ChainNATKubeMarkMasq} + out, err := iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATExternalClusterPrerouting, req.String(), rulePreMasq...) + if err != nil { + logger.Error(err, out) + return err + } + rulePreDNAT := []string{"-s", podCIDRIPv6, "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} + out, err = iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATExternalClusterPrerouting, req.String(), rulePreDNAT...) + if err != nil { + logger.Error(err, out) + return err + } + + // Unset output + ruleOutMasq := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", ChainNATKubeMarkMasq} + out, err = iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATExternalClusterOutput, req.String(), ruleOutMasq...) + if err != nil { + logger.Error(err, out) + return err + } + ruleOutDNAT := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} + out, err = iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATExternalClusterOutput, req.String(), ruleOutDNAT...) + if err != nil { + logger.Error(err, out) + return err + } + } else { + if ip.IsVaildIP(clusterIP) { + logger.WithValues("clusterIP", clusterIP).Error(errors.New("invalid IP"), "invaild IP") + } + } + return nil +} + +func getPodCIDR(cidrs []string) (ipv4CIDR string, ipv6CIDR string) { + for _, cidr := range cidrs { + addr := strings.Split(cidr, "/")[0] + if ip.IsIPv4Addr(addr) { + ipv4CIDR = cidr + } else if ip.IsIPv6Addr(addr) { + ipv6CIDR = cidr + } + } + return +} + +func getSvcInfoFromRule(rule string) (nsName, src, dest, jump, dnatDest string) { + nsName = iptables.GetRuleComment(rule) + src = iptables.GetRuleSrc(rule) + dest = iptables.GetRuleDest(rule) + jump = iptables.GetRuleJump(rule) + dnatDest = iptables.GetRuleDNATDest(rule) + return +} diff --git a/controllers/service_controller.go b/controllers/service_controller.go index 6c4983b..21b0e87 100644 --- a/controllers/service_controller.go +++ b/controllers/service_controller.go @@ -18,9 +18,8 @@ package controllers import ( "context" - "errors" "os" - "strings" + "time" corev1 "k8s.io/api/core/v1" apierror "k8s.io/apimachinery/pkg/api/errors" @@ -31,8 +30,6 @@ import ( "github.com/go-logr/logr" "github.com/kakao/network-node-manager/pkg/configs" - "github.com/kakao/network-node-manager/pkg/ip" - "github.com/kakao/network-node-manager/pkg/iptables" ) // ServiceReconciler reconciles a Service object @@ -42,22 +39,15 @@ type ServiceReconciler struct { Scheme *runtime.Scheme } -// Constants -const ( - ChainNATPrerouting = "PREROUTING" - ChainNATOutput = "OUTPUT" - ChainNATKubeMasquerade = "KUBE-MARK-MASQ" - - ChainNATIPVSLBPrerouting = "NMANAGER_IPVS_LB_PREROUTING" - ChainNATIPVSLBOutput = "NMANAGER_IPVS_LB_OUTPUT" -) - // Variables var ( configNodeName string configIPv4Enabled bool configIPv6Enabled bool + configRuleExternalCluster bool + configRuleDropInvalidInput bool + initFlag = false podCIDRIPv4 string podCIDRIPv6 string @@ -91,478 +81,157 @@ func (r *ServiceReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) { logger.Error(err, "config error") os.Exit(1) } - logger.WithValues("node name", configNodeName).Info("config node name") configIPv4Enabled, configIPv6Enabled, err = configs.GetConfigNetStack() if err != nil { logger.Error(err, "config error") os.Exit(1) } - logger.WithValues("IPv4", configIPv4Enabled).WithValues("IPv6", configIPv6Enabled).Info("config network stack") - - // Get nodes's pod CIDR - node := &corev1.Node{} - if err := r.Client.Get(ctx, types.NamespacedName{Name: configNodeName}, node); err != nil { - logger.Error(err, "failed to get the pod's node info from API server") - os.Exit(1) - } - podCIDRIPv4, podCIDRIPv6 = getPodCIDR(node.Spec.PodCIDRs) - logger.WithValues("pod CIDR IPV4", podCIDRIPv4).WithValues("pod CIDR IPv6", podCIDRIPv6).Info("pod CIDR") - - // Init iptables - if err := initIptables(logger); err != nil { - logger.Error(err, "failed to init iptables") - os.Exit(1) - } - - // Get all services - svcs := &corev1.ServiceList{} - if err := r.Client.List(ctx, svcs, client.InNamespace("")); err != nil { - logger.Error(err, "failed to get all services from API server") + configRuleExternalCluster, err = configs.GetConfigRuleExternalCluster() + if err != nil { + logger.Error(err, "config error") os.Exit(1) } - - // Cleanup iptables for deleted services - if err := cleanupIptables(logger, svcs, podCIDRIPv4, podCIDRIPv6); err != nil { - logger.Error(err, "failed to cleanup iptables") + configRuleDropInvalidInput, err = configs.GetConfigRuleDropInvalidInput() + if err != nil { + logger.Error(err, "config error") os.Exit(1) } - } - // Get service info - svc := &corev1.Service{} - if err := r.Client.Get(ctx, req.NamespacedName, svc); err != nil { - if apierror.IsNotFound(err) { - // Not found service means that the service is removed. - // Delete iptables rules by using cache - - // Get service from cache - oldSvc, exist := serviceCache[req] - if !exist { - // If there is no service info in cache, skip it - return ctrl.Result{}, nil - } - - // Delete iptables rules - for _, oldIngress := range oldSvc.Status.LoadBalancer.Ingress { - oldClusterIP := oldSvc.Spec.ClusterIP - oldExternalIP := oldIngress.IP - - // Delete iptables rules - logger.WithValues("externalIP", oldExternalIP).Info("delete iptables rules") - if err := deleteIptablesRules(logger, &req, oldClusterIP, oldExternalIP, podCIDRIPv4, podCIDRIPv6); err != nil { - return ctrl.Result{}, err - } + logger.WithValues("node name", configNodeName).Info("config node name") + logger.WithValues("IPv4", configIPv4Enabled).WithValues("IPv6", configIPv6Enabled).Info("config network stack") + logger.WithValues("flag", configRuleExternalCluster).Info("config rule externalIP to clusterIP") + logger.WithValues("flag", configRuleDropInvalidInput).Info("config rule drop invalid in input chain") + + // Run rules first + if configRuleDropInvalidInput { + if err := createRulesDropInvalidInput(logger); err != nil { + logger.Error(err, "failed to create rule drop invalid input") + os.Exit(1) } - return ctrl.Result{}, nil } else { - logger.Error(err, "failed to get service info") - return ctrl.Result{}, err - } - } - - // Check service is LoadBalancer type - if svc.Spec.Type != corev1.ServiceTypeLoadBalancer { - return ctrl.Result{}, nil - } - - // Create iptables rules - for _, ingress := range svc.Status.LoadBalancer.Ingress { - clusterIP := svc.Spec.ClusterIP - externalIP := ingress.IP - - // Cache service to use when deleting service - serviceCache[req] = *svc.DeepCopy() - - // Create iptables rules - logger.WithValues("externalIP", externalIP).Info("create iptables rules") - if err := createIptablesRules(logger, &req, clusterIP, externalIP, podCIDRIPv4, podCIDRIPv6); err != nil { - return ctrl.Result{}, err - } - } - - return ctrl.Result{}, nil -} - -func (r *ServiceReconciler) SetupWithManager(mgr ctrl.Manager) error { - // Set controller manager - return ctrl.NewControllerManagedBy(mgr). - For(&corev1.Service{}). - Complete(r) -} - -func initIptables(logger logr.Logger) error { - logger.Info("create iptables chains") - - // IPv4 - if configIPv4Enabled { - // Create chain in nat table - logger.Info("create the IPVS IPv4 chains") - out, err := iptables.CreateChainIPv4(iptables.TableNAT, ChainNATIPVSLBPrerouting) - if err != nil { - logger.Error(err, out) - return err - } - out, err = iptables.CreateChainIPv4(iptables.TableNAT, ChainNATIPVSLBOutput) - if err != nil { - logger.Error(err, out) - return err - } - - // Set jump rule to each chain in nat table - logger.Info("create jump rules for the IPVS IPv4 chains") - ruleJumpPre := []string{"-j", ChainNATIPVSLBPrerouting} - out, err = iptables.CreateRuleFirstIPv4(iptables.TableNAT, ChainNATPrerouting, "", ruleJumpPre...) - if err != nil { - logger.Error(err, out) - return err - } - ruleJumpOut := []string{"-j", ChainNATIPVSLBOutput} - out, err = iptables.CreateRuleFirstIPv4(iptables.TableNAT, ChainNATOutput, "", ruleJumpOut...) - if err != nil { - logger.Error(err, out) - return err - } - } - // IPv6 - if configIPv6Enabled { - // Create chain in nat table - logger.Info("create the IPVS IPv6 chains") - out, err := iptables.CreateChainIPv6(iptables.TableNAT, ChainNATIPVSLBPrerouting) - if err != nil { - logger.Error(err, out) - return err - } - out, err = iptables.CreateChainIPv6(iptables.TableNAT, ChainNATIPVSLBOutput) - if err != nil { - logger.Error(err, out) - return err - } - - // Set jump rule to each chain in nat table - logger.Info("create jump rules for the IPVS IPv6 chains") - ruleJumpPre := []string{"-j", ChainNATIPVSLBPrerouting} - out, err = iptables.CreateRuleFirstIPv6(iptables.TableNAT, ChainNATPrerouting, "", ruleJumpPre...) - if err != nil { - logger.Error(err, out) - return err - } - ruleJumpOut := []string{"-j", ChainNATIPVSLBOutput} - out, err = iptables.CreateRuleFirstIPv6(iptables.TableNAT, ChainNATOutput, "", ruleJumpOut...) - if err != nil { - logger.Error(err, out) - return err - } - } - - return nil -} - -func cleanupIptables(logger logr.Logger, svcs *corev1.ServiceList, podCIDRIPv4, podCIDRIPv6 string) error { - logger.Info("cleanup iptables for deleted services") - - // IPv4 - if configIPv4Enabled { - // Make up service map - svcMap := make(map[string]*corev1.Service) - for _, svc := range svcs.Items { - if ip.IsIPv4Addr(svc.Spec.ClusterIP) && svc.Spec.Type == corev1.ServiceTypeLoadBalancer { - svcMap[svc.Namespace+"/"+svc.Name] = svc.DeepCopy() + if err := deleteRulesDropInvalidInput(logger); err != nil { + logger.Error(err, "failed to delete rule drop invalid input") + os.Exit(1) } } - // Cleanup prerouting chain - preRules, err := iptables.GetRulesIPv4(iptables.TableNAT, ChainNATIPVSLBPrerouting) - if err != nil { - return err - } - for _, rule := range preRules { - // Get service info from rule and k8s, and delete iptables rules - nsName, src, dest, jump, dnatDest := getSvcInfoFromRule(rule) - svc, ok := svcMap[nsName] - if !ok { - logger.WithValues("rule", rule).Info("there is no service info in k8s. cleanup prerouting chain IPv4 rule") - out, err := iptables.DeleteRuleRawIPv4(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) - if err != nil { - logger.Error(err, out) + // Run rules periodically + ticker := time.NewTicker(60 * time.Second) + go func() { + for { + <-ticker.C + if configRuleDropInvalidInput { + if err := createRulesDropInvalidInput(logger); err != nil { + logger.Error(err, "failed to cleanup rule drop invalid input") + } } - continue } + }() - // Compare service info and delete iptables rules - for _, ingress := range svc.Status.LoadBalancer.Ingress { - externalIP := ingress.IP + "/32" - if (jump == ChainNATKubeMasquerade && (src == podCIDRIPv4 && dest == externalIP)) || - (jump == "DNAT" && (src == podCIDRIPv4 && dest == externalIP && dnatDest == svc.Spec.ClusterIP)) { - continue - } - logger.WithValues("rule", rule).Info("service info is diff. cleanup prerouting chain IPv4 rule") - out, err := iptables.DeleteRuleRawIPv4(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) - if err != nil { - logger.Error(err, out) - } + if configRuleExternalCluster { + // Get nodes's pod CIDR + node := &corev1.Node{} + if err := r.Client.Get(ctx, types.NamespacedName{Name: configNodeName}, node); err != nil { + logger.Error(err, "failed to get the pod's node info from API server") + os.Exit(1) } - } + podCIDRIPv4, podCIDRIPv6 = getPodCIDR(node.Spec.PodCIDRs) + logger.WithValues("pod CIDR IPV4", podCIDRIPv4).WithValues("pod CIDR IPv6", podCIDRIPv6).Info("pod CIDR") - // Cleanup output chain - outRules, err := iptables.GetRulesIPv4(iptables.TableNAT, ChainNATIPVSLBOutput) - if err != nil { - return err - } - for _, rule := range outRules { - // Get service info from rule and k8s, and delete iptables rules - nsName, _, dest, jump, dnatDest := getSvcInfoFromRule(rule) - svc, ok := svcMap[nsName] - if !ok { - logger.WithValues("rule", rule).Info("there is no service info in k8s. cleanup output chain IPv4 rule") - iptables.DeleteRuleRawIPv4(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) - continue + // Initialize externalIP to clusterIP rules + if err := initRulesExternalCluster(logger); err != nil { + logger.Error(err, "failed to initalize rule externalIP to clusterIP") + os.Exit(1) } - // Compare service info and delete diff iptables rules - for _, ingress := range svc.Status.LoadBalancer.Ingress { - externalIP := ingress.IP + "/32" - if (jump == ChainNATKubeMasquerade && dest == externalIP) || - (jump == "DNAT" && (dest == externalIP && dnatDest == svc.Spec.ClusterIP)) { - continue - } - logger.WithValues("rule", rule).Info("service info is diff. cleanup output chain IPv4 rule") - out, err := iptables.DeleteRuleRawIPv4(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) - if err != nil { - logger.Error(err, out) - } + // Get all services + svcs := &corev1.ServiceList{} + if err := r.Client.List(ctx, svcs, client.InNamespace("")); err != nil { + logger.Error(err, "failed to get all services from API server") + os.Exit(1) } - } - } - // IPv6 - if configIPv6Enabled { - // Make up service map - svcMap := make(map[string]*corev1.Service) - for _, svc := range svcs.Items { - if ip.IsIPv6Addr(svc.Spec.ClusterIP) && svc.Spec.Type == corev1.ServiceTypeLoadBalancer { - svcMap[svc.Namespace+"/"+svc.Name] = svc.DeepCopy() - } - } - // Cleanup prerouting chain - preRules, err := iptables.GetRulesIPv6(iptables.TableNAT, ChainNATIPVSLBPrerouting) - if err != nil { - return err - } - for _, rule := range preRules { - // Get service info from rule and k8s, and delete iptables rules - nsName, src, dest, jump, dnatDest := getSvcInfoFromRule(rule) - svc, ok := svcMap[nsName] - if !ok { - logger.WithValues("rule", rule).Info("there is no service info in k8s. cleanup prerouting chain IPv6 rule") - out, err := iptables.DeleteRuleRawIPv6(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) - if err != nil { - logger.Error(err, out) - } - continue + // Cleanup externalIP to clusterIP rules for deleted services + if err := cleanupRulesExternalCluster(logger, svcs, podCIDRIPv4, podCIDRIPv6); err != nil { + logger.Error(err, "failed to cleanup rule externalIP to clusterIP") + os.Exit(1) } - - // Compare service info and delete iptables rules - for _, ingress := range svc.Status.LoadBalancer.Ingress { - externalIP := ingress.IP + "/128" - if (jump == ChainNATKubeMasquerade && (src == podCIDRIPv6 && dest == externalIP)) || - (jump == "DNAT" && (src == podCIDRIPv6 && dest == externalIP && dnatDest == svc.Spec.ClusterIP)) { - continue - } - logger.WithValues("rule", rule).Info("service info is diff. cleanup prerouting chain IPv6 rule") - out, err := iptables.DeleteRuleRawIPv6(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) - if err != nil { - logger.Error(err, out) - } + } else { + // Destroy externalIP to clusterIP rules + if err := destoryRulesExternalCluster(logger); err != nil { + logger.Error(err, "failed to destroy rule externalIP to clusterIP") + os.Exit(1) } } + } - // Cleanup output - outRules, err := iptables.GetRulesIPv6(iptables.TableNAT, ChainNATIPVSLBOutput) - if err != nil { - return err + // Loop + if configRuleExternalCluster { + // In case the iptables chain is deleted, initalize again + if err := initRulesExternalCluster(logger); err != nil { + logger.Error(err, "failed to initalize rule externalIP to clusterIP") + os.Exit(1) } - for _, rule := range outRules { - // Get service info from rule and k8s, and delete iptables rules - nsName, _, dest, jump, dnatDest := getSvcInfoFromRule(rule) - svc, ok := svcMap[nsName] - if !ok { - logger.WithValues("rule", rule).Info("there is no service info in k8s. cleanup output chain IPv6 rule") - iptables.DeleteRuleRawIPv6(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) - continue - } - // Compare service info and delete diff iptables rules - for _, ingress := range svc.Status.LoadBalancer.Ingress { - externalIP := ingress.IP + "/128" - if (jump == ChainNATKubeMasquerade && dest == externalIP) || - (jump == "DNAT" && (dest == externalIP && dnatDest == svc.Spec.ClusterIP)) { - continue + // Get service info + svc := &corev1.Service{} + if err := r.Client.Get(ctx, req.NamespacedName, svc); err != nil { + if apierror.IsNotFound(err) { + // Not found service means that the service is removed. + // Delete iptables rules by using cache + + // Get service from cache + oldSvc, exist := serviceCache[req] + if !exist { + // If there is no service info in cache, skip it + return ctrl.Result{}, nil } - logger.WithValues("rule", rule).Info("service info is diff. cleanup output chain IPv6 rule") - iptables.DeleteRuleRawIPv6(iptables.TableNAT, iptables.ChangeRuleToDelete(rule)...) - } - } - } - return nil -} + // Delete rules + for _, oldIngress := range oldSvc.Status.LoadBalancer.Ingress { + oldClusterIP := oldSvc.Spec.ClusterIP + oldExternalIP := oldIngress.IP -func createIptablesRules(logger logr.Logger, req *ctrl.Request, clusterIP, externalIP, podCIDRIPv4, podCIDRIPv6 string) error { - // Don't use spec.ipFamily to distingush between IPv4 and IPv6 Address - // for kubernetes version that dosen't support IPv6 dualstack - if configIPv4Enabled && ip.IsIPv4Addr(clusterIP) { - // IPv4 - // Set prerouting - rulePreMasq := []string{"-s", podCIDRIPv4, "-d", externalIP, "-j", ChainNATKubeMasquerade} - out, err := iptables.CreateRuleLastIPv4(iptables.TableNAT, ChainNATIPVSLBPrerouting, req.String(), rulePreMasq...) - if err != nil { - logger.Error(err, out) - return err - } - rulePreDNAT := []string{"-s", podCIDRIPv4, "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} - out, err = iptables.CreateRuleLastIPv4(iptables.TableNAT, ChainNATIPVSLBPrerouting, req.String(), rulePreDNAT...) - if err != nil { - logger.Error(err, out) - return err - } - - // Set output - ruleOutMasq := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", ChainNATKubeMasquerade} - out, err = iptables.CreateRuleLastIPv4(iptables.TableNAT, ChainNATIPVSLBOutput, req.String(), ruleOutMasq...) - if err != nil { - logger.Error(err, out) - return err - } - ruleOutDNAT := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} - out, err = iptables.CreateRuleLastIPv4(iptables.TableNAT, ChainNATIPVSLBOutput, req.String(), ruleOutDNAT...) - if err != nil { - logger.Error(err, out) - return err - } - } else if configIPv6Enabled && ip.IsIPv6Addr(clusterIP) { - // IPv6 - // Set prerouting - rulePreMasq := []string{"-s", podCIDRIPv6, "-d", externalIP, "-j", ChainNATKubeMasquerade} - out, err := iptables.CreateRuleLastIPv6(iptables.TableNAT, ChainNATIPVSLBPrerouting, req.String(), rulePreMasq...) - if err != nil { - logger.Error(err, out) - return err - } - rulePreDNAT := []string{"-s", podCIDRIPv6, "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} - out, err = iptables.CreateRuleLastIPv6(iptables.TableNAT, ChainNATIPVSLBPrerouting, req.String(), rulePreDNAT...) - if err != nil { - logger.Error(err, out) - return err + // Delete rules + logger.WithValues("externalIP", oldExternalIP).WithValues("clusterIP", oldClusterIP).Info("delete rule externalIp to clusterIP") + if err := deleteRulesExternalCluster(logger, &req, oldClusterIP, oldExternalIP, podCIDRIPv4, podCIDRIPv6); err != nil { + return ctrl.Result{}, err + } + } + return ctrl.Result{}, nil + } else { + logger.Error(err, "failed to get service info") + return ctrl.Result{}, err + } } - // Set output - ruleOutMasq := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", ChainNATKubeMasquerade} - out, err = iptables.CreateRuleLastIPv6(iptables.TableNAT, ChainNATIPVSLBOutput, req.String(), ruleOutMasq...) - if err != nil { - logger.Error(err, out) - return err - } - ruleOutDNAT := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} - out, err = iptables.CreateRuleLastIPv6(iptables.TableNAT, ChainNATIPVSLBOutput, req.String(), ruleOutDNAT...) - if err != nil { - logger.Error(err, out) - return err - } - } else { - if ip.IsVaildIP(clusterIP) { - logger.WithValues("clusterIP", clusterIP).Error(errors.New("invalid IP"), "invaild IP") + // Check service is LoadBalancer type + if svc.Spec.Type != corev1.ServiceTypeLoadBalancer { + return ctrl.Result{}, nil } - } - return nil -} + // Create rules + for _, ingress := range svc.Status.LoadBalancer.Ingress { + clusterIP := svc.Spec.ClusterIP + externalIP := ingress.IP -func deleteIptablesRules(logger logr.Logger, req *ctrl.Request, clusterIP, externalIP, podCIDRIPv4, podCIDRIPv6 string) error { - // Don't use spec.ipFamily to distingush between IPv4 and IPv6 Address - // for kubernetes version that dosen't support IPv6 dualstack - if configIPv4Enabled && ip.IsIPv4Addr(clusterIP) { - // IPv4 - // Unset prerouting - rulePreMasq := []string{"-s", podCIDRIPv4, "-d", externalIP, "-j", ChainNATKubeMasquerade} - out, err := iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATIPVSLBPrerouting, req.String(), rulePreMasq...) - if err != nil { - logger.Error(err, out) - return err - } - rulePreDNAT := []string{"-s", podCIDRIPv4, "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} - out, err = iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATIPVSLBPrerouting, req.String(), rulePreDNAT...) - if err != nil { - logger.Error(err, out) - return err - } + // Cache service to use deleting service + serviceCache[req] = *svc.DeepCopy() - // Unset output - ruleOutMasq := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", ChainNATKubeMasquerade} - out, err = iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATIPVSLBOutput, req.String(), ruleOutMasq...) - if err != nil { - logger.Error(err, out) - return err - } - ruleOutDNAT := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} - out, err = iptables.DeleteRuleIPv4(iptables.TableNAT, ChainNATIPVSLBOutput, req.String(), ruleOutDNAT...) - if err != nil { - logger.Error(err, out) - return err - } - } else if configIPv6Enabled && ip.IsIPv6Addr(clusterIP) { - // IPv6 - // Unset prerouting - rulePreMasq := []string{"-s", podCIDRIPv6, "-d", externalIP, "-j", ChainNATKubeMasquerade} - out, err := iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATIPVSLBPrerouting, req.String(), rulePreMasq...) - if err != nil { - logger.Error(err, out) - return err - } - rulePreDNAT := []string{"-s", podCIDRIPv6, "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} - out, err = iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATIPVSLBPrerouting, req.String(), rulePreDNAT...) - if err != nil { - logger.Error(err, out) - return err - } - - // Unset output - ruleOutMasq := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", ChainNATKubeMasquerade} - out, err = iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATIPVSLBOutput, req.String(), ruleOutMasq...) - if err != nil { - logger.Error(err, out) - return err - } - ruleOutDNAT := []string{"-m", "addrtype", "--src-type", "LOCAL", "-d", externalIP, "-j", "DNAT", "--to-destination", clusterIP} - out, err = iptables.DeleteRuleIPv6(iptables.TableNAT, ChainNATIPVSLBOutput, req.String(), ruleOutDNAT...) - if err != nil { - logger.Error(err, out) - return err - } - } else { - if ip.IsVaildIP(clusterIP) { - logger.WithValues("clusterIP", clusterIP).Error(errors.New("invalid IP"), "invaild IP") + // Create rules + logger.WithValues("externalIP", externalIP).WithValues("clusterIP", clusterIP).Info("create iptables rules") + if err := createRulesExternalCluster(logger, &req, clusterIP, externalIP, podCIDRIPv4, podCIDRIPv6); err != nil { + return ctrl.Result{}, err + } } } - return nil -} -func getPodCIDR(cidrs []string) (ipv4CIDR string, ipv6CIDR string) { - for _, cidr := range cidrs { - addr := strings.Split(cidr, "/")[0] - if ip.IsIPv4Addr(addr) { - ipv4CIDR = cidr - } else if ip.IsIPv6Addr(addr) { - ipv6CIDR = cidr - } - } - return + return ctrl.Result{}, nil } -func getSvcInfoFromRule(rule string) (nsName, src, dest, jump, dnatDest string) { - nsName = iptables.GetRuleComment(rule) - src = iptables.GetRuleSrc(rule) - dest = iptables.GetRuleDest(rule) - jump = iptables.GetRuleJump(rule) - dnatDest = iptables.GetRuleDNATDest(rule) - return +func (r *ServiceReconciler) SetupWithManager(mgr ctrl.Manager) error { + // Set controller manager + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.Service{}). + Complete(r) } diff --git a/deploy/network-node-manager.yml b/deploy/network-node-manager_iptables.yml similarity index 100% rename from deploy/network-node-manager.yml rename to deploy/network-node-manager_iptables.yml diff --git a/deploy/network-node-manager_ipvs.yml b/deploy/network-node-manager_ipvs.yml new file mode 100644 index 0000000..f1cf0eb --- /dev/null +++ b/deploy/network-node-manager_ipvs.yml @@ -0,0 +1,97 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: network-node-manager-role +rules: +- apiGroups: + - "" + resources: + - services + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - services/status + verbs: + - get + - patch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: network-node-manager + namespace: kube-system + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: network-node-manager-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: network-node-manager-role +subjects: +- kind: ServiceAccount + name: network-node-manager + namespace: kube-system + +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: network-node-manager + namespace: kube-system + labels: + control-plane: network-node-manager +spec: + selector: + matchLabels: + control-plane: network-node-manager + template: + metadata: + labels: + control-plane: network-node-manager + spec: + serviceAccountName: network-node-manager + hostNetwork: true + containers: + - command: + - /network-node-manager + args: + - --metrics-addr=0 + image: kakaocorp/network-node-manager:latest + name: network-node-manager + resources: + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 100m + memory: 100Mi + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: RULE_EXTERNAL_CLUSTER + value: "true" + securityContext: + capabilities: + add: ["NET_ADMIN"] + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master diff --git a/img/network-node-manager.pptx b/img/network-node-manager.pptx new file mode 100644 index 0000000..7b4408a Binary files /dev/null and b/img/network-node-manager.pptx differ diff --git a/img/network-node-manager_Architecture.PNG b/img/network-node-manager_Architecture.PNG new file mode 100644 index 0000000..503b184 Binary files /dev/null and b/img/network-node-manager_Architecture.PNG differ diff --git a/issues/connection_reset_issue_pod_out_cluster.md b/issues/connection_reset_issue_pod_out_cluster.md new file mode 100644 index 0000000..2c6ab3e --- /dev/null +++ b/issues/connection_reset_issue_pod_out_cluster.md @@ -0,0 +1,27 @@ +# Connection Reset Issue between Pod and Out of Cluster + +In general, when a container transmits a packet to outside the cluster, the packet is SNAT'ed. After that, when receiving a response packet from outside the container, the response packet must be DNAT'ed and delivered to the pod. However, due to the conntrack bug of the kernel, the response packet is regarded as an invliad packet, so there is a problem that the respose packet is not DNAT'ed and transmiited to the host. The host considers the response packet sent to the host as an incorrectly transmitted packet and disconnects the TCP connection by sending a reset packet. + +## How to solve it + +network-node-manager adds DROP rule for invalid packet to INPUT chain in filter table. Below are rules set by network-node-manager. + +``` +$ iptables -nvL -t filter +... +Chain INPUT (policy ACCEPT 3229 packets, 597K bytes) + pkts bytes target prot opt in out source destination + 246K 56M NMANAGER_INPUT all -- * * 0.0.0.0/0 0.0.0.0/0 +... +Chain NMANAGER_INPUT (1 references) + pkts bytes target prot opt in out source destination + 246K 56M NMANAGER_DROP_INVALID_INPUT all -- * * 0.0.0.0/0 0.0.0.0/0 + +Chain NMANAGER_DROP_INVALID_INPUT (1 references) + pkts bytes target prot opt in out source destination + 0 0 DROP all -- * * 0.0.0.0/0 0.0.0.0/0 ctstate INVALID +``` + +## References + +* https://github.com/moby/libnetwork/issues/1090 diff --git a/issues/external_IP_access_issue_IPVS_proxy_mode.md b/issues/external_IP_access_issue_IPVS_proxy_mode.md new file mode 100644 index 0000000..b15b967 --- /dev/null +++ b/issues/external_IP_access_issue_IPVS_proxy_mode.md @@ -0,0 +1,52 @@ +# External-IP Access Issue with IPVS Proxy Mode + +When using IPVS proxy mode, External-IP assigned through the LoadBalancer type service with externalTrafficPolcy=Local option cannot be accessed from inside the cluster. Currently kubernetes is not preparing any patch to fix this issue. + +## How to solve it + +network-node-manager adds two DNAT rules for each LoadBalancer type service. One is added to the prerouting chain and the other is added to the output chain. The DNAT rule in the prerouting chain is for the pod that uses pod-only network namespace. On the other hand, The DNAT rule in the output chain is for the pod that uses host network namespace. All DNAT rules only target packets from pods on the host. Below are example rules set by network-node-manager. + +``` +$ kubectl -n default get service +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +lb-service-1 LoadBalancer 10.231.42.164 10.19.20.201 80:31751/TCP,443:30126/TCP 16d +lb-service-2 LoadBalancer 10.231.2.62 10.19.22.57 80:32352/TCP,443:31549/TCP 16d + +$ iptables -nvL -t nat +... +Chain PREROUTING (policy ACCEPT 0 packets, 0 bytes) + pkts bytes target prot opt in out source destination + 190 20813 NMANAGER_PREROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 + +Chain OUTPUT (policy ACCEPT 3 packets, 180 bytes) + pkts bytes target prot opt in out source destination + 651 41106 NMANAGER_OUTPUT all -- * * 0.0.0.0/0 0.0.0.0/0 +... +Chain NMANAGER_OUTPUT (1 references) + pkts bytes target prot opt in out source destination + 650 41046 NMANAGER_EX_CLUS_OUTPUT all -- * * 0.0.0.0/0 0.0.0.0/0 + +Chain NMANAGER_PREROUTING (1 references) + pkts bytes target prot opt in out source destination + 190 20813 NMANAGER_EX_CLUS_PREROUTING all -- * * 0.0.0.0/0 0.0.0.0/0 +... +Chain NMANAGER_EX_CLUS_OUTPUT (1 references) + pkts bytes target prot opt in out source destination + 0 0 KUBE-MARK-MASQ all -- * * 0.0.0.0/0 10.19.20.201 /* default/lb-service-1 */ ADDRTYPE match src-type LOCAL + 0 0 DNAT all -- * * 0.0.0.0/0 10.19.20.201 /* default/lb-service-1 */ ADDRTYPE match src-type LOCAL to:10.231.42.164 + 2 120 KUBE-MARK-MASQ all -- * * 0.0.0.0/0 10.19.22.57 /* default/lb-service-2 */ ADDRTYPE match src-type LOCAL + 2 120 DNAT all -- * * 0.0.0.0/0 10.19.22.57 /* default/lb-service-2 */ ADDRTYPE match src-type LOCAL to:10.231.2.62 + +Chain NMANAGER_EX_CLUS_PREROUTING (1 references) + pkts bytes target prot opt in out source destination + 0 0 KUBE-MARK-MASQ all -- * * 10.240.2.128/25 10.19.20.201 /* default/lb-service-1 */ + 0 0 DNAT all -- * * 10.240.2.128/25 10.19.20.201 /* default/lb-service-1 */ to:10.231.42.164 + 1 60 KUBE-MARK-MASQ all -- * * 10.240.2.128/25 10.19.22.57 /* default/lb-service-2 */ + 1 60 DNAT all -- * * 10.240.2.128/25 10.19.22.57 /* default/lb-service-2 */ to:10.231.2.62 +... +``` + +## References + +* https://github.com/kubernetes/kubernetes/issues/75262 + diff --git a/main.go b/main.go index 6843d13..bd0d4c0 100644 --- a/main.go +++ b/main.go @@ -50,6 +50,7 @@ func main() { ctrl.SetLogger(zap.New(zap.UseDevMode(true))) + // Initalize controller manager mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, MetricsBindAddress: metricsAddr, @@ -62,6 +63,7 @@ func main() { os.Exit(1) } + // Initialize service controller if err = (&controllers.ServiceReconciler{ Client: mgr.GetClient(), Log: ctrl.Log.WithName("controllers").WithName("Service"), @@ -72,6 +74,7 @@ func main() { } // +kubebuilder:scaffold:builder + // Run service controller setupLog.Info("starting manager") if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "problem running manager") diff --git a/pkg/configs/configs.go b/pkg/configs/configs.go index 79e70bd..4a3d177 100644 --- a/pkg/configs/configs.go +++ b/pkg/configs/configs.go @@ -7,11 +7,16 @@ import ( ) const ( - EnvNodeName = "NODE_NAME" + EnvNodeName = "NODE_NAME" + EnvConfigTrue = "true" + EnvConfigFalse = "false" EnvNetStack = "NET_STACK" EnvNetStackIPv4 = "ipv4" EnvNetStackIPv6 = "ipv6" + + EnvRuleExternalCluster = "RULE_EXTERNAL_CLUSTER" + EnvRuleDropInvalidInput = "RULE_DROP_INVALID_INPUT" ) func GetConfigNodeName() (string, error) { @@ -41,8 +46,42 @@ func GetConfigNetStack() (bool, bool, error) { } else if config == EnvNetStackIPv6 { ipv6 = true } else { - return false, false, fmt.Errorf("wrong network stack config : %s", config) + return false, false, fmt.Errorf("wrong config for network stack : %s", config) } } return ipv4, ipv6, nil } + +func GetConfigRuleExternalCluster() (bool, error) { + // organize configs + config := os.Getenv(EnvRuleExternalCluster) + config = strings.ToLower(config) + if config == "" { + return false, nil + } + + // return configs + if config == EnvConfigFalse { + return false, nil + } else if config == EnvConfigTrue { + return true, nil + } + return false, fmt.Errorf("wrong config for externalIP to clusterIP DNAT : %s", config) +} + +func GetConfigRuleDropInvalidInput() (bool, error) { + // organize configs + config := os.Getenv(EnvRuleDropInvalidInput) + config = strings.ToLower(config) + if config == "" { + return true, nil + } + + // return configs + if config == EnvConfigFalse { + return false, nil + } else if config == EnvConfigTrue { + return true, nil + } + return false, fmt.Errorf("wrong config for drop invalid packet in INPUT chain : %s", config) +} diff --git a/pkg/configs/configs_test.go b/pkg/configs/configs_test.go index 238f620..70ae957 100644 --- a/pkg/configs/configs_test.go +++ b/pkg/configs/configs_test.go @@ -62,3 +62,29 @@ func TestGetConfigNetStack(t *testing.T) { t.Errorf("wrong result - %s", "ipv6,ipv4") } } + +func TestGetConfigRuleExternalCluster(t *testing.T) { + os.Setenv(EnvRuleExternalCluster, "") + flag, _ := GetConfigRuleExternalCluster() + if flag { + t.Errorf("wrong result - %s", "") + } + + os.Setenv(EnvRuleExternalCluster, "false") + flag, _ = GetConfigRuleExternalCluster() + if flag { + t.Errorf("wrong result - %s", "false") + } + + os.Setenv(EnvRuleExternalCluster, "true") + flag, _ = GetConfigRuleExternalCluster() + if !flag { + t.Errorf("wrong result - %s", "true") + } + + os.Setenv(EnvRuleExternalCluster, "none") + _, err := GetConfigRuleExternalCluster() + if err == nil { + t.Errorf("wrong result - %s", "none") + } +} diff --git a/pkg/iptables/iptables.go b/pkg/iptables/iptables.go index deadc50..8bc80f1 100644 --- a/pkg/iptables/iptables.go +++ b/pkg/iptables/iptables.go @@ -16,7 +16,9 @@ const ( iptablesCmdIPv6 = "ip6tables" iptablesSaveCmdIPv4 = "iptables-save" iptablesSaveCmdIPv6 = "ip6tables-save" + iptablesErrNoRule = "No chain/target/match by that name" + iptablesErrNoTarget = "Couldn't load target" TableNAT Table = "nat" TableFilter Table = "filter" @@ -113,6 +115,13 @@ func deleteChain(iptablesCmd string, table Table, chain string) (string, error) return string(out), nil } + // Flush chain + cmd = exec.Command(iptablesCmd, append(args, "-F", chain)...) + out, err = cmd.CombinedOutput() + if err != nil { + return string(out), err + } + // Delete chain cmd = exec.Command(iptablesCmd, append(args, "-X", chain)...) out, err = cmd.CombinedOutput() @@ -288,6 +297,9 @@ func deleteRule(iptablesCmd string, table Table, chain string, comment string, r if strings.Contains(string(out), iptablesErrNoRule) { // If rule isn't exist, return success return string(out), nil + } else if strings.Contains(string(out), iptablesErrNoTarget) { + // If target isn't exit, return success + return string(out), nil } return string(out), err }