From 34820394bb60d17388eb3efd56caee7b63a321cb Mon Sep 17 00:00:00 2001 From: Irbe Krumina Date: Wed, 6 Sep 2023 19:33:52 +0100 Subject: [PATCH] WIP:cmd/k8s-operator: use nftables for proxy Signed-off-by: Irbe Krumina --- Makefile | 2 +- cmd/containerboot/main.go | 107 ++++++++++++++++------- cmd/containerboot/main_test.go | 1 + cmd/k8s-operator/manifests/operator.yaml | 2 + cmd/k8s-operator/manifests/proxy.yaml | 2 + cmd/k8s-operator/operator.go | 6 +- cmd/k8s-operator/sts.go | 5 ++ 7 files changed, 92 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index c4391b90a..a395b46c8 100644 --- a/Makefile +++ b/Makefile @@ -67,7 +67,7 @@ publishdevimage: ## Build and publish tailscale image to location specified by $ @test "${REPO}" != "ghcr.io/tailscale/tailscale" || (echo "REPO=... must not be ghcr.io/tailscale/tailscale" && exit 1) @test "${REPO}" != "tailscale/k8s-operator" || (echo "REPO=... must not be tailscale/k8s-operator" && exit 1) @test "${REPO}" != "ghcr.io/tailscale/k8s-operator" || (echo "REPO=... must not be ghcr.io/tailscale/k8s-operator" && exit 1) - TAGS=latest REPOS=${REPO} PUSH=true TARGET=client ./build_docker.sh + TAGS=latest REPOS=${REPO} PUSH=true TARGET=client BASE="gcr.io/csi-test-290908/alpine:3.18" ./build_docker.sh publishdevoperator: ## Build and publish k8s-operator image to location specified by ${REPO} @test -n "${REPO}" || (echo "REPO=... required; e.g. REPO=ghcr.io/${USER}/tailscale" && exit 1) diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index 3676db02c..755e5cd89 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -658,9 +658,9 @@ func installEgressForwardingRule(ctx context.Context, dstStr string, tsIPs []net if err != nil { return err } - argv0 := "iptables" + family := "ip" if dst.Is6() { - argv0 = "ip6tables" + family = "ip6" } var local string for _, pfx := range tsIPs { @@ -676,26 +676,54 @@ func installEgressForwardingRule(ctx context.Context, dstStr string, tsIPs []net if local == "" { return fmt.Errorf("no tailscale IP matching family of %s found in %v", dstStr, tsIPs) } - // Technically, if the control server ever changes the IPs assigned to this - // node, we'll slowly accumulate iptables rules. This shouldn't happen, so - // for now we'll live with it. - // Set up a rule that ensures that all packets - // except for those received on tailscale0 interface is forwarded to - // destination address - cmdDNAT := exec.CommandContext(ctx, argv0, "-t", "nat", "-I", "PREROUTING", "1", "!", "-i", "tailscale0", "-j", "DNAT", "--to-destination", dstStr) - cmdDNAT.Stdout = os.Stdout - cmdDNAT.Stderr = os.Stderr - if err := cmdDNAT.Run(); err != nil { - return fmt.Errorf("executing iptables failed: %w", err) + + // Ensure a nat table exists. If tailscaled is running in nftables mode + // nat table and POSTROUTING chain will already exist. This is fine as + // creating those is an idempotent operation. + nft := "nft" + natTable := "nat" + cmdAddTable := exec.CommandContext(ctx, nft, "add", "table", family, natTable) + cmdAddTable.Stdout = os.Stdout + cmdAddTable.Stderr = os.Stderr + if err := cmdAddTable.Run(); err != nil { + return fmt.Errorf("adding nat table failed: %w", err) } - // Set up a rule that ensures that all packets sent to the destination - // address will have the proxy's IP set as source IP - cmdSNAT := exec.CommandContext(ctx, argv0, "-t", "nat", "-I", "POSTROUTING", "1", "--destination", dstStr, "-j", "SNAT", "--to-source", local) - cmdSNAT.Stdout = os.Stdout - cmdSNAT.Stderr = os.Stderr - if err := cmdSNAT.Run(); err != nil { - return fmt.Errorf("setting up SNAT via iptables failed: %w", err) + preroutingChain := "PREROUTING" + cmdCreatePreroutingChain := exec.CommandContext(ctx, nft, "--", "add", "chain", family, natTable, preroutingChain, "{ type nat hook prerouting priority -100; policy accept; }") + cmdCreatePreroutingChain.Stdout = os.Stdout + cmdCreatePreroutingChain.Stderr = os.Stderr + if err := cmdCreatePreroutingChain.Run(); err != nil { + return fmt.Errorf("adding prerouting chain failed: %w", err) } + + // running this multiple times will result in multiple rules, but that + // won't affect the functionality and hopefully we'll never run in that + // often that it becomes a performance problem + cmdCreateDNATRule := exec.CommandContext(ctx, nft, "insert", "rule", family, natTable, preroutingChain, "iifname", "!=", "tailscale0*", "dnat", "to", dstStr) + cmdCreateDNATRule.Stdout = os.Stdout + cmdCreateDNATRule.Stderr = os.Stderr + if err := cmdCreateDNATRule.Run(); err != nil { + return fmt.Errorf("adding DNAT rule failed: %w", err) + } + + postRoutingChain := "POSTROUTING" + cmdCreatePostRoutingChain := exec.CommandContext(ctx, nft, "add", "chain", family, natTable, postRoutingChain, "{ type nat hook postrouting priority srcnat; policy accept; }") + cmdCreatePostRoutingChain.Stdout = os.Stdout + cmdCreatePostRoutingChain.Stderr = os.Stderr + if err := cmdCreatePostRoutingChain.Run(); err != nil { + return fmt.Errorf("adding postrouting chain failed: %w", err) + } + + // running this multiple times will result in multiple rules, but that + // won't affect the functionality and hopefully we'll never run in that + // often that it becomes a performance problem + cmdCreateSNATRule := exec.CommandContext(ctx, nft, "insert", "rule", family, natTable, postRoutingChain, "ip", "daddr", dstStr, "snat", "to", local) + cmdCreateSNATRule.Stdout = os.Stdout + cmdCreateSNATRule.Stderr = os.Stderr + if err := cmdCreateSNATRule.Run(); err != nil { + return fmt.Errorf("adding SNAT rule failed: %w", err) + } + return nil } @@ -704,9 +732,9 @@ func installIngressForwardingRule(ctx context.Context, dstStr string, tsIPs []ne if err != nil { return err } - argv0 := "iptables" + family := "ip" if dst.Is6() { - argv0 = "ip6tables" + family = "ip6" } var local string for _, pfx := range tsIPs { @@ -722,15 +750,34 @@ func installIngressForwardingRule(ctx context.Context, dstStr string, tsIPs []ne if local == "" { return fmt.Errorf("no tailscale IP matching family of %s found in %v", dstStr, tsIPs) } - // Technically, if the control server ever changes the IPs assigned to this - // node, we'll slowly accumulate iptables rules. This shouldn't happen, so - // for now we'll live with it. - cmd := exec.CommandContext(ctx, argv0, "-t", "nat", "-I", "PREROUTING", "1", "-d", local, "-j", "DNAT", "--to-destination", dstStr) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - return fmt.Errorf("executing iptables failed: %w", err) + // Create a nat table + nft := "nft" + natTable := "nat" + // adding a table and a chain are idempotent operations + cmdAddTable := exec.CommandContext(ctx, nft, "add", "table", family, natTable) + cmdAddTable.Stdout = os.Stdout + cmdAddTable.Stderr = os.Stderr + if err := cmdAddTable.Run(); err != nil { + return fmt.Errorf("adding nat table failed: %w", err) } + preroutingChain := "PREROUTING" + cmdCreatePreroutingChain := exec.CommandContext(ctx, nft, "--", "add", "chain", family, natTable, preroutingChain, "{ type nat hook prerouting priority -100; policy accept; }") + cmdCreatePreroutingChain.Stdout = os.Stdout + cmdCreatePreroutingChain.Stderr = os.Stderr + if err := cmdAddTable.Run(); err != nil { + return fmt.Errorf("adding prerouting chain failed: %w", err) + } + + // running this multiple times will result in multiple rules, but that + // won't affect the functionality and hopefully we'll never run in that + // often that it becomes a performance problem + cmdCreateDNATRule := exec.CommandContext(ctx, nft, "insert", "rule", family, natTable, preroutingChain, "ip", "daddr", local, "dnat", "to", dstStr) + cmdCreateDNATRule.Stdout = os.Stdout + cmdCreateDNATRule.Stderr = os.Stderr + if err := cmdAddTable.Run(); err != nil { + return fmt.Errorf("adding DNAT rule failed: %w", err) + } + return nil } diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go index e1353a8e6..95c30bef0 100644 --- a/cmd/containerboot/main_test.go +++ b/cmd/containerboot/main_test.go @@ -69,6 +69,7 @@ func TestContainerBoot(t *testing.T) { "usr/bin/tailscaled": fakeTailscaled, "usr/bin/tailscale": fakeTailscale, "usr/bin/iptables": fakeTailscale, + "usr/bin/nft": fakeTailscale, "usr/bin/ip6tables": fakeTailscale, "dev/net/tun": []byte(""), "proc/sys/net/ipv4/ip_forward": []byte("0"), diff --git a/cmd/k8s-operator/manifests/operator.yaml b/cmd/k8s-operator/manifests/operator.yaml index 4da7a4dae..40b30ba08 100644 --- a/cmd/k8s-operator/manifests/operator.yaml +++ b/cmd/k8s-operator/manifests/operator.yaml @@ -139,6 +139,8 @@ spec: value: operator - name: OPERATOR_LOGGING value: info + - name: RUN_IN_RESTRICTED_ENV + value: "true" - name: OPERATOR_NAMESPACE valueFrom: fieldRef: diff --git a/cmd/k8s-operator/manifests/proxy.yaml b/cmd/k8s-operator/manifests/proxy.yaml index 361af8910..5f8cd2417 100644 --- a/cmd/k8s-operator/manifests/proxy.yaml +++ b/cmd/k8s-operator/manifests/proxy.yaml @@ -31,6 +31,8 @@ spec: value: "false" - name: TS_AUTH_ONCE value: "true" + - name: TS_DEBUG_FIREWALL_MODE + value: "nftables" securityContext: capabilities: add: diff --git a/cmd/k8s-operator/operator.go b/cmd/k8s-operator/operator.go index 5d78f4c25..177393c33 100644 --- a/cmd/k8s-operator/operator.go +++ b/cmd/k8s-operator/operator.go @@ -53,6 +53,7 @@ func main() { priorityClassName = defaultEnv("PROXY_PRIORITY_CLASS_NAME", "") tags = defaultEnv("PROXY_TAGS", "tag:k8s") shouldRunAuthProxy = defaultBool("AUTH_PROXY", false) + runInRestrictedEnv = defaultBool("RUN_IN_RESTRICTED_ENV", false) ) var opts []kzap.Opts @@ -73,7 +74,7 @@ func main() { if shouldRunAuthProxy { launchAuthProxy(zlog, restConfig, s) } - startReconcilers(zlog, s, tsNamespace, restConfig, tsClient, image, priorityClassName, tags) + startReconcilers(zlog, s, tsNamespace, restConfig, tsClient, image, priorityClassName, tags, runInRestrictedEnv) } // initTSNet initializes the tsnet.Server and logs in to Tailscale. It uses the @@ -182,7 +183,7 @@ waitOnline: // startReconcilers starts the controller-runtime manager and registers the // ServiceReconciler. -func startReconcilers(zlog *zap.SugaredLogger, s *tsnet.Server, tsNamespace string, restConfig *rest.Config, tsClient *tailscale.Client, image, priorityClassName, tags string) { +func startReconcilers(zlog *zap.SugaredLogger, s *tsnet.Server, tsNamespace string, restConfig *rest.Config, tsClient *tailscale.Client, image, priorityClassName, tags string, runInRestrictedEnv bool) { var ( isDefaultLoadBalancer = defaultBool("OPERATOR_DEFAULT_LOAD_BALANCER", false) ) @@ -231,6 +232,7 @@ func startReconcilers(zlog *zap.SugaredLogger, s *tsnet.Server, tsNamespace stri operatorNamespace: tsNamespace, proxyImage: image, proxyPriorityClassName: priorityClassName, + runInRestrictedEnv: runInRestrictedEnv, } err = builder. ControllerManagedBy(mgr). diff --git a/cmd/k8s-operator/sts.go b/cmd/k8s-operator/sts.go index 31aa6bad7..c5d95e1f2 100644 --- a/cmd/k8s-operator/sts.go +++ b/cmd/k8s-operator/sts.go @@ -78,6 +78,7 @@ type tailscaleSTSReconciler struct { operatorNamespace string proxyImage string proxyPriorityClassName string + runInRestrictedEnv bool } // IsHTTPSEnabledOnTailnet reports whether HTTPS is enabled on the tailnet. @@ -352,6 +353,10 @@ func (a *tailscaleSTSReconciler) reconcileSTS(ctx context.Context, logger *zap.S }, }) } + if a.runInRestrictedEnv { + // in run in restricted env remove the sysctl init container + ss.Spec.Template.Spec.InitContainers = nil + } ss.ObjectMeta = metav1.ObjectMeta{ Name: headlessSvc.Name, Namespace: a.operatorNamespace,