diff --git a/.drone.jsonnet b/.drone.jsonnet index a67792332..810203a9e 100644 --- a/.drone.jsonnet +++ b/.drone.jsonnet @@ -450,7 +450,13 @@ local integration_cilium_strict = Step('e2e-cilium-strict', target='e2e-qemu', p WITH_CONFIG_PATCH: '[{"op": "add", "path": "/cluster/network", "value": {"cni": {"name": "none"}}}, {"op": "add", "path": "/cluster/proxy", "value": {"disabled": true}}]', IMAGE_REGISTRY: local_registry, }); -local integration_canal_reset = Step('e2e-canal-reset', target='e2e-qemu', privileged=true, depends_on=[load_artifacts], environment={ + +local integration_network_chaos = Step('e2e-network-chaos', target='e2e-qemu', privileged=true, depends_on=[load_artifacts], environment={ + SHORT_INTEGRATION_TEST: 'yes', + WITH_NETWORK_CHAOS: 'true', + REGISTRY: local_registry, +}); +local integration_canal_reset = Step('e2e-canal-reset', target='e2e-qemu', privileged=true, depends_on=[integration_network_chaos], environment={ INTEGRATION_TEST_RUN: 'TestIntegration/api.ResetSuite/TestResetWithSpec', CUSTOM_CNI_URL: 'https://raw.githubusercontent.com/projectcalico/calico/v3.25.0/manifests/canal.yaml', REGISTRY: local_registry, @@ -548,6 +554,7 @@ local integration_pipelines = [ Pipeline('integration-provision-1', default_pipeline_steps + [integration_provision_tests_prepare, integration_provision_tests_track_1]) + integration_trigger(['integration-provision', 'integration-provision-1']), Pipeline('integration-provision-2', default_pipeline_steps + [integration_provision_tests_prepare, integration_provision_tests_track_2]) + integration_trigger(['integration-provision', 'integration-provision-2']), Pipeline('integration-misc', default_pipeline_steps + [ + integration_network_chaos, integration_canal_reset, integration_bios_cgroupsv1, integration_disk_image, diff --git a/cmd/talosctl/cmd/mgmt/cluster/create.go b/cmd/talosctl/cmd/mgmt/cluster/create.go index f73a9fa9f..33811ebc4 100644 --- a/cmd/talosctl/cmd/mgmt/cluster/create.go +++ b/cmd/talosctl/cmd/mgmt/cluster/create.go @@ -154,6 +154,13 @@ var ( controlPlanePort int dhcpSkipHostname bool skipBootPhaseFinishedCheck bool + networkChaos bool + jitter time.Duration + latency time.Duration + packetLoss float64 + packetReorder float64 + packetCorrupt float64 + bandwidth int ) // createCmd represents the cluster up command. @@ -269,6 +276,13 @@ func create(ctx context.Context, flags *pflag.FlagSet) (err error) { } } + // Validate network chaos flags + if !networkChaos { + if jitter != 0 || latency != 0 || packetLoss != 0 || packetReorder != 0 || packetCorrupt != 0 || bandwidth != 0 { + return fmt.Errorf("network chaos flags can only be used with --with-network-chaos") + } + } + provisioner, err := providers.Factory(ctx, provisionerName) if err != nil { return err @@ -296,6 +310,13 @@ func create(ctx context.Context, flags *pflag.FlagSet) (err error) { }, DHCPSkipHostname: dhcpSkipHostname, DockerDisableIPv6: dockerDisableIPv6, + NetworkChaos: networkChaos, + Jitter: jitter, + Latency: latency, + PacketLoss: packetLoss, + PacketReorder: packetReorder, + PacketCorrupt: packetCorrupt, + Bandwidth: bandwidth, }, Image: nodeImage, @@ -945,6 +966,15 @@ func init() { createCmd.Flags().IntVar(&controlPlanePort, controlPlanePortFlag, constants.DefaultControlPlanePort, "control plane port (load balancer and local API port)") createCmd.Flags().BoolVar(&dhcpSkipHostname, "disable-dhcp-hostname", false, "skip announcing hostname via DHCP (QEMU only)") createCmd.Flags().BoolVar(&skipBootPhaseFinishedCheck, "skip-boot-phase-finished-check", false, "skip waiting for node to finish boot phase") + createCmd.Flags().BoolVar(&networkChaos, "with-network-chaos", false, "enable to use network chaos parameters when creating a qemu cluster") + createCmd.Flags().DurationVar(&jitter, "with-network-jitter", 0, "specify jitter on the bridge interface when creating a qemu cluster") + createCmd.Flags().DurationVar(&latency, "with-network-latency", 0, "specify latency on the bridge interface when creating a qemu cluster") + createCmd.Flags().Float64Var(&packetLoss, "with-network-packet-loss", 0.0, "specify percent of packet loss on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)") + createCmd.Flags().Float64Var(&packetReorder, "with-network-packet-reorder", 0.0, + "specify percent of reordered packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)") + createCmd.Flags().Float64Var(&packetCorrupt, "with-network-packet-corrupt", 0.0, + "specify percent of corrupt packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0)") + createCmd.Flags().IntVar(&bandwidth, "with-network-bandwidth", 0, "specify bandwidth restriction (in kbps) on the bridge interface when creating a qemu cluster") Cmd.AddCommand(createCmd) } diff --git a/hack/test/e2e-qemu.sh b/hack/test/e2e-qemu.sh index 34eb62320..589c78228 100755 --- a/hack/test/e2e-qemu.sh +++ b/hack/test/e2e-qemu.sh @@ -80,6 +80,14 @@ case "${DISABLE_DHCP_HOSTNAME:-false}" in ;; esac +case "${WITH_NETWORK_CHAOS:-false}" in + false) + ;; + *) + QEMU_FLAGS="${QEMU_FLAGS} --with-network-chaos --with-network-packet-loss=0.01 --with-network-latency=15ms --with-network-jitter=5ms" + ;; +esac + case "${USE_DISK_IMAGE:-false}" in false) DISK_IMAGE_FLAG= diff --git a/pkg/cluster/bootstrap.go b/pkg/cluster/bootstrap.go index 765e6f901..9c53d4a1f 100644 --- a/pkg/cluster/bootstrap.go +++ b/pkg/cluster/bootstrap.go @@ -54,8 +54,8 @@ func (s *APIBootstrapper) Bootstrap(ctx context.Context, out io.Writer) error { fmt.Fprintln(out, "waiting for API") - err = retry.Constant(5*time.Minute, retry.WithUnits(500*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error { - retryCtx, cancel := context.WithTimeout(nodeCtx, 500*time.Millisecond) + err = retry.Constant(10*time.Minute, retry.WithUnits(500*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error { + retryCtx, cancel := context.WithTimeout(nodeCtx, 2*time.Second) defer cancel() if _, err = cli.Version(retryCtx); err != nil { @@ -72,7 +72,7 @@ func (s *APIBootstrapper) Bootstrap(ctx context.Context, out io.Writer) error { fmt.Fprintln(out, "bootstrapping cluster") return retry.Constant(backoff.DefaultConfig.MaxDelay, retry.WithUnits(100*time.Millisecond)).RetryWithContext(nodeCtx, func(nodeCtx context.Context) error { - retryCtx, cancel := context.WithTimeout(nodeCtx, 500*time.Millisecond) + retryCtx, cancel := context.WithTimeout(nodeCtx, 2*time.Second) defer cancel() if err = cli.Bootstrap(retryCtx, &machineapi.BootstrapRequest{}); err != nil { diff --git a/pkg/provision/providers/qemu/create.go b/pkg/provision/providers/qemu/create.go index 97d49c84e..c016b2edc 100644 --- a/pkg/provision/providers/qemu/create.go +++ b/pkg/provision/providers/qemu/create.go @@ -49,7 +49,7 @@ func (p *provisioner) Create(ctx context.Context, request provision.ClusterReque fmt.Fprintln(options.LogWriter, "creating network", request.Network.Name) - if err = p.CreateNetwork(ctx, state, request.Network); err != nil { + if err = p.CreateNetwork(ctx, state, request.Network, options); err != nil { return nil, fmt.Errorf("unable to provision CNI network: %w", err) } diff --git a/pkg/provision/providers/vm/network.go b/pkg/provision/providers/vm/network.go index 1015b420c..d358fb435 100644 --- a/pkg/provision/providers/vm/network.go +++ b/pkg/provision/providers/vm/network.go @@ -22,6 +22,7 @@ import ( "github.com/jsimonetti/rtnetlink" "github.com/siderolabs/gen/slices" sideronet "github.com/siderolabs/net" + "github.com/vishvananda/netlink" "github.com/siderolabs/talos/pkg/provision" ) @@ -30,8 +31,8 @@ import ( // so that interface name is defined by network name, and different networks have // different bridge interfaces. // -//nolint:gocyclo -func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network provision.NetworkRequest) error { +//nolint:gocyclo,cyclop +func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network provision.NetworkRequest, options provision.Options) error { networkNameHash := sha256.Sum256([]byte(network.Name)) state.BridgeName = fmt.Sprintf("%s%s", "talos", hex.EncodeToString(networkNameHash[:])[:8]) @@ -129,6 +130,82 @@ func (p *Provisioner) CreateNetwork(ctx context.Context, state *State, network p return fmt.Errorf("error parsing VM CNI config: %w", err) } + // configure bridge interface with network chaos if flag is set + if network.NetworkChaos { + if err = p.configureNetworkChaos(network, state, options); err != nil { + return err + } + } + + return nil +} + +//nolint:gocyclo +func (p *Provisioner) configureNetworkChaos(network provision.NetworkRequest, state *State, options provision.Options) error { + if (network.Bandwidth != 0) && (network.Latency != 0 || network.Jitter != 0 || network.PacketLoss != 0 || network.PacketReorder != 0 || network.PacketCorrupt != 0) { + return fmt.Errorf("bandwidth and other chaos options cannot be used together") + } + + link, err := netlink.LinkByName(state.BridgeName) + if err != nil { + return fmt.Errorf("could not get link: %v", err) + } + + fmt.Fprintln(options.LogWriter, "network chaos enabled on interface:", state.BridgeName) + + if network.Bandwidth != 0 { + fmt.Fprintf(options.LogWriter, " bandwidth: %4d kbps\n", network.Bandwidth) + + rate := network.Bandwidth * 1000 / 8 + + buffer := rate / 10 + + limit := buffer * 5 + + qdisc := &netlink.Tbf{ + QdiscAttrs: netlink.QdiscAttrs{ + LinkIndex: link.Attrs().Index, + Handle: netlink.MakeHandle(1, 0), + Parent: netlink.HANDLE_ROOT, + }, + Rate: uint64(rate), + Buffer: uint32(buffer), + Limit: uint32(limit), + } + + if err := netlink.QdiscAdd(qdisc); err != nil { + return fmt.Errorf("could not add netem qdisc: %v", err) + } + } else { + packetLoss := network.PacketLoss * 100 + packetReorder := network.PacketReorder * 100 + packetCorrupt := network.PacketCorrupt * 100 + + fmt.Fprintf(options.LogWriter, " jitter: %4dms\n", network.Jitter.Milliseconds()) + fmt.Fprintf(options.LogWriter, " latency: %4dms\n", network.Latency.Milliseconds()) + fmt.Fprintf(options.LogWriter, " packet loss: %4v%%\n", packetLoss) + fmt.Fprintf(options.LogWriter, " packet reordering: %4v%%\n", packetReorder) + fmt.Fprintf(options.LogWriter, " packet corruption: %4v%%\n", packetCorrupt) + + qdisc := netlink.NewNetem( + netlink.QdiscAttrs{ + LinkIndex: link.Attrs().Index, + Handle: netlink.MakeHandle(1, 0), + Parent: netlink.HANDLE_ROOT, + }, + netlink.NetemQdiscAttrs{ + Jitter: uint32(network.Jitter / 1000), + Latency: uint32(network.Latency / 1000), + Loss: float32(packetLoss), + ReorderProb: float32(packetReorder), + CorruptProb: float32(packetCorrupt), + }, + ) + if err := netlink.QdiscAdd(qdisc); err != nil { + return fmt.Errorf("could not add netem qdisc: %v", err) + } + } + return nil } diff --git a/pkg/provision/request.go b/pkg/provision/request.go index 259738093..ce7561a53 100644 --- a/pkg/provision/request.go +++ b/pkg/provision/request.go @@ -7,6 +7,7 @@ package provision import ( "fmt" "net/netip" + "time" "github.com/siderolabs/go-procfs/procfs" @@ -62,6 +63,15 @@ type NetworkRequest struct { // Docker-specific parameters. DockerDisableIPv6 bool + + // Network chaos parameters. + NetworkChaos bool + Jitter time.Duration + Latency time.Duration + PacketLoss float64 + PacketReorder float64 + PacketCorrupt float64 + Bandwidth int } // NodeRequests is a list of NodeRequest. diff --git a/website/content/v1.5/reference/cli.md b/website/content/v1.5/reference/cli.md index 257ef17ac..3a5bef03c 100644 --- a/website/content/v1.5/reference/cli.md +++ b/website/content/v1.5/reference/cli.md @@ -156,6 +156,13 @@ talosctl cluster create [flags] --with-debug enable debug in Talos config to send service logs to the console --with-init-node create the cluster with an init node --with-kubespan enable KubeSpan system + --with-network-bandwidth int specify bandwidth restriction (in kbps) on the bridge interface when creating a qemu cluster + --with-network-chaos enable to use network chaos parameters when creating a qemu cluster + --with-network-jitter duration specify jitter on the bridge interface when creating a qemu cluster + --with-network-latency duration specify latency on the bridge interface when creating a qemu cluster + --with-network-packet-corrupt float specify percent of corrupt packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0) + --with-network-packet-loss float specify percent of packet loss on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0) + --with-network-packet-reorder float specify percent of reordered packets on the bridge interface when creating a qemu cluster. e.g. 50% = 0.50 (default: 0.0) --with-secureboot enforce secure boot --with-tpm2 enable TPM2 emulation support using swtpm --with-uefi enable UEFI on x86_64 architecture (default true)