From 1c61130fdfe5b456fbb96ee93e19f2fa6e18184c Mon Sep 17 00:00:00 2001
From: iwilltry42
Date: Thu, 28 Jan 2021 20:44:23 +0100
Subject: [PATCH] clusterStart: sequential and ordered node starts &
 nodeCreate: do not copy status

---
 cmd/node/nodeCreate.go |   2 +-
 pkg/client/cluster.go  | 149 ++++++++++++++++++++++-------------
 pkg/client/node.go     |  24 +++++--
 pkg/types/types.go     |   7 +-
 4 files changed, 104 insertions(+), 78 deletions(-)

diff --git a/cmd/node/nodeCreate.go b/cmd/node/nodeCreate.go
index 53fa9348..bf96277f 100644
--- a/cmd/node/nodeCreate.go
+++ b/cmd/node/nodeCreate.go
@@ -50,7 +50,7 @@ func NewCmdNodeCreate() *cobra.Command {
             nodes, cluster := parseCreateNodeCmd(cmd, args)
             if err := k3dc.NodeAddToClusterMulti(cmd.Context(), runtimes.SelectedRuntime, nodes, cluster, createNodeOpts); err != nil {
                 log.Errorf("Failed to add nodes to cluster '%s'", cluster.Name)
-                log.Errorln(err)
+                log.Fatalln(err)
             }
         },
     }
diff --git a/pkg/client/cluster.go b/pkg/client/cluster.go
index d8172674..2e2d6374 100644
--- a/pkg/client/cluster.go
+++ b/pkg/client/cluster.go
@@ -802,97 +802,106 @@ func generateNodeName(cluster string, role k3d.Role, suffix int) string {
 func ClusterStart(ctx context.Context, runtime k3drt.Runtime, cluster *k3d.Cluster, startClusterOpts types.ClusterStartOpts) error {
     log.Infof("Starting cluster '%s'", cluster.Name)
 
-    start := time.Now()
-
     if startClusterOpts.Timeout > 0*time.Second {
         var cancel context.CancelFunc
         ctx, cancel = context.WithTimeout(ctx, startClusterOpts.Timeout)
         defer cancel()
     }
 
+    // sort the nodes into categories
+    var initNode *k3d.Node
+    var servers []*k3d.Node
+    var agents []*k3d.Node
+    var aux []*k3d.Node
+    for _, n := range cluster.Nodes {
+        if n.Role == k3d.ServerRole {
+            if n.ServerOpts.IsInit {
+                initNode = n
+                continue
+            }
+            servers = append(servers, n)
+        } else if n.Role == k3d.AgentRole {
+            agents = append(agents, n)
+        } else {
+            aux = append(aux, n)
+        }
+    }
+
+    log.Infoln("Servers before sort:")
+    for i, n := range servers {
+        log.Infof("Server %d - %s", i, n.Name)
+    }
+    sort.Slice(servers, func(i, j int) bool {
+        return servers[i].Name < servers[j].Name
+    })
+    log.Infoln("Servers after sort:")
+    for i, n := range servers {
+        log.Infof("Server %d - %s", i, n.Name)
+    }
+
     /*
      * Init Node
      */
-    for _, n := range cluster.Nodes {
-        if n.Role == k3d.ServerRole && n.ServerOpts.IsInit {
-            if err := NodeStart(ctx, runtime, n, k3d.NodeStartOpts{
-                Wait:      true, // always wait for the init node
-                NodeHooks: startClusterOpts.NodeHooks,
-            }); err != nil {
-                return fmt.Errorf("Failed to start initializing server node: %+v", err)
-            }
-            break
+    if initNode != nil {
+        log.Infoln("Starting the initializing server...")
+        if err := NodeStart(ctx, runtime, initNode, k3d.NodeStartOpts{
+            Wait:            true, // always wait for the init node
+            NodeHooks:       startClusterOpts.NodeHooks,
+            ReadyLogMessage: "Running kube-apiserver", // initNode means, that we're using etcd -> this will need quorum, so "k3s is up and running" won't happen right now
+        }); err != nil {
+            return fmt.Errorf("Failed to start initializing server node: %+v", err)
         }
     }
 
     /*
-     * Other Nodes
+     * Server Nodes
      */
-    failed := 0
-    var serverlb *k3d.Node
-    for _, node := range cluster.Nodes {
-
-        // skip the LB, because we want to start it last
-        if node.Role == k3d.LoadBalancerRole {
-            serverlb = node
-            continue
-        }
-
-        // skip init node here, as it should be running already
-        if node == cluster.InitNode || node.ServerOpts.IsInit {
-            continue
-        }
-
-        // check if node is running already to avoid waiting forever when checking for the node log message
-        if !node.State.Running {
-
-            nodeStartOpts := k3d.NodeStartOpts{
-                NodeHooks: startClusterOpts.NodeHooks,
-            }
-
-            if node.Role == k3d.ServerRole && startClusterOpts.WaitForServer {
-                nodeStartOpts.Wait = true
-            }
-
-            // start node
-            if err := NodeStart(ctx, runtime, node, nodeStartOpts); err != nil {
-                log.Warningf("Failed to start node '%s': Try to start it manually", node.Name)
-                failed++
-                continue
-            }
-
-        } else {
-            log.Infof("Node '%s' already running", node.Name)
+    log.Infoln("Starting servers...")
+    nodeStartOpts := k3d.NodeStartOpts{
+        Wait:      true,
+        NodeHooks: startClusterOpts.NodeHooks,
+    }
+    for _, serverNode := range servers {
+        if err := NodeStart(ctx, runtime, serverNode, nodeStartOpts); err != nil {
+            return fmt.Errorf("Failed to start server %s: %+v", serverNode.Name, err)
         }
     }
 
-    // start serverlb
-    if serverlb != nil {
-        if !serverlb.State.Running {
-            log.Debugln("Starting serverlb...")
-            if err := runtime.StartNode(ctx, serverlb); err != nil { // FIXME: we could run into a nullpointer exception here
-                log.Warningf("Failed to start serverlb '%s' (try to start it manually): %+v", serverlb.Name, err)
-                failed++
-            }
-            // TODO: avoid `level=fatal msg="starting kubernetes: preparing server: post join: a configuration change is already in progress (5)"`
-            // ... by scanning for this line in logs and restarting the container in case it appears
-            log.Debugf("Starting to wait for loadbalancer node '%s'", serverlb.Name)
-            readyLogMessage := k3d.ReadyLogMessageByRole[k3d.LoadBalancerRole]
-            if readyLogMessage != "" {
-                if err := NodeWaitForLogMessage(ctx, runtime, serverlb, readyLogMessage, start); err != nil {
-                    return fmt.Errorf("Loadbalancer '%s' failed to get ready: %+v", serverlb.Name, err)
-                }
-            } else {
-                log.Warnf("ClusterStart: Set to wait for node %s to be ready, but there's no target log message defined", serverlb.Name)
-            }
-        } else {
-            log.Infof("Serverlb '%s' already running", serverlb.Name)
+    /*
+     * Agent Nodes
+     */
+
+    failedAgents := 0
+
+    log.Infoln("Starting agents...")
+    for _, agentNode := range agents {
+        if err := NodeStart(ctx, runtime, agentNode, nodeStartOpts); err != nil {
+            log.Warnf("Failed to start agent %s: %+v", agentNode.Name, err)
+            failedAgents++
         }
     }
 
-    if failed > 0 {
-        return fmt.Errorf("Failed to start %d nodes: Try to start them manually", failed)
+    /*
+     * Auxiliary/Helper Nodes
+     */
+
+    log.Infoln("Starting helpers...")
+    failedHelpers := 0
+    for _, helperNode := range aux {
+        nodeStartOpts := k3d.NodeStartOpts{}
+        if helperNode.Role == k3d.LoadBalancerRole {
+            nodeStartOpts.Wait = true
+        }
+        if err := NodeStart(ctx, runtime, helperNode, nodeStartOpts); err != nil {
+            log.Warnf("Failed to start helper %s: %+v", helperNode.Name, err)
+            failedHelpers++
+        }
     }
+
+    if failedAgents+failedHelpers > 0 {
+        log.Warnf("%d non-critical (agent or helper) nodes failed to start. You may want to start them manually.", failedAgents+failedHelpers)
+    }
+
     return nil
 }
diff --git a/pkg/client/node.go b/pkg/client/node.go
index bf420390..2a2f732e 100644
--- a/pkg/client/node.go
+++ b/pkg/client/node.go
@@ -129,6 +129,10 @@ func NodeAddToCluster(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node
         }
     }
 
+    // clear status fields
+    node.State.Running = false
+    node.State.Status = ""
+
     if err := NodeRun(ctx, runtime, node, k3d.NodeCreateOpts{}); err != nil {
         return err
     }
@@ -233,6 +237,14 @@ func NodeRun(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, nodeCreateOpts k3d.NodeCreateOpts) error {
 
 // NodeStart starts an existing node
 func NodeStart(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, nodeStartOpts k3d.NodeStartOpts) error {
+
+    // return early, if the node is already running
+    if node.State.Running {
+        log.Infof("Node %s is already running", node.Name)
+        return nil
+    }
+
+    // execute lifecycle hook actions
     for _, hook := range nodeStartOpts.NodeHooks {
         if hook.Stage == k3d.LifecycleStagePreStart {
             log.Tracef("Node %s: Executing preStartAction '%s'", node.Name, reflect.TypeOf(hook))
@@ -241,6 +253,8 @@ func NodeStart(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, nodeStartOpts k3d.NodeStartOpts) error {
             }
         }
     }
+
+    // start the node
     log.Tracef("Starting node '%s'", node.Name)
 
     startTime := time.Now()
@@ -250,10 +264,12 @@ func NodeStart(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, nodeStartOpts k3d.NodeStartOpts) error {
     }
 
     if nodeStartOpts.Wait {
-        log.Debugf("Waiting for node %s to get ready", node.Name)
-        readyLogMessage := k3d.ReadyLogMessageByRole[node.Role]
-        if readyLogMessage != "" {
-            if err := NodeWaitForLogMessage(ctx, runtime, node, readyLogMessage, startTime); err != nil {
+        if nodeStartOpts.ReadyLogMessage == "" {
+            nodeStartOpts.ReadyLogMessage = k3d.ReadyLogMessageByRole[node.Role]
+        }
+        if nodeStartOpts.ReadyLogMessage != "" {
+            log.Debugf("Waiting for node %s to get ready (Log: '%s')", node.Name, nodeStartOpts.ReadyLogMessage)
+            if err := NodeWaitForLogMessage(ctx, runtime, node, nodeStartOpts.ReadyLogMessage, startTime); err != nil {
                 return fmt.Errorf("Node %s failed to get ready: %+v", node.Name, err)
             }
         } else {
diff --git a/pkg/types/types.go b/pkg/types/types.go
index 34c7b490..936f16a2 100644
--- a/pkg/types/types.go
+++ b/pkg/types/types.go
@@ -224,9 +224,10 @@ type NodeCreateOpts struct {
 
 // NodeStartOpts describes a set of options one can set when (re-)starting a node
 type NodeStartOpts struct {
-    Wait      bool
-    Timeout   time.Duration
-    NodeHooks []NodeHook `yaml:"nodeHooks,omitempty" json:"nodeHooks,omitempty"`
+    Wait            bool
+    Timeout         time.Duration
+    NodeHooks       []NodeHook `yaml:"nodeHooks,omitempty" json:"nodeHooks,omitempty"`
+    ReadyLogMessage string
 }
 
 // NodeDeleteOpts describes a set of options one can set when deleting a node
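
Note (not part of the patch): the start ordering that ClusterStart now enforces is init server -> remaining servers (sorted by name) -> agents -> helpers. Below is a self-contained sketch of that categorize-and-sort step; the Node and Role types are simplified stand-ins for the k3d types, and the node names are made up for illustration.

    package main

    import (
        "fmt"
        "sort"
    )

    // Simplified stand-ins for the k3d types (the real k3d.Node carries more fields).
    type Role string

    const (
        ServerRole       Role = "server"
        AgentRole        Role = "agent"
        LoadBalancerRole Role = "loadbalancer"
    )

    type Node struct {
        Name   string
        Role   Role
        IsInit bool
    }

    // orderNodes mirrors the categorization in ClusterStart: the init server is
    // singled out, the remaining servers are sorted by name, and agents and
    // auxiliary/helper nodes (e.g. the serverlb) go into their own buckets.
    func orderNodes(nodes []*Node) (initNode *Node, servers, agents, aux []*Node) {
        for _, n := range nodes {
            switch {
            case n.Role == ServerRole && n.IsInit:
                initNode = n
            case n.Role == ServerRole:
                servers = append(servers, n)
            case n.Role == AgentRole:
                agents = append(agents, n)
            default:
                aux = append(aux, n)
            }
        }
        // lexicographic sort keeps restarts deterministic (server-0 before server-1)
        sort.Slice(servers, func(i, j int) bool { return servers[i].Name < servers[j].Name })
        return initNode, servers, agents, aux
    }

    func main() {
        nodes := []*Node{
            {Name: "k3d-demo-serverlb", Role: LoadBalancerRole},
            {Name: "k3d-demo-agent-0", Role: AgentRole},
            {Name: "k3d-demo-server-1", Role: ServerRole},
            {Name: "k3d-demo-server-0", Role: ServerRole, IsInit: true},
        }
        initNode, servers, agents, aux := orderNodes(nodes)
        fmt.Println("1. init server:", initNode.Name)
        for _, n := range servers {
            fmt.Println("2. server:", n.Name)
        }
        for _, n := range agents {
            fmt.Println("3. agent:", n.Name)
        }
        for _, n := range aux {
            fmt.Println("4. helper:", n.Name)
        }
    }

Sorting the servers by name makes the sequential start deterministic across restarts, which helps once the servers form an embedded etcd cluster that peers must rejoin in a sane order.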
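Note (not part of the patch): the new ReadyLogMessage field gives NodeStart "explicit per-call override wins, otherwise fall back to the role default from k3d.ReadyLogMessageByRole" semantics. A minimal sketch of that fallback; the map below is a stand-in, and only the server message ("k3s is up and running") is taken from the patch's inline comment.

    package main

    import "fmt"

    // Stand-in for k3d.ReadyLogMessageByRole; only the server entry is quoted
    // from the patch's inline comment, other roles are omitted here.
    var ReadyLogMessageByRole = map[string]string{
        "server": "k3s is up and running",
    }

    // resolveReadyLogMessage mirrors the new fallback in NodeStart: an explicit
    // per-call override wins, otherwise the role's default message is used.
    func resolveReadyLogMessage(override, role string) string {
        if override != "" {
            return override
        }
        return ReadyLogMessageByRole[role]
    }

    func main() {
        // Init server: overridden, since with embedded etcd there is no quorum
        // yet and the default message would not show up.
        fmt.Println(resolveReadyLogMessage("Running kube-apiserver", "server"))

        // Regular server: falls back to the role default.
        fmt.Println(resolveReadyLogMessage("", "server"))
    }

ClusterStart uses exactly this override for the init server, passing "Running kube-apiserver", because the default server message only appears once etcd has quorum.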