clusterStart: sequential and ordered node starts & nodeCreate: do not copy status

iwilltry42 2021-01-28 20:44:23 +01:00
parent b2162b1618
commit 1c61130fdf
No known key found for this signature in database
GPG Key ID: 7BA57AD1CFF16110
4 changed files with 104 additions and 78 deletions

View File

@@ -50,7 +50,7 @@ func NewCmdNodeCreate() *cobra.Command {
 			nodes, cluster := parseCreateNodeCmd(cmd, args)
 			if err := k3dc.NodeAddToClusterMulti(cmd.Context(), runtimes.SelectedRuntime, nodes, cluster, createNodeOpts); err != nil {
 				log.Errorf("Failed to add nodes to cluster '%s'", cluster.Name)
-				log.Errorln(err)
+				log.Fatalln(err)
 			}
 		},
 	}
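The only change here swaps `log.Errorln` for `log.Fatalln`, so a failed node-add now terminates the CLI with a non-zero exit code instead of logging the error and finishing successfully. A minimal sketch of the difference using the standard library logger (k3d itself uses logrus, whose `Fatalln` behaves the same way: log, then exit with code 1):

```go
package main

import (
	"errors"
	"log"
)

func main() {
	err := errors.New("failed to add nodes to cluster")

	// Errorln-style logging: the message is written, but execution
	// continues and the process can still finish with exit code 0.
	log.Println(err)

	// Fatalln logs the message and then calls os.Exit(1), so scripts
	// and CI pipelines invoking the CLI see a failing exit code.
	log.Fatalln(err)
}
```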

View File

@@ -802,97 +802,106 @@ func generateNodeName(cluster string, role k3d.Role, suffix int) string {
 func ClusterStart(ctx context.Context, runtime k3drt.Runtime, cluster *k3d.Cluster, startClusterOpts types.ClusterStartOpts) error {
 	log.Infof("Starting cluster '%s'", cluster.Name)
 
-	start := time.Now()
-
 	if startClusterOpts.Timeout > 0*time.Second {
 		var cancel context.CancelFunc
 		ctx, cancel = context.WithTimeout(ctx, startClusterOpts.Timeout)
 		defer cancel()
 	}
 
+	// sort the nodes into categories
+	var initNode *k3d.Node
+	var servers []*k3d.Node
+	var agents []*k3d.Node
+	var aux []*k3d.Node
+	for _, n := range cluster.Nodes {
+		if n.Role == k3d.ServerRole {
+			if n.ServerOpts.IsInit {
+				initNode = n
+				continue
+			}
+			servers = append(servers, n)
+		} else if n.Role == k3d.AgentRole {
+			agents = append(agents, n)
+		} else {
+			aux = append(aux, n)
+		}
+	}
+
+	log.Infoln("Servers before sort:")
+	for i, n := range servers {
+		log.Infof("Server %d - %s", i, n.Name)
+	}
+	sort.Slice(servers, func(i, j int) bool {
+		return servers[i].Name < servers[j].Name
+	})
+	log.Infoln("Servers after sort:")
+	for i, n := range servers {
+		log.Infof("Server %d - %s", i, n.Name)
+	}
+
 	/*
 	 * Init Node
 	 */
-	for _, n := range cluster.Nodes {
-		if n.Role == k3d.ServerRole && n.ServerOpts.IsInit {
-			if err := NodeStart(ctx, runtime, n, k3d.NodeStartOpts{
-				Wait:      true, // always wait for the init node
-				NodeHooks: startClusterOpts.NodeHooks,
-			}); err != nil {
-				return fmt.Errorf("Failed to start initializing server node: %+v", err)
-			}
-			break
-		}
-	}
+	if initNode != nil {
+		log.Infoln("Starting the initializing server...")
+		if err := NodeStart(ctx, runtime, initNode, k3d.NodeStartOpts{
+			Wait:            true, // always wait for the init node
+			NodeHooks:       startClusterOpts.NodeHooks,
+			ReadyLogMessage: "Running kube-apiserver", // initNode means, that we're using etcd -> this will need quorum, so "k3s is up and running" won't happen right now
+		}); err != nil {
+			return fmt.Errorf("Failed to start initializing server node: %+v", err)
+		}
+	}
 
 	/*
-	 * Other Nodes
+	 * Server Nodes
 	 */
-	failed := 0
-	var serverlb *k3d.Node
-	for _, node := range cluster.Nodes {
-
-		// skip the LB, because we want to start it last
-		if node.Role == k3d.LoadBalancerRole {
-			serverlb = node
-			continue
-		}
-
-		// skip init node here, as it should be running already
-		if node == cluster.InitNode || node.ServerOpts.IsInit {
-			continue
-		}
-
-		// check if node is running already to avoid waiting forever when checking for the node log message
-		if !node.State.Running {
-			nodeStartOpts := k3d.NodeStartOpts{
-				NodeHooks: startClusterOpts.NodeHooks,
-			}
-			if node.Role == k3d.ServerRole && startClusterOpts.WaitForServer {
-				nodeStartOpts.Wait = true
-			}
-
-			// start node
-			if err := NodeStart(ctx, runtime, node, nodeStartOpts); err != nil {
-				log.Warningf("Failed to start node '%s': Try to start it manually", node.Name)
-				failed++
-				continue
-			}
-		} else {
-			log.Infof("Node '%s' already running", node.Name)
-		}
-	}
-
-	// start serverlb
-	if serverlb != nil {
-		if !serverlb.State.Running {
-			log.Debugln("Starting serverlb...")
-			if err := runtime.StartNode(ctx, serverlb); err != nil { // FIXME: we could run into a nullpointer exception here
-				log.Warningf("Failed to start serverlb '%s' (try to start it manually): %+v", serverlb.Name, err)
-				failed++
-			}
-			// TODO: avoid `level=fatal msg="starting kubernetes: preparing server: post join: a configuration change is already in progress (5)"`
-			// ... by scanning for this line in logs and restarting the container in case it appears
-			log.Debugf("Starting to wait for loadbalancer node '%s'", serverlb.Name)
-			readyLogMessage := k3d.ReadyLogMessageByRole[k3d.LoadBalancerRole]
-			if readyLogMessage != "" {
-				if err := NodeWaitForLogMessage(ctx, runtime, serverlb, readyLogMessage, start); err != nil {
-					return fmt.Errorf("Loadbalancer '%s' failed to get ready: %+v", serverlb.Name, err)
-				}
-			} else {
-				log.Warnf("ClusterStart: Set to wait for node %s to be ready, but there's no target log message defined", serverlb.Name)
-			}
-		} else {
-			log.Infof("Serverlb '%s' already running", serverlb.Name)
-		}
-	}
-
-	if failed > 0 {
-		return fmt.Errorf("Failed to start %d nodes: Try to start them manually", failed)
-	}
+	log.Infoln("Starting servers...")
+	nodeStartOpts := k3d.NodeStartOpts{
+		Wait:      true,
+		NodeHooks: startClusterOpts.NodeHooks,
+	}
+	for _, serverNode := range servers {
+		if err := NodeStart(ctx, runtime, serverNode, nodeStartOpts); err != nil {
+			return fmt.Errorf("Failed to start server %s: %+v", serverNode.Name, err)
+		}
+	}
+
+	/*
+	 * Agent Nodes
+	 */
+	failedAgents := 0
+	log.Infoln("Starting agents...")
+	for _, agentNode := range agents {
+		if err := NodeStart(ctx, runtime, agentNode, nodeStartOpts); err != nil {
+			log.Warnf("Failed to start agent %s: %+v", agentNode.Name, err)
+			failedAgents++
+		}
+	}
+
+	/*
+	 * Auxiliary/Helper Nodes
+	 */
+	log.Infoln("Starting helpers...")
+	failedHelpers := 0
+	for _, helperNode := range aux {
+		nodeStartOpts := k3d.NodeStartOpts{}
+		if helperNode.Role == k3d.LoadBalancerRole {
+			nodeStartOpts.Wait = true
+		}
+		if err := NodeStart(ctx, runtime, helperNode, nodeStartOpts); err != nil {
+			log.Warnf("Failed to start helper %s: %+v", helperNode.Name, err)
+			failedHelpers++
+		}
+	}
+
+	if failedAgents+failedHelpers > 0 {
+		log.Warnf("%d non-critical (agent or helper) nodes failed to start. You may want to start them manually.", failedAgents+failedHelpers)
+	}
 
 	return nil
 }
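Taken together, the rewritten ClusterStart replaces the single pass over cluster.Nodes with a fixed, sequential order: the init server first (waiting for the apiserver), then the remaining servers sorted by name, then agents, then auxiliary nodes such as the load balancer. Server failures abort the start; agent and helper failures only produce warnings. A condensed, self-contained sketch of that control flow, with simplified stand-in types (`Node` and `startInOrder` here are illustrative, not k3d's API):

```go
package main

import (
	"fmt"
	"sort"
)

// Simplified stand-in for k3d's node type, just to illustrate the ordering.
type Node struct {
	Name   string
	Role   string // "server", "agent", "loadbalancer", ...
	IsInit bool
}

func startInOrder(nodes []*Node, start func(*Node) error) error {
	var initNode *Node
	var servers, agents, aux []*Node

	// sort the nodes into categories, like the new ClusterStart does
	for _, n := range nodes {
		switch {
		case n.Role == "server" && n.IsInit:
			initNode = n
		case n.Role == "server":
			servers = append(servers, n)
		case n.Role == "agent":
			agents = append(agents, n)
		default:
			aux = append(aux, n)
		}
	}

	// deterministic server order (e.g. server-0 before server-1)
	sort.Slice(servers, func(i, j int) bool { return servers[i].Name < servers[j].Name })

	// init node and servers are critical: abort on the first failure
	if initNode != nil {
		if err := start(initNode); err != nil {
			return fmt.Errorf("init server: %w", err)
		}
	}
	for _, s := range servers {
		if err := start(s); err != nil {
			return fmt.Errorf("server %s: %w", s.Name, err)
		}
	}

	// agents and helpers are non-critical: warn and continue
	failed := 0
	for _, n := range append(agents, aux...) {
		if err := start(n); err != nil {
			fmt.Printf("WARN: %s failed to start: %v\n", n.Name, err)
			failed++
		}
	}
	if failed > 0 {
		fmt.Printf("WARN: %d non-critical nodes failed to start\n", failed)
	}
	return nil
}

func main() {
	nodes := []*Node{
		{Name: "k3d-demo-serverlb", Role: "loadbalancer"},
		{Name: "k3d-demo-server-1", Role: "server"},
		{Name: "k3d-demo-server-0", Role: "server", IsInit: true},
		{Name: "k3d-demo-agent-0", Role: "agent"},
	}
	_ = startInOrder(nodes, func(n *Node) error {
		fmt.Println("starting", n.Name)
		return nil
	})
}
```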

View File

@@ -129,6 +129,10 @@ func NodeAddToCluster(ctx context.Context, runtime runtimes.Runtime, node *k3d.N
 		}
 	}
 
+	// clear status fields
+	node.State.Running = false
+	node.State.Status = ""
+
 	if err := NodeRun(ctx, runtime, node, k3d.NodeCreateOpts{}); err != nil {
 		return err
 	}
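This guard is the "do not copy status" half of the commit: a node spec derived from an existing cluster member also carries that member's runtime state, and with NodeStart's new early-return (below) a copied `Running == true` would silently skip starting the new container. A minimal sketch of the failure mode, using simplified stand-in types rather than k3d's:

```go
package main

import "fmt"

// Simplified stand-ins for k3d's node/state types.
type NodeState struct {
	Running bool
	Status  string
}

type Node struct {
	Name  string
	State NodeState
}

// specFromExisting copies a node spec the way `k3d node create` derives a
// new node from an existing cluster member. The copy also carries the
// source's runtime state, so it must be cleared: otherwise NodeStart's new
// early-return would treat the not-yet-created node as already running.
func specFromExisting(src *Node, name string) *Node {
	clone := *src // copies State.Running = true from the running source node
	clone.Name = name

	// clear status fields (mirrors the lines added to NodeAddToCluster)
	clone.State.Running = false
	clone.State.Status = ""
	return &clone
}

func main() {
	running := &Node{Name: "k3d-demo-server-0", State: NodeState{Running: true, Status: "running"}}
	fresh := specFromExisting(running, "k3d-demo-agent-1")
	fmt.Printf("%s running=%v status=%q\n", fresh.Name, fresh.State.Running, fresh.State.Status)
}
```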
@@ -233,6 +237,14 @@ func NodeRun(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, node
 
 // NodeStart starts an existing node
 func NodeStart(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, nodeStartOpts k3d.NodeStartOpts) error {
+
+	// return early, if the node is already running
+	if node.State.Running {
+		log.Infof("Node %s is already running", node.Name)
+		return nil
+	}
+
+	// execute lifecycle hook actions
 	for _, hook := range nodeStartOpts.NodeHooks {
 		if hook.Stage == k3d.LifecycleStagePreStart {
 			log.Tracef("Node %s: Executing preStartAction '%s'", node.Name, reflect.TypeOf(hook))
@@ -241,6 +253,8 @@ func NodeStart(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, nodeStartOpts k3d.NodeStartOpts) error {
 			}
 		}
 	}
 
+	// start the node
 	log.Tracef("Starting node '%s'", node.Name)
 
 	startTime := time.Now()
@@ -250,10 +264,12 @@ func NodeStart(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, nodeStartOpts k3d.NodeStartOpts) error {
 	}
 
 	if nodeStartOpts.Wait {
-		log.Debugf("Waiting for node %s to get ready", node.Name)
-		readyLogMessage := k3d.ReadyLogMessageByRole[node.Role]
-		if readyLogMessage != "" {
-			if err := NodeWaitForLogMessage(ctx, runtime, node, readyLogMessage, startTime); err != nil {
+		if nodeStartOpts.ReadyLogMessage == "" {
+			nodeStartOpts.ReadyLogMessage = k3d.ReadyLogMessageByRole[node.Role]
+		}
+		if nodeStartOpts.ReadyLogMessage != "" {
+			log.Debugf("Waiting for node %s to get ready (Log: '%s')", node.Name, nodeStartOpts.ReadyLogMessage)
+			if err := NodeWaitForLogMessage(ctx, runtime, node, nodeStartOpts.ReadyLogMessage, startTime); err != nil {
 				return fmt.Errorf("Node %s failed to get ready: %+v", node.Name, err)
 			}
 		} else {
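The waiting logic now resolves the readiness message in two steps: an explicit `nodeStartOpts.ReadyLogMessage` wins, otherwise it falls back to the role default from `ReadyLogMessageByRole`. A small standalone sketch of that fallback; the names `roleDefaults` and `resolveReadyLogMessage` and the default values are illustrative, not k3d's exact map:

```go
package main

import "fmt"

// Illustrative role defaults, standing in for k3d.ReadyLogMessageByRole.
var roleDefaults = map[string]string{
	"server":       "k3s is up and running",
	"loadbalancer": "start worker processes",
}

func resolveReadyLogMessage(explicit, role string) string {
	if explicit != "" {
		return explicit // caller override, e.g. "Running kube-apiserver" for the init node
	}
	return roleDefaults[role] // may be empty -> nothing to wait for
}

func main() {
	// init node: etcd needs quorum, so "k3s is up and running" won't appear yet
	fmt.Println(resolveReadyLogMessage("Running kube-apiserver", "server"))
	// regular server: falls back to the role default
	fmt.Println(resolveReadyLogMessage("", "server"))
}
```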

View File

@@ -224,9 +224,10 @@ type NodeCreateOpts struct {
 
 // NodeStartOpts describes a set of options one can set when (re-)starting a node
 type NodeStartOpts struct {
-	Wait      bool
-	Timeout   time.Duration
-	NodeHooks []NodeHook `yaml:"nodeHooks,omitempty" json:"nodeHooks,omitempty"`
+	Wait            bool
+	Timeout         time.Duration
+	NodeHooks       []NodeHook `yaml:"nodeHooks,omitempty" json:"nodeHooks,omitempty"`
+	ReadyLogMessage string
 }
 
 // NodeDeleteOpts describes a set of options one can set when deleting a node
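With the new `ReadyLogMessage` field, a caller can override the role default per start. A usage fragment mirroring the ClusterStart call for the init node (`ctx`, `runtime`, `initNode`, and `startClusterOpts` are assumed to be in scope from the surrounding code):

```go
if err := NodeStart(ctx, runtime, initNode, k3d.NodeStartOpts{
	Wait:            true,                     // block until the ready message appears
	NodeHooks:       startClusterOpts.NodeHooks,
	ReadyLogMessage: "Running kube-apiserver", // override the server-role default
}); err != nil {
	return fmt.Errorf("Failed to start initializing server node: %+v", err)
}
```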