clear leftover fixmes on config options and goto replacement
parent 7cd08f247e
commit 6e8b1663ce
@@ -31,6 +31,7 @@ import (
     "io/ioutil"
     "os"
     "reflect"
+    "strconv"
     "strings"
     "time"
 
@@ -671,24 +672,19 @@ func NodeGet(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node) (*k3
 // NodeWaitForLogMessage follows the logs of a node container and returns if it finds a specific line in there (or timeout is reached)
 func NodeWaitForLogMessage(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, message string, since time.Time) error {
     l.Log().Tracef("NodeWaitForLogMessage: Node '%s' waiting for log message '%s' since '%+v'", node.Name, message, since)
 
-    maxrestarts := 10 // FIXME: replace hard-coded value with config/default value
-    restarted := 0
-restartWaiting:
-    found := false
-    // read the logs
-    out, err := runtime.GetNodeLogs(ctx, node, since, &runtimeTypes.NodeLogsOpts{Follow: true})
-    if out != nil {
-        defer out.Close()
-    }
-    if err != nil {
-        return fmt.Errorf("Failed waiting for log message '%s' from node '%s': %w", message, node.Name, err)
-    }
-    scanner := bufio.NewScanner(out)
-    var previousline string
+    // specify max number of retries if container is in crashloop (as defined by last seen message being a fatal log)
+    backOffLimit := k3d.DefaultNodeWaitForLogMessageCrashLoopBackOffLimit
+    if l, ok := os.LookupEnv(k3d.K3dEnvDebugNodeWaitBackOffLimit); ok {
+        limit, err := strconv.Atoi(l)
+        if err == nil {
+            backOffLimit = limit
+        }
+    }
 
+    // start a goroutine to print a warning continuously if a node is restarting for quite some time already
     donechan := make(chan struct{})
     defer close(donechan)
 
     go func(ctx context.Context, runtime runtimes.Runtime, node *k3d.Node, since time.Time, donechan chan struct{}) {
         for {
             select {
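The block added above replaces the hard-coded `maxrestarts := 10` with a package-level default that can be overridden through an environment variable. Below is a minimal, self-contained sketch of the same lookup-and-parse pattern; the helper name `resolveBackOffLimit` is illustrative and not part of k3d.

package main

import (
    "fmt"
    "os"
    "strconv"
)

// resolveBackOffLimit is a hypothetical helper mirroring the added block:
// start from a compile-time default and only override it when the given
// environment variable holds a valid integer.
func resolveBackOffLimit(envVar string, defaultLimit int) int {
    limit := defaultLimit
    if v, ok := os.LookupEnv(envVar); ok {
        if parsed, err := strconv.Atoi(v); err == nil {
            limit = parsed
        }
    }
    return limit
}

func main() {
    // With K3D_DEBUG_NODE_WAIT_BACKOFF_LIMIT unset (or set to something non-numeric),
    // the compile-time default wins.
    fmt.Println(resolveBackOffLimit("K3D_DEBUG_NODE_WAIT_BACKOFF_LIMIT", 10))
}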
@@ -708,6 +704,25 @@ restartWaiting:
 
     }(ctx, runtime, node, since, donechan)
 
+    // Start loop to check log stream for specified log message.
+    // We're looping here, as sometimes the containers run into a crash loop, but *may* recover from that
+    // e.g. when a new server is joining an existing cluster and has to wait for another member to finish learning.
+    // The logstream returned by docker ends everytime the container restarts, so we have to start from the beginning.
+    for i := 0; i < backOffLimit; i++ {
+
+        // get the log stream (reader is following the logstream)
+        out, err := runtime.GetNodeLogs(ctx, node, since, &runtimeTypes.NodeLogsOpts{Follow: true})
+        if out != nil {
+            defer out.Close()
+        }
+        if err != nil {
+            return fmt.Errorf("Failed waiting for log message '%s' from node '%s': %w", message, node.Name, err)
+        }
+
+        // We're scanning the logstream continuously line-by-line
+        scanner := bufio.NewScanner(out)
+        var previousline string
+
     for scanner.Scan() {
         select {
         case <-ctx.Done():
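The new bounded loop re-opens the log stream on each pass and scans it line by line, keeping the previously seen line around so that, once the stream ends, the code can tell whether it ended on a fatal entry. A self-contained sketch of that scanning step, using an in-memory string instead of the runtime's log stream; the function name and the sample log lines are illustrative only.

package main

import (
    "bufio"
    "fmt"
    "strings"
)

// scanForMessage is a hypothetical stand-in for the inner scanning step of
// NodeWaitForLogMessage: it walks a log stream line by line, returns true as
// soon as the target message shows up, and otherwise reports the last line it
// saw (which the caller can inspect for a fatal error).
func scanForMessage(logs string, message string) (found bool, lastLine string) {
    scanner := bufio.NewScanner(strings.NewReader(logs))
    for scanner.Scan() {
        if strings.Contains(scanner.Text(), message) {
            return true, scanner.Text()
        }
        lastLine = scanner.Text()
    }
    return false, lastLine
}

func main() {
    // Arbitrary example strings, not real k3s output.
    logs := "starting component...\nlevel=fatal msg=\"component exited\"\n"
    found, last := scanForMessage(logs, "component is ready")
    fmt.Println(found, last) // false level=fatal msg="component exited"
}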
@@ -728,25 +743,29 @@ restartWaiting:
         // check if we can find the specified line in the log
         if strings.Contains(scanner.Text(), message) {
             l.Log().Tracef("Found target message `%s` in log line `%s`", message, scanner.Text())
-            found = true
-            break
+            l.Log().Debugf("Finished waiting for log message '%s' from node '%s'", message, node.Name)
+            return nil
         }
 
         previousline = scanner.Text()
 
     }
-    if restarted < maxrestarts && strings.Contains(previousline, "level=fatal") { // FIXME: replace hard-coded
-        restarted++
-        l.Log().Warnf("warning: encountered fatal log from node %s (retrying %d/%d): %s", node.Name, restarted, maxrestarts, previousline)
+
+        out.Close() // no more input on scanner, but target log not yet found -> close current logreader (precautionary)
+
+        // we got here, because the logstream ended (no more input on scanner), so we check if maybe the container crashed
+        if strings.Contains(previousline, "level=fatal") {
+            // case 1: last log line we saw contained a fatal error, so probably it crashed and we want to retry on restart
+            l.Log().Warnf("warning: encountered fatal log from node %s (retrying %d/%d): %s", node.Name, i, backOffLimit, previousline)
             out.Close()
             time.Sleep(500 * time.Millisecond)
-        goto restartWaiting // FIXME: replace goto with loop
+            continue
+        } else {
+            // case 2: last log line we saw did not contain a fatal error, so we break the loop here and return a generic error
+            break
+        }
     }
-    if !found {
+
     return fmt.Errorf("error waiting for log line `%s` from node '%s': stopped returning log lines", message, node.Name)
-    }
-    l.Log().Debugf("Finished waiting for log message '%s' from node '%s'", message, node.Name)
-    return nil
 }
 
 // NodeFilterByRoles filters a list of nodes by their roles
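Taken together, the hunks above swap the old `restartWaiting:` label and `goto` for a bounded `for` loop: when the stream ends on a `level=fatal` line, the iteration sleeps briefly and `continue`s; any other ending `break`s out and falls through to the generic error. A minimal sketch of that control flow; `attemptOnce`, the simulated crash count, and the sleep duration are illustrative assumptions, not k3d code.

package main

import (
    "errors"
    "fmt"
    "time"
)

var errCrashed = errors.New("stream ended on a fatal log line")

// attemptOnce is a placeholder for one pass over the log stream; it stands in
// for GetNodeLogs plus the scanner loop from the diff above. Here it pretends
// the container crashes twice before recovering.
func attemptOnce(i int) error {
    if i < 2 {
        return errCrashed
    }
    return nil
}

// waitWithRetries shows the control-flow shape that replaces the old label/goto:
// a bounded for loop where a crash-like failure triggers `continue` and any
// other outcome breaks out.
func waitWithRetries(backOffLimit int) error {
    for i := 0; i < backOffLimit; i++ {
        err := attemptOnce(i)
        if err == nil {
            return nil // target message found
        }
        if errors.Is(err, errCrashed) {
            time.Sleep(10 * time.Millisecond) // brief pause before re-reading the restarted stream
            continue
        }
        break // non-crash failure: give up immediately
    }
    return errors.New("stopped returning log lines")
}

func main() {
    fmt.Println(waitWithRetries(10)) // <nil> after two simulated crashes
}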
@@ -89,3 +89,8 @@ const DefaultAPIHost = "0.0.0.0"
 func GetDefaultObjectName(name string) string {
     return fmt.Sprintf("%s-%s", DefaultObjectNamePrefix, name)
 }
+
+// DefaultNodeWaitForLogMessageCrashLoopBackOffLimit defines the maximum number of retries to find the target log message, if the
+// container is in a crash loop.
+// This makes sense e.g. when a new server is waiting to join an existing cluster and has to wait for other learners to finish.
+const DefaultNodeWaitForLogMessageCrashLoopBackOffLimit = 10
@@ -34,6 +34,7 @@ const (
     // Debug options
     K3dEnvDebugCorednsRetries = "K3D_DEBUG_COREDNS_RETRIES"
     K3dEnvDebugDisableDockerInit = "K3D_DEBUG_DISABLE_DOCKER_INIT"
+    K3dEnvDebugNodeWaitBackOffLimit = "K3D_DEBUG_NODE_WAIT_BACKOFF_LIMIT"
 
     // Fixes
     K3dEnvFixCgroupV2 = "K3D_FIX_CGROUPV2"
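With the new constant and environment variable in place, the effective limit stays at the default of 10 unless `K3D_DEBUG_NODE_WAIT_BACKOFF_LIMIT` parses as an integer; an invalid value is silently ignored. A small sketch of that fallback behaviour, setting the variable in-process purely for illustration (in real use it would be exported in the shell before running k3d).

package main

import (
    "fmt"
    "os"
    "strconv"
)

func main() {
    limit := 10 // mirrors DefaultNodeWaitForLogMessageCrashLoopBackOffLimit

    // Illustrative only: a value that does not parse as an integer.
    os.Setenv("K3D_DEBUG_NODE_WAIT_BACKOFF_LIMIT", "not-a-number")

    if v, ok := os.LookupEnv("K3D_DEBUG_NODE_WAIT_BACKOFF_LIMIT"); ok {
        if parsed, err := strconv.Atoi(v); err == nil {
            limit = parsed
        }
    }
    fmt.Println(limit) // still 10: invalid overrides fall back to the default
}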