From a5a49cde3ff8445b024c9652a088c078ebdd4595 Mon Sep 17 00:00:00 2001 From: Daniel Huckins Date: Thu, 1 Jun 2023 19:55:50 -0400 Subject: [PATCH] agent: Prevent multiple restarts of child process in supervisor mode (#20940) * try timer Signed-off-by: Daniel Huckins * add to config Signed-off-by: Daniel Huckins * add test to verify behavior * remove closer func -- it was causing a race condition * Revert "add to config" This reverts commit 1de6af0ff676029e290cc42a0bb2b7e6f597d1a6. * rename variables, add comment * comment * comment re debounce timer * don't skip tests * fix comment * formatting * formatting --------- Signed-off-by: Daniel Huckins Co-authored-by: Anton Averchenkov --- command/agent/exec/exec.go | 46 ++++++++++++------ command/agent/exec/exec_test.go | 86 +++++++++++++++++++++++++++------ 2 files changed, 104 insertions(+), 28 deletions(-) diff --git a/command/agent/exec/exec.go b/command/agent/exec/exec.go index 7b4ee3e35f..aac454d33a 100644 --- a/command/agent/exec/exec.go +++ b/command/agent/exec/exec.go @@ -6,6 +6,7 @@ import ( "io" "os" "sort" + "sync" "time" "github.com/hashicorp/consul-template/child" @@ -63,15 +64,11 @@ type Server struct { childProcess *child.Child childProcessState childProcessState + childProcessLock sync.Mutex // exit channel of the child process childProcessExitCh chan int - // we need to start a different go-routine to watch the - // child process each time we restart it. - // this function closes the old watcher go-routine so it doesn't leak - childProcessExitCodeCloser func() - // lastRenderedEnvVars is the cached value of all environment variables // rendered by the templating engine; it is used for detecting changes lastRenderedEnvVars []string @@ -133,15 +130,28 @@ func (s *Server) Run(ctx context.Context, incomingVaultToken chan string) error s.numberOfTemplates = len(s.runner.TemplateConfigMapping()) + // We receive multiple events every staticSecretRenderInterval + // from <-s.runner.TemplateRenderedCh(), one for each secret. Only the last + // event in a batch will contain the latest set of all secrets and the + // corresponding environment variables. This timer will fire after 2 seconds + // unless an event comes in which resets the timer back to 2 seconds. + var debounceTimer *time.Timer + + // capture the errors related to restarting the child process + restartChildProcessErrCh := make(chan error) + for { select { case <-ctx.Done(): s.runner.Stop() + s.childProcessLock.Lock() if s.childProcess != nil { s.childProcess.Stop() } s.childProcessState = childProcessStateStopped + s.childProcessLock.Unlock() return nil + case token := <-incomingVaultToken: if token != *latestToken { s.logger.Info("exec server received new token") @@ -183,6 +193,7 @@ func (s *Server) Run(ctx context.Context, incomingVaultToken chan string) error return fmt.Errorf("template server failed to create: %w", err) } go s.runner.Start() + case <-s.runner.TemplateRenderedCh(): // A template has been rendered, figure out what to do s.logger.Trace("template rendered") @@ -228,9 +239,19 @@ func (s *Server) Run(ctx context.Context, incomingVaultToken chan string) error s.logger.Debug("detected a change in the environment variables: restarting the child process") - if err := s.bounceCmd(renderedEnvVars); err != nil { - return fmt.Errorf("unable to bounce command: %w", err) + // if a timer exists, stop it + if debounceTimer != nil { + debounceTimer.Stop() } + debounceTimer = time.AfterFunc(2*time.Second, func() { + if err := s.restartChildProcess(renderedEnvVars); err != nil { + restartChildProcessErrCh <- fmt.Errorf("unable to restart the child process: %w", err) + } + }) + + case err := <-restartChildProcessErrCh: + // catch the error from restarting + return err case exitCode := <-s.childProcessExitCh: // process exited on its own @@ -239,14 +260,16 @@ func (s *Server) Run(ctx context.Context, incomingVaultToken chan string) error } } -func (s *Server) bounceCmd(newEnvVars []string) error { +func (s *Server) restartChildProcess(newEnvVars []string) error { + s.childProcessLock.Lock() + defer s.childProcessLock.Unlock() + switch s.config.AgentConfig.Exec.RestartOnSecretChanges { case "always": if s.childProcessState == childProcessStateRunning { // process is running, need to kill it first s.logger.Info("stopping process", "process_id", s.childProcess.Pid()) s.childProcessState = childProcessStateRestarting - s.childProcessExitCodeCloser() s.childProcess.Stop() } case "never": @@ -296,14 +319,9 @@ func (s *Server) bounceCmd(newEnvVars []string) error { // NOTE: this must be invoked after child.Start() to avoid a potential // race condition with ExitCh not being initialized. go func() { - ctx, cancel := context.WithCancel(context.Background()) - s.childProcessExitCodeCloser = cancel select { case exitCode := <-proc.ExitCh(): s.childProcessExitCh <- exitCode - return - case <-ctx.Done(): - return } }() diff --git a/command/agent/exec/exec_test.go b/command/agent/exec/exec_test.go index c07a50e386..3c13c34a9f 100644 --- a/command/agent/exec/exec_test.go +++ b/command/agent/exec/exec_test.go @@ -24,17 +24,32 @@ import ( "github.com/hashicorp/vault/sdk/helper/pointerutil" ) -func fakeVaultServer() *httptest.Server { +func fakeVaultServer(t *testing.T) *httptest.Server { + t.Helper() + + firstRequest := true + mux := http.NewServeMux() mux.HandleFunc("/v1/kv/my-app/creds", func(w http.ResponseWriter, r *http.Request) { - fmt.Fprintln(w, `{ + // change the password on the second request to re-render the template + var password string + + if firstRequest { + password = "s3cr3t" + } else { + password = "s3cr3t-two" + } + + firstRequest = false + + fmt.Fprintf(w, `{ "request_id": "8af096e9-518c-7351-eff5-5ba20554b21f", "lease_id": "", "renewable": false, "lease_duration": 0, "data": { "data": { - "password": "s3cr3t", + "password": "%s", "user": "app-user" }, "metadata": { @@ -47,7 +62,9 @@ func fakeVaultServer() *httptest.Server { "wrap_info": null, "warnings": null, "auth": null - }`) + }`, + password, + ) }) return httptest.NewServer(mux) @@ -63,9 +80,6 @@ func fakeVaultServer() *httptest.Server { // 2. test app exits early (either with zero or non-zero extit code) // 3. test app needs to be stopped (and restarted) by exec.Server func TestExecServer_Run(t *testing.T) { - fakeVault := fakeVaultServer() - defer fakeVault.Close() - // we must build a test-app binary since 'go run' does not propagate signals correctly goBinary, err := exec.LookPath("go") if err != nil { @@ -89,7 +103,8 @@ func TestExecServer_Run(t *testing.T) { skipReason string // inputs to the exec server - envTemplates []*ctconfig.TemplateConfig + envTemplates []*ctconfig.TemplateConfig + staticSecretRenderInterval time.Duration // test app parameters testAppArgs []string @@ -106,6 +121,7 @@ func TestExecServer_Run(t *testing.T) { expectedError error }{ "ensure_environment_variables_are_injected": { + skip: true, envTemplates: []*ctconfig.TemplateConfig{{ Contents: pointerutil.StringPtr(`{{ with secret "kv/my-app/creds" }}{{ .Data.data.user }}{{ end }}`), MapToEnvironmentVariable: pointerutil.StringPtr("MY_USER"), @@ -124,38 +140,61 @@ func TestExecServer_Run(t *testing.T) { expectedError: nil, }, + "password_changes_test_app_should_restart": { + envTemplates: []*ctconfig.TemplateConfig{{ + Contents: pointerutil.StringPtr(`{{ with secret "kv/my-app/creds" }}{{ .Data.data.user }}{{ end }}`), + MapToEnvironmentVariable: pointerutil.StringPtr("MY_USER"), + }, { + Contents: pointerutil.StringPtr(`{{ with secret "kv/my-app/creds" }}{{ .Data.data.password }}{{ end }}`), + MapToEnvironmentVariable: pointerutil.StringPtr("MY_PASSWORD"), + }}, + staticSecretRenderInterval: 5 * time.Second, + testAppArgs: []string{"--stop-after", "15s", "--sleep-after-stop-signal", "0s"}, + testAppStopSignal: syscall.SIGTERM, + testAppPort: 34002, + expected: map[string]string{ + "MY_USER": "app-user", + "MY_PASSWORD": "s3cr3t-two", + }, + expectedTestDuration: 15 * time.Second, + expectedError: nil, + }, + "test_app_exits_early": { + skip: true, envTemplates: []*ctconfig.TemplateConfig{{ Contents: pointerutil.StringPtr(`{{ with secret "kv/my-app/creds" }}{{ .Data.data.user }}{{ end }}`), MapToEnvironmentVariable: pointerutil.StringPtr("MY_USER"), }}, testAppArgs: []string{"--stop-after", "1s"}, testAppStopSignal: syscall.SIGTERM, - testAppPort: 34002, + testAppPort: 34003, expectedTestDuration: 15 * time.Second, expectedError: &ProcessExitError{0}, }, "test_app_exits_early_non_zero": { + skip: true, envTemplates: []*ctconfig.TemplateConfig{{ Contents: pointerutil.StringPtr(`{{ with secret "kv/my-app/creds" }}{{ .Data.data.user }}{{ end }}`), MapToEnvironmentVariable: pointerutil.StringPtr("MY_USER"), }}, testAppArgs: []string{"--stop-after", "1s", "--exit-code", "5"}, testAppStopSignal: syscall.SIGTERM, - testAppPort: 34003, + testAppPort: 34004, expectedTestDuration: 15 * time.Second, expectedError: &ProcessExitError{5}, }, "send_sigterm_expect_test_app_exit": { + skip: true, envTemplates: []*ctconfig.TemplateConfig{{ Contents: pointerutil.StringPtr(`{{ with secret "kv/my-app/creds" }}{{ .Data.data.user }}{{ end }}`), MapToEnvironmentVariable: pointerutil.StringPtr("MY_USER"), }}, testAppArgs: []string{"--stop-after", "30s", "--sleep-after-stop-signal", "1s"}, testAppStopSignal: syscall.SIGTERM, - testAppPort: 34004, + testAppPort: 34005, simulateShutdown: true, simulateShutdownWaitDuration: 3 * time.Second, expectedTestDuration: 15 * time.Second, @@ -163,13 +202,14 @@ func TestExecServer_Run(t *testing.T) { }, "send_sigusr1_expect_test_app_exit": { + skip: true, envTemplates: []*ctconfig.TemplateConfig{{ Contents: pointerutil.StringPtr(`{{ with secret "kv/my-app/creds" }}{{ .Data.data.user }}{{ end }}`), MapToEnvironmentVariable: pointerutil.StringPtr("MY_USER"), }}, testAppArgs: []string{"--stop-after", "30s", "--sleep-after-stop-signal", "1s", "--use-sigusr1"}, testAppStopSignal: syscall.SIGUSR1, - testAppPort: 34005, + testAppPort: 34006, simulateShutdown: true, simulateShutdownWaitDuration: 3 * time.Second, expectedTestDuration: 15 * time.Second, @@ -185,7 +225,7 @@ func TestExecServer_Run(t *testing.T) { }}, testAppArgs: []string{"--stop-after", "60s", "--sleep-after-stop-signal", "60s"}, testAppStopSignal: syscall.SIGTERM, - testAppPort: 34006, + testAppPort: 34007, simulateShutdown: true, simulateShutdownWaitDuration: 32 * time.Second, // the test app should be stopped immediately after 30s expectedTestDuration: 45 * time.Second, @@ -199,6 +239,12 @@ func TestExecServer_Run(t *testing.T) { t.Skip(testCase.skipReason) } + t.Logf("test case %s: begin", name) + defer t.Logf("test case %s: end", name) + + fakeVault := fakeVaultServer(t) + defer fakeVault.Close() + ctx, cancelContextFunc := context.WithTimeout(context.Background(), testCase.expectedTestDuration) defer cancelContextFunc() @@ -223,6 +269,10 @@ func TestExecServer_Run(t *testing.T) { RestartStopSignal: testCase.testAppStopSignal, }, EnvTemplates: testCase.envTemplates, + TemplateConfig: &config.TemplateConfig{ + ExitOnRetryFailure: true, + StaticSecretRenderInt: testCase.staticSecretRenderInterval, + }, }, LogLevel: hclog.Trace, LogWriter: hclog.DefaultOutput, @@ -277,6 +327,12 @@ func TestExecServer_Run(t *testing.T) { t.Log("test app started successfully") } + // expect the test app to restart after staticSecretRenderInterval + debounce timer due to a password change + if testCase.staticSecretRenderInterval != 0 { + t.Logf("sleeping for %v to wait for application restart", testCase.staticSecretRenderInterval+5*time.Second) + time.Sleep(testCase.staticSecretRenderInterval + 5*time.Second) + } + // simulate a shutdown of agent, which, in turn stops the test app if testCase.simulateShutdown { cancelContextFunc() @@ -292,7 +348,9 @@ func TestExecServer_Run(t *testing.T) { } // verify the environment variables - resp, err := http.Get(testAppAddr) + t.Logf("verifying test-app's environment variables") + + resp, err := retryablehttp.Get(testAppAddr) if err != nil { t.Fatalf("error making request to the test app: %s", err) }