mirror of
				https://github.com/siderolabs/talos.git
				synced 2025-10-31 00:11:36 +01:00 
			
		
		
		
	fix: stop etcd on any path on upgrade
The problem was that etcd was only being stopped in `LeaveEtcd`, so an upgrade with preserve never stopped etcd, leaving the ephemeral partition busy. Refactored the code that stops a service and shuts down all services to provide the interface we need: * stop a service without considering reverse dependencies (force); * stop a service (or services), waiting for reverse dependencies; * shut down all the services, waiting for reverse dependencies. Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
This commit is contained in:
		
							parent
							
								
									a7a27e7edd
								
							
						
					
					
						commit
						e7f6344d97
					
				| @ -667,13 +667,7 @@ func StartAllServices(seq runtime.Sequence, data interface{}) (runtime.TaskExecu | |||||||
| // StopServicesForUpgrade represents the StopServicesForUpgrade task. | // StopServicesForUpgrade represents the StopServicesForUpgrade task. | ||||||
| func StopServicesForUpgrade(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) { | func StopServicesForUpgrade(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) { | ||||||
| 	return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) (err error) { | 	return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) (err error) { | ||||||
| 		for _, service := range []string{"kubelet", "cri", "udevd"} { | 		return system.Services(nil).StopWithRevDepenencies(ctx, "cri", "etcd", "kubelet", "udevd") | ||||||
| 			if err = system.Services(nil).Stop(ctx, service); err != nil { |  | ||||||
| 				return err |  | ||||||
| 			} |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		return nil |  | ||||||
| 	}, "stopServicesForUpgrade" | 	}, "stopServicesForUpgrade" | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -1186,7 +1180,7 @@ func LeaveEtcd(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFun | |||||||
| // RemoveAllPods represents the task for stopping all pods. | // RemoveAllPods represents the task for stopping all pods. | ||||||
| func RemoveAllPods(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) { | func RemoveAllPods(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) { | ||||||
| 	return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) (err error) { | 	return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) (err error) { | ||||||
| 		if err = system.Services(nil).Stop(context.Background(), "kubelet"); err != nil { | 		if err = system.Services(nil).Stop(ctx, "kubelet"); err != nil { | ||||||
| 			return err | 			return err | ||||||
| 		} | 		} | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -7,6 +7,7 @@ package system | |||||||
| import ( | import ( | ||||||
| 	"context" | 	"context" | ||||||
| 	"fmt" | 	"fmt" | ||||||
|  | 	"log" | ||||||
| 	"sort" | 	"sort" | ||||||
| 	"sync" | 	"sync" | ||||||
| 	"time" | 	"time" | ||||||
| @ -202,20 +203,71 @@ func (s *singleton) Shutdown() { | |||||||
| 		return | 		return | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	stateCopy := make(map[string]*ServiceRunner) |  | ||||||
| 	s.terminating = true | 	s.terminating = true | ||||||
| 
 | 
 | ||||||
| 	for name, svcrunner := range s.state { | 	_ = s.stopServices(context.Background(), nil, true) //nolint: errcheck | ||||||
| 		stateCopy[name] = svcrunner | } | ||||||
|  | 
 | ||||||
|  | // Stop will initiate a shutdown of the specified service. | ||||||
|  | func (s *singleton) Stop(ctx context.Context, serviceIDs ...string) (err error) { | ||||||
|  | 	if len(serviceIDs) == 0 { | ||||||
|  | 		return | ||||||
| 	} | 	} | ||||||
|  | 
 | ||||||
|  | 	s.mu.Lock() | ||||||
|  | 	if s.terminating { | ||||||
|  | 		s.mu.Unlock() | ||||||
|  | 		return nil | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return s.stopServices(ctx, serviceIDs, false) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // StopWithRevDepenencies will initiate a shutdown of the specified services waiting for reverse dependencies to finish first. | ||||||
|  | // | ||||||
|  | // If a reverse dependency is never stopped, this method might block forever waiting for it to stop. | ||||||
|  | func (s *singleton) StopWithRevDepenencies(ctx context.Context, serviceIDs ...string) (err error) { | ||||||
|  | 	if len(serviceIDs) == 0 { | ||||||
|  | 		return | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	s.mu.Lock() | ||||||
|  | 	if s.terminating { | ||||||
|  | 		s.mu.Unlock() | ||||||
|  | 		return nil | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return s.stopServices(ctx, serviceIDs, true) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | //nolint: gocyclo | ||||||
|  | func (s *singleton) stopServices(ctx context.Context, services []string, waitForRevDependencies bool) error { | ||||||
|  | 	stateCopy := make(map[string]*ServiceRunner) | ||||||
|  | 
 | ||||||
|  | 	if services == nil { | ||||||
|  | 		for name, svcrunner := range s.state { | ||||||
|  | 			stateCopy[name] = svcrunner | ||||||
|  | 		} | ||||||
|  | 	} else { | ||||||
|  | 		for _, name := range services { | ||||||
|  | 			if _, ok := s.state[name]; !ok { | ||||||
|  | 				continue | ||||||
|  | 			} | ||||||
|  | 
 | ||||||
|  | 			stateCopy[name] = s.state[name] | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	s.mu.Unlock() | 	s.mu.Unlock() | ||||||
| 
 | 
 | ||||||
| 	// build reverse dependencies | 	// build reverse dependencies | ||||||
| 	reverseDependencies := make(map[string][]string) | 	reverseDependencies := make(map[string][]string) | ||||||
| 
 | 
 | ||||||
| 	for name, svcrunner := range stateCopy { | 	if waitForRevDependencies { | ||||||
| 		for _, dependency := range svcrunner.service.DependsOn(s.runtime) { | 		for name, svcrunner := range stateCopy { | ||||||
| 			reverseDependencies[dependency] = append(reverseDependencies[dependency], name) | 			for _, dependency := range svcrunner.service.DependsOn(s.runtime) { | ||||||
|  | 				reverseDependencies[dependency] = append(reverseDependencies[dependency], name) | ||||||
|  | 			} | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| @ -223,12 +275,16 @@ func (s *singleton) Shutdown() { | |||||||
| 	var shutdownWg sync.WaitGroup | 	var shutdownWg sync.WaitGroup | ||||||
| 
 | 
 | ||||||
| 	// wait max 30 seconds for reverse deps to shut down | 	// wait max 30 seconds for reverse deps to shut down | ||||||
| 	shutdownCtx, shutdownCtxCancel := context.WithTimeout(context.Background(), 30*time.Second) | 	shutdownCtx, shutdownCtxCancel := context.WithTimeout(ctx, 30*time.Second) | ||||||
| 	defer shutdownCtxCancel() | 	defer shutdownCtxCancel() | ||||||
| 
 | 
 | ||||||
|  | 	stoppedConds := []conditions.Condition{} | ||||||
|  | 
 | ||||||
| 	for name, svcrunner := range stateCopy { | 	for name, svcrunner := range stateCopy { | ||||||
| 		shutdownWg.Add(1) | 		shutdownWg.Add(1) | ||||||
| 
 | 
 | ||||||
|  | 		stoppedConds = append(stoppedConds, WaitForService(StateEventDown, name)) | ||||||
|  | 
 | ||||||
| 		go func(svcrunner *ServiceRunner, reverseDeps []string) { | 		go func(svcrunner *ServiceRunner, reverseDeps []string) { | ||||||
| 			defer shutdownWg.Done() | 			defer shutdownWg.Done() | ||||||
| 
 | 
 | ||||||
| @ -238,8 +294,10 @@ func (s *singleton) Shutdown() { | |||||||
| 				conds[i] = WaitForService(StateEventDown, reverseDeps[i]) | 				conds[i] = WaitForService(StateEventDown, reverseDeps[i]) | ||||||
| 			} | 			} | ||||||
| 
 | 
 | ||||||
| 			// nolint: errcheck | 			allDeps := conditions.WaitForAll(conds...) | ||||||
| 			_ = conditions.WaitForAll(conds...).Wait(shutdownCtx) | 			if err := allDeps.Wait(shutdownCtx); err != nil { | ||||||
|  | 				log.Printf("gave up on %s while stopping %q", allDeps, svcrunner.id) | ||||||
|  | 			} | ||||||
| 
 | 
 | ||||||
| 			svcrunner.Shutdown() | 			svcrunner.Shutdown() | ||||||
| 		}(svcrunner, reverseDependencies[name]) | 		}(svcrunner, reverseDependencies[name]) | ||||||
| @ -247,7 +305,7 @@ func (s *singleton) Shutdown() { | |||||||
| 
 | 
 | ||||||
| 	shutdownWg.Wait() | 	shutdownWg.Wait() | ||||||
| 
 | 
 | ||||||
| 	s.wg.Wait() | 	return conditions.WaitForAll(stoppedConds...).Wait(ctx) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // List returns snapshot of ServiceRunner instances. | // List returns snapshot of ServiceRunner instances. | ||||||
| @ -267,44 +325,6 @@ func (s *singleton) List() (result []*ServiceRunner) { | |||||||
| 	return | 	return | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // Stop will initiate a shutdown of the specified service. |  | ||||||
| func (s *singleton) Stop(ctx context.Context, serviceIDs ...string) (err error) { |  | ||||||
| 	if len(serviceIDs) == 0 { |  | ||||||
| 		return |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	s.mu.Lock() |  | ||||||
| 	if s.terminating { |  | ||||||
| 		s.mu.Unlock() |  | ||||||
| 		return |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	// Copy current service state |  | ||||||
| 	stateCopy := make(map[string]*ServiceRunner) |  | ||||||
| 
 |  | ||||||
| 	for _, id := range serviceIDs { |  | ||||||
| 		if _, ok := s.state[id]; !ok { |  | ||||||
| 			return fmt.Errorf("service not found: %s", id) |  | ||||||
| 		} |  | ||||||
| 
 |  | ||||||
| 		stateCopy[id] = s.state[id] |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	s.mu.Unlock() |  | ||||||
| 
 |  | ||||||
| 	conds := make([]conditions.Condition, 0, len(stateCopy)) |  | ||||||
| 
 |  | ||||||
| 	// Initiate a shutdown on the specific service |  | ||||||
| 	for id, svcrunner := range stateCopy { |  | ||||||
| 		svcrunner.Shutdown() |  | ||||||
| 
 |  | ||||||
| 		conds = append(conds, WaitForService(StateEventDown, id)) |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	// Wait for service to actually shut down |  | ||||||
| 	return conditions.WaitForAll(conds...).Wait(ctx) |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| // IsRunning checks service status (started/stopped). | // IsRunning checks service status (started/stopped). | ||||||
| // | // | ||||||
| // It doesn't check if service runner was started or not, just pure | // It doesn't check if service runner was started or not, just pure | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user