Andrey Smirnov 505b5022c4
feat(init): implement graceful shutdown of 'init' (#562)
Most crucial changes in `init/main.go`: on shutdown now Talos tries
to stop gracefully all the services. All the shutdown paths are unified,
including poweroff, reboot and panic handling on startup.

While I was at it, I also fixed a bug with containers failing to start
when an old snapshot is still around.

Service lifecycle is wrapped with `ServiceRunner` object now which
handles state transitions and captures events related to state changes.
Every change goes to the log as well.

There's no way to capture service state yet, but that is planned to be
implemented as RPC API for `init` which is exposed via `osd` to `osctl`.

Future steps:

1. Implement service dependencies for correct startup order and
shutdown order.

2. Implement service health, so that we can say "start trustd when
containerd is up and healthy".

3. Implement gRPC API for init, expose via osd (service status, restart,
poweroff, ...)

4. Implement 'String()' for conditions, so that we can see what a service
is waiting on right now.

Signed-off-by: Andrey Smirnov <smirnov.andrey@gmail.com>
2019-04-26 16:53:19 +03:00

125 lines
3.1 KiB
Go

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
package system
import (
"sync"
"time"
"github.com/talos-systems/talos/internal/app/init/pkg/system/conditions"
"github.com/talos-systems/talos/internal/app/init/pkg/system/runner"
"github.com/talos-systems/talos/pkg/userdata"
)
// singleton is the registry of system services: it keeps one ServiceRunner
// per service ID plus the synchronization state used by Start and Shutdown.
type singleton struct {
// UserData is handed to Service.ID and NewServiceRunner when services are
// started.
UserData *userdata.UserData
// State of running services by ID
State map[string]*ServiceRunner
// mu is held by Start and Shutdown while they read or modify State and
// terminating.
mu sync.Mutex
// wg counts the goroutines launched by Start; Shutdown waits on it.
wg sync.WaitGroup
// terminating is set by Shutdown; once it is set, Start launches nothing
// and a repeated Shutdown returns immediately.
terminating bool
}
// instance is the shared singleton handed out by Services, and once makes
// sure it is constructed a single time.
var (
	instance *singleton
	once     sync.Once
)
// Service is an interface describing a system service. Every method receives
// the user data the services API was created with.
type Service interface {
// ID is the service id. Start uses it as the key under which the service's
// runner is stored, so a second service with the same id is not started.
ID(*userdata.UserData) string
// PreFunc is invoked before a runner is created
PreFunc(*userdata.UserData) error
// Runner creates runner for the service
Runner(*userdata.UserData) (runner.Runner, error)
// PostFunc is invoked after a runner is closed.
PostFunc(*userdata.UserData) error
// ConditionFunc describes the conditions under which a service should
// start.
ConditionFunc(*userdata.UserData) conditions.ConditionFunc
}
// Services returns the shared instance of the system services API, building
// it on the first call. Only the data supplied on that first call is stored;
// subsequent calls return the existing instance and ignore their argument.
// TODO(andrewrynhard): This should be a gRPC based API available on a local
// unix socket.
// nolint: golint
func Services(data *userdata.UserData) *singleton {
	initialize := func() {
		svcs := new(singleton)
		svcs.UserData = data
		svcs.State = map[string]*ServiceRunner{}
		instance = svcs
	}
	once.Do(initialize)

	return instance
}
// Start registers and launches every given service whose ID is not already
// present in State. For each new service a ServiceRunner is created, stored
// under the service ID, and started on its own goroutine tracked by the
// wait group so that Shutdown can wait for it.
//
// Start will invoke the service's Pre, Condition, and Type funcs. If any
// error occurs in the Pre or Condition invocations, it is up to the caller
// to restart the service.
//
// Once Shutdown has begun (terminating is set) Start is a no-op.
func (s *singleton) Start(services ...Service) {
	s.mu.Lock()
	// The unlock must be deferred before the terminating check: otherwise the
	// early return below would leave the mutex locked forever and deadlock
	// every later Start or Shutdown call.
	defer s.mu.Unlock()

	if s.terminating {
		return
	}

	for _, service := range services {
		id := service.ID(s.UserData)

		if _, exists := s.State[id]; exists {
			// service already started?
			// TODO: it might be nice to handle case when service
			// should be restarted (e.g. kubeadm after reset)
			continue
		}

		svcrunner := NewServiceRunner(service, s.UserData)
		s.State[id] = svcrunner

		// Add before launching the goroutine so Shutdown's Wait cannot miss it.
		s.wg.Add(1)

		go func(svcrunner *ServiceRunner) {
			defer s.wg.Done()

			svcrunner.Start()
		}(svcrunner)
	}
}
// ShutdownHackySleep is how long Shutdown pauses between its first pass over
// the service runners (which skips "containerd" and "udevd") and its second
// pass over all of them. It is a variable to allow tests to override it.
//
// TODO: part of the shutdown-ordering hack in Shutdown
var ShutdownHackySleep = 10 * time.Second
// Shutdown stops all the services and blocks until every goroutine launched
// by Start has returned. Only the first call does the work; a call made after
// termination has begun returns immediately.
func (s *singleton) Shutdown() {
	s.mu.Lock()

	if alreadyStopping := s.terminating; alreadyStopping {
		s.mu.Unlock()
		return
	}

	s.terminating = true

	// TODO: this is a hack, we stop all service runners but containerd/udevd first.
	// This is required for correct shutdown until service dependencies
	// are implemented properly.
	for id, sr := range s.State {
		switch id {
		case "containerd", "udevd":
			// left for the second pass below
		default:
			sr.Shutdown()
		}
	}

	// TODO: 2nd part of a hack above
	// sleep a bit to let containers actually terminate before stopping containerd
	time.Sleep(ShutdownHackySleep)

	// Second pass covers every runner, including the ones already asked to
	// shut down in the first pass.
	for id := range s.State {
		s.State[id].Shutdown()
	}

	// Release the lock before waiting on the runner goroutines.
	s.mu.Unlock()
	s.wg.Wait()
}