From fc0fac8543457a71e7b8b9f663bcc6f76c34296f Mon Sep 17 00:00:00 2001 From: Nelson Isioma Date: Thu, 21 Aug 2025 10:40:06 +0100 Subject: [PATCH] Add passive health checks --- .../dynamic-configuration/docker-labels.yml | 2 + .../reference/dynamic-configuration/file.toml | 3 + .../reference/dynamic-configuration/file.yaml | 3 + .../kubernetes-crd-definition-v1.yml | 95 ++++++++++ .../reference/dynamic-configuration/kv-ref.md | 2 + .../traefik.io_ingressroutes.yaml | 19 ++ .../traefik.io_middlewares.yaml | 19 ++ .../traefik.io_traefikservices.yaml | 57 ++++++ .../http/load-balancing/service.md | 34 +++- .../fixtures/healthcheck/simple_passive.toml | 31 ++++ integration/fixtures/k8s/01-traefik-crd.yml | 95 ++++++++++ integration/healthcheck_test.go | 47 +++++ pkg/config/dynamic/http_config.go | 24 ++- pkg/config/dynamic/zz_generated.deepcopy.go | 21 +++ pkg/healthcheck/healthcheck.go | 162 ++++++++++++++++++ .../kubernetes/crd/kubernetes_http.go | 15 ++ .../crd/traefikio/v1alpha1/ingressroute.go | 9 + .../v1alpha1/zz_generated.deepcopy.go | 31 ++++ pkg/server/service/service.go | 18 +- pkg/server/service/service_test.go | 15 ++ 20 files changed, 696 insertions(+), 6 deletions(-) create mode 100644 integration/fixtures/healthcheck/simple_passive.toml diff --git a/docs/content/reference/dynamic-configuration/docker-labels.yml b/docs/content/reference/dynamic-configuration/docker-labels.yml index 3399c2762..42bbed1da 100644 --- a/docs/content/reference/dynamic-configuration/docker-labels.yml +++ b/docs/content/reference/dynamic-configuration/docker-labels.yml @@ -213,6 +213,8 @@ - "traefik.http.services.service02.loadbalancer.healthcheck.timeout=42s" - "traefik.http.services.service02.loadbalancer.healthcheck.unhealthyinterval=42s" - "traefik.http.services.service02.loadbalancer.passhostheader=true" +- "traefik.http.services.service02.loadbalancer.passivehealthcheck.failurewindow=42s" +- 
"traefik.http.services.service02.loadbalancer.passivehealthcheck.maxfailedattempts=42" - "traefik.http.services.service02.loadbalancer.responseforwarding.flushinterval=42s" - "traefik.http.services.service02.loadbalancer.serverstransport=foobar" - "traefik.http.services.service02.loadbalancer.sticky=true" diff --git a/docs/content/reference/dynamic-configuration/file.toml b/docs/content/reference/dynamic-configuration/file.toml index b7f30c649..14d9753ce 100644 --- a/docs/content/reference/dynamic-configuration/file.toml +++ b/docs/content/reference/dynamic-configuration/file.toml @@ -93,6 +93,9 @@ [http.services.Service02.loadBalancer.healthCheck.headers] name0 = "foobar" name1 = "foobar" + [http.services.Service02.loadBalancer.passiveHealthCheck] + failureWindow = "42s" + maxFailedAttempts = 42 [http.services.Service02.loadBalancer.responseForwarding] flushInterval = "42s" [http.services.Service03] diff --git a/docs/content/reference/dynamic-configuration/file.yaml b/docs/content/reference/dynamic-configuration/file.yaml index 2ac73fd85..99a79e291 100644 --- a/docs/content/reference/dynamic-configuration/file.yaml +++ b/docs/content/reference/dynamic-configuration/file.yaml @@ -98,6 +98,9 @@ http: headers: name0: foobar name1: foobar + passiveHealthCheck: + failureWindow: 42s + maxFailedAttempts: 42 passHostHeader: true responseForwarding: flushInterval: 42s diff --git a/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml b/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml index c4f6dc076..bc4689191 100644 --- a/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml +++ b/docs/content/reference/dynamic-configuration/kubernetes-crd-definition-v1.yml @@ -227,6 +227,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. 
type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health + checks for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window + during which the failed attempts must occur for + the server to be marked as unhealthy. It also defines + for how long the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer @@ -1169,6 +1188,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. It also defines for how long + the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window before + marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer @@ -2944,6 +2982,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. 
+ properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. It also defines for how + long the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. + type: integer + type: object percent: description: |- Percent defines the part of the traffic to mirror. @@ -3078,6 +3135,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server to be + marked as unhealthy. It also defines for how long the server + will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window before + marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer @@ -3290,6 +3366,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. 
It also defines for how + long the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer diff --git a/docs/content/reference/dynamic-configuration/kv-ref.md b/docs/content/reference/dynamic-configuration/kv-ref.md index a7c1b217f..8c0737abb 100644 --- a/docs/content/reference/dynamic-configuration/kv-ref.md +++ b/docs/content/reference/dynamic-configuration/kv-ref.md @@ -288,6 +288,8 @@ THIS FILE MUST NOT BE EDITED BY HAND | `traefik/http/services/Service02/loadBalancer/healthCheck/timeout` | `42s` | | `traefik/http/services/Service02/loadBalancer/healthCheck/unhealthyInterval` | `42s` | | `traefik/http/services/Service02/loadBalancer/passHostHeader` | `true` | +| `traefik/http/services/Service02/loadBalancer/passiveHealthCheck/failureWindow` | `42s` | +| `traefik/http/services/Service02/loadBalancer/passiveHealthCheck/maxFailedAttempts` | `42` | | `traefik/http/services/Service02/loadBalancer/responseForwarding/flushInterval` | `42s` | | `traefik/http/services/Service02/loadBalancer/servers/0/preservePath` | `true` | | `traefik/http/services/Service02/loadBalancer/servers/0/url` | `foobar` | diff --git a/docs/content/reference/dynamic-configuration/traefik.io_ingressroutes.yaml b/docs/content/reference/dynamic-configuration/traefik.io_ingressroutes.yaml index 46cc78976..0ffc11dfb 100644 --- a/docs/content/reference/dynamic-configuration/traefik.io_ingressroutes.yaml +++ b/docs/content/reference/dynamic-configuration/traefik.io_ingressroutes.yaml @@ -227,6 +227,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. 
type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health + checks for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window + during which the failed attempts must occur for + the server to be marked as unhealthy. It also defines + for how long the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer diff --git a/docs/content/reference/dynamic-configuration/traefik.io_middlewares.yaml b/docs/content/reference/dynamic-configuration/traefik.io_middlewares.yaml index d658c6602..97135b70d 100644 --- a/docs/content/reference/dynamic-configuration/traefik.io_middlewares.yaml +++ b/docs/content/reference/dynamic-configuration/traefik.io_middlewares.yaml @@ -381,6 +381,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. It also defines for how long + the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window before + marking the server as unhealthy. 
+ type: integer + type: object port: anyOf: - type: integer diff --git a/docs/content/reference/dynamic-configuration/traefik.io_traefikservices.yaml b/docs/content/reference/dynamic-configuration/traefik.io_traefikservices.yaml index 1e6fec094..1ed24d021 100644 --- a/docs/content/reference/dynamic-configuration/traefik.io_traefikservices.yaml +++ b/docs/content/reference/dynamic-configuration/traefik.io_traefikservices.yaml @@ -245,6 +245,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. It also defines for how + long the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. + type: integer + type: object percent: description: |- Percent defines the part of the traffic to mirror. @@ -379,6 +398,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server to be + marked as unhealthy. It also defines for how long the server + will be considered unhealthy. 
+ x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window before + marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer @@ -591,6 +629,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. It also defines for how + long the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. 
+ type: integer + type: object port: anyOf: - type: integer diff --git a/docs/content/reference/routing-configuration/http/load-balancing/service.md b/docs/content/reference/routing-configuration/http/load-balancing/service.md index b99a5b3c3..fb5e0173d 100644 --- a/docs/content/reference/routing-configuration/http/load-balancing/service.md +++ b/docs/content/reference/routing-configuration/http/load-balancing/service.md @@ -27,6 +27,9 @@ http: path: "/health" interval: "10s" timeout: "3s" + passiveHealthCheck: + failureWindow: "3s" + maxFailedAttempts: 3 passHostHeader: true serversTransport: "customTransport@file" responseForwarding: @@ -46,6 +49,10 @@ http: path = "/health" interval = "10s" timeout = "3s" + + [http.services.my-service.loadBalancer.passiveHealthCheck] + failureWindow = "3s" + maxFailedAttempts = 3 passHostHeader = true serversTransport = "customTransport@file" @@ -63,6 +70,8 @@ labels: - "traefik.http.services.my-service.loadBalancer.healthcheck.path=/health" - "traefik.http.services.my-service.loadBalancer.healthcheck.interval=10s" - "traefik.http.services.my-service.loadBalancer.healthcheck.timeout=3s" + - "traefik.http.services.my-service.loadBalancer.passiveHealthcheck.failureWindow=3s" + - "traefik.http.services.my-service.loadBalancer.passiveHealthcheck.maxFailedAttempts=3" - "traefik.http.services.my-service.loadBalancer.passHostHeader=true" - "traefik.http.services.my-service.loadBalancer.serversTransport=customTransport@file" - "traefik.http.services.my-service.loadBalancer.responseForwarding.flushInterval=150ms" @@ -78,6 +87,8 @@ labels: "traefik.http.services.my-service.loadBalancer.healthcheck.path=/health", "traefik.http.services.my-service.loadBalancer.healthcheck.interval=10s", "traefik.http.services.my-service.loadBalancer.healthcheck.timeout=3s", + "traefik.http.services.my-service.loadBalancer.passiveHealthcheck.failureWindow=3s", + "traefik.http.services.my-service.loadBalancer.passiveHealthcheck.maxFailedAttempts=3", 
"traefik.http.services.my-service.loadBalancer.passHostHeader=true", "traefik.http.services.my-service.loadBalancer.serversTransport=customTransport@file", "traefik.http.services.my-service.loadBalancer.responseForwarding.flushInterval=150ms" @@ -92,6 +103,7 @@ labels: | `servers` | Represents individual backend instances for your service | Yes | | `sticky` | Defines a `Set-Cookie` header is set on the initial response to let the client know which server handles the first response. | No | | `healthcheck` | Configures health check to remove unhealthy servers from the load balancing rotation. | No | +| `passiveHealthcheck` | Configures the passive health check to remove unhealthy servers from the load balancing rotation. | No | | `passHostHeader` | Allows forwarding of the client Host header to server. By default, `passHostHeader` is true. | No | | `serversTransport` | Allows to reference an [HTTP ServersTransport](./serverstransport.md) configuration for the communication between Traefik and your servers. If no `serversTransport` is specified, the `default@internal` will be used. | No | | `responseForwarding` | Configures how Traefik forwards the response from the backend server to the client. | No | @@ -111,7 +123,9 @@ Servers represent individual backend instances for your service. The [service lo #### Health Check -The `healthcheck` option configures health check to remove unhealthy servers from the load balancing rotation. Traefik will consider HTTP(s) servers healthy as long as they return a status code to the health check request (carried out every interval) between `2XX` and `3XX`, or matching the configured status. For gRPC servers, Traefik will consider them healthy as long as they return SERVING to [gRPC health check v1 requests](https://github.com/grpc/grpc/blob/master/doc/health-checking.md). +The `healthcheck` option configures health check to remove unhealthy servers from the load balancing rotation. 
+Traefik will consider HTTP(s) servers healthy as long as they return a status code to the health check request (carried out every interval) between `2XX` and `3XX`, or matching the configured status. +For gRPC servers, Traefik will consider them healthy as long as they return SERVING to [gRPC health check v1 requests](https://github.com/grpc/grpc/blob/master/doc/health-checking.md). To propagate status changes (e.g. all servers of this service are down) upwards, HealthCheck must also be enabled on the parent(s) of this service. @@ -133,6 +147,24 @@ Below are the available options for the health check mechanism: | `method` | Defines the HTTP method that will be used while connecting to the endpoint. | GET | No | | `status` | Defines the expected HTTP status code of the response to the health check request. | | No | +#### Passive Health Check + +The `passiveHealthcheck` option configures passive health check to remove unhealthy servers from the load balancing rotation. + +Passive health checks rely on real traffic to assess server health. +Traefik forwards requests as usual and evaluates each response or timeout, +incrementing a failure counter whenever a request fails. +If the number of successive failures within a specified time window exceeds the configured threshold, +Traefik will automatically stop routing traffic to that server until it recovers. +A server will be considered healthy again after the configured failure window has passed. + +Below are the available options for the passive health check mechanism: + +| Field | Description | Default | Required | +|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|----------| +| `failureWindow` | Defines the time window during which the failed attempts must occur for the server to be marked as unhealthy. 
It also defines for how long the server will be considered unhealthy. | 10s | No | +| `maxFailedAttempts` | Defines the number of consecutive failed attempts allowed within the failure window before marking the server as unhealthy. | 1 | No | + ## Weighted Round Robin (WRR) The WRR is able to load balance the requests between multiple services based on weights. diff --git a/integration/fixtures/healthcheck/simple_passive.toml b/integration/fixtures/healthcheck/simple_passive.toml new file mode 100644 index 000000000..02fb698b4 --- /dev/null +++ b/integration/fixtures/healthcheck/simple_passive.toml @@ -0,0 +1,31 @@ +[global] + checkNewVersion = false + sendAnonymousUsage = false + +[log] + level = "DEBUG" + noColor = true + +[entryPoints] + [entryPoints.web] + address = ":8000" + +[api] + insecure = true + +[providers.file] + filename = "{{ .SelfFilename }}" + +## dynamic configuration ## + +[http.routers] + [http.routers.router1] + service = "service1" + rule = "Host(`test.localhost`)" + +[http.services] + [http.services.service1.loadBalancer] + [http.services.service1.loadBalancer.passiveHealthCheck] + failureWindow = "2s" + [[http.services.service1.loadBalancer.servers]] + url = "http://{{.Server1}}:80" diff --git a/integration/fixtures/k8s/01-traefik-crd.yml b/integration/fixtures/k8s/01-traefik-crd.yml index c4f6dc076..bc4689191 100644 --- a/integration/fixtures/k8s/01-traefik-crd.yml +++ b/integration/fixtures/k8s/01-traefik-crd.yml @@ -227,6 +227,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health + checks for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window + during which the failed attempts must occur for + the server to be marked as unhealthy. 
It also defines + for how long the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer @@ -1169,6 +1188,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. It also defines for how long + the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window before + marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer @@ -2944,6 +2982,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. It also defines for how + long the server will be considered unhealthy. 
+ x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. + type: integer + type: object percent: description: |- Percent defines the part of the traffic to mirror. @@ -3078,6 +3135,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server to be + marked as unhealthy. It also defines for how long the server + will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window before + marking the server as unhealthy. + type: integer + type: object port: anyOf: - type: integer @@ -3290,6 +3366,25 @@ spec: PassHostHeader defines whether the client Host header is forwarded to the upstream Kubernetes Service. By default, passHostHeader is true. type: boolean + passiveHealthCheck: + description: PassiveHealthCheck defines passive health checks + for ExternalName services. + properties: + failureWindow: + anyOf: + - type: integer + - type: string + description: FailureWindow defines the time window during + which the failed attempts must occur for the server + to be marked as unhealthy. It also defines for how + long the server will be considered unhealthy. + x-kubernetes-int-or-string: true + maxFailedAttempts: + description: MaxFailedAttempts is the number of consecutive + failed attempts allowed within the failure window + before marking the server as unhealthy. 
+ type: integer + type: object port: anyOf: - type: integer diff --git a/integration/healthcheck_test.go b/integration/healthcheck_test.go index 1276f09dd..6ab2ac618 100644 --- a/integration/healthcheck_test.go +++ b/integration/healthcheck_test.go @@ -108,6 +108,53 @@ func (s *HealthCheckSuite) TestSimpleConfiguration() { assert.Equal(s.T(), http.StatusNotFound, resp.StatusCode) } +func (s *HealthCheckSuite) TestSimpleConfiguration_Passive() { + file := s.adaptFile("fixtures/healthcheck/simple_passive.toml", struct { + Server1 string + }{s.whoami1IP}) + + s.traefikCmd(withConfigFile(file)) + + // wait for traefik + err := try.GetRequest("http://127.0.0.1:8080/api/rawdata", 60*time.Second, try.BodyContains("Host(`test.localhost`)")) + require.NoError(s.T(), err) + + frontendHealthReq, err := http.NewRequest(http.MethodGet, "http://127.0.0.1:8000/health", nil) + require.NoError(s.T(), err) + frontendHealthReq.Host = "test.localhost" + + err = try.Request(frontendHealthReq, 500*time.Millisecond, try.StatusCodeIs(http.StatusOK)) + require.NoError(s.T(), err) + + // Fix all whoami health to 500 + client := &http.Client{} + whoamiHosts := []string{s.whoami1IP, s.whoami2IP} + for _, whoami := range whoamiHosts { + statusInternalServerErrorReq, err := http.NewRequest(http.MethodPost, "http://"+whoami+"/health", bytes.NewBufferString("500")) + require.NoError(s.T(), err) + _, err = client.Do(statusInternalServerErrorReq) + require.NoError(s.T(), err) + } + + // First call, the passive health check is not yet triggered, so we expect a 500. 
+ err = try.Request(frontendHealthReq, 3*time.Second, try.StatusCodeIs(http.StatusInternalServerError)) + require.NoError(s.T(), err) + + // Verify no backend service is available due to failing health checks + err = try.Request(frontendHealthReq, 3*time.Second, try.StatusCodeIs(http.StatusServiceUnavailable)) + require.NoError(s.T(), err) + + // Change one whoami health to 200 + statusOKReq1, err := http.NewRequest(http.MethodPost, "http://"+s.whoami1IP+"/health", bytes.NewBufferString("200")) + require.NoError(s.T(), err) + _, err = client.Do(statusOKReq1) + require.NoError(s.T(), err) + + // Verify frontend health : after + err = try.Request(frontendHealthReq, 3*time.Second, try.StatusCodeIs(http.StatusOK)) + require.NoError(s.T(), err) +} + func (s *HealthCheckSuite) TestMultipleEntrypoints() { file := s.adaptFile("fixtures/healthcheck/multiple-entrypoints.toml", struct { Server1 string diff --git a/pkg/config/dynamic/http_config.go b/pkg/config/dynamic/http_config.go index ef6a896f1..a60f59713 100644 --- a/pkg/config/dynamic/http_config.go +++ b/pkg/config/dynamic/http_config.go @@ -244,10 +244,12 @@ type ServersLoadBalancer struct { // children servers of this load-balancer. To propagate status changes (e.g. all // servers of this service are down) upwards, HealthCheck must also be enabled on // the parent(s) of this service. 
- HealthCheck *ServerHealthCheck `json:"healthCheck,omitempty" toml:"healthCheck,omitempty" yaml:"healthCheck,omitempty" export:"true"` - PassHostHeader *bool `json:"passHostHeader" toml:"passHostHeader" yaml:"passHostHeader" export:"true"` - ResponseForwarding *ResponseForwarding `json:"responseForwarding,omitempty" toml:"responseForwarding,omitempty" yaml:"responseForwarding,omitempty" export:"true"` - ServersTransport string `json:"serversTransport,omitempty" toml:"serversTransport,omitempty" yaml:"serversTransport,omitempty" export:"true"` + HealthCheck *ServerHealthCheck `json:"healthCheck,omitempty" toml:"healthCheck,omitempty" yaml:"healthCheck,omitempty" export:"true"` + // PassiveHealthCheck enables passive health checks for children servers of this load-balancer. + PassiveHealthCheck *PassiveServerHealthCheck `json:"passiveHealthCheck,omitempty" toml:"passiveHealthCheck,omitempty" yaml:"passiveHealthCheck,omitempty" export:"true"` + PassHostHeader *bool `json:"passHostHeader" toml:"passHostHeader" yaml:"passHostHeader" export:"true"` + ResponseForwarding *ResponseForwarding `json:"responseForwarding,omitempty" toml:"responseForwarding,omitempty" yaml:"responseForwarding,omitempty" export:"true"` + ServersTransport string `json:"serversTransport,omitempty" toml:"serversTransport,omitempty" yaml:"serversTransport,omitempty" export:"true"` } // Mergeable tells if the given service is mergeable. @@ -336,6 +338,20 @@ func (h *ServerHealthCheck) SetDefaults() { // +k8s:deepcopy-gen=true +type PassiveServerHealthCheck struct { + // FailureWindow defines the time window during which the failed attempts must occur for the server to be marked as unhealthy. It also defines for how long the server will be considered unhealthy. 
+ FailureWindow ptypes.Duration `json:"failureWindow,omitempty" toml:"failureWindow,omitempty" yaml:"failureWindow,omitempty" export:"true"` + // MaxFailedAttempts is the number of consecutive failed attempts allowed within the failure window before marking the server as unhealthy. + MaxFailedAttempts int `json:"maxFailedAttempts,omitempty" toml:"maxFailedAttempts,omitempty" yaml:"maxFailedAttempts,omitempty" export:"true"` +} + +func (p *PassiveServerHealthCheck) SetDefaults() { + p.FailureWindow = ptypes.Duration(10 * time.Second) + p.MaxFailedAttempts = 1 +} + +// +k8s:deepcopy-gen=true + // HealthCheck controls healthcheck awareness and propagation at the services level. type HealthCheck struct{} diff --git a/pkg/config/dynamic/zz_generated.deepcopy.go b/pkg/config/dynamic/zz_generated.deepcopy.go index 97f8907e0..94d0f11a8 100644 --- a/pkg/config/dynamic/zz_generated.deepcopy.go +++ b/pkg/config/dynamic/zz_generated.deepcopy.go @@ -1071,6 +1071,22 @@ func (in *PassTLSClientCert) DeepCopy() *PassTLSClientCert { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PassiveServerHealthCheck) DeepCopyInto(out *PassiveServerHealthCheck) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PassiveServerHealthCheck. +func (in *PassiveServerHealthCheck) DeepCopy() *PassiveServerHealthCheck { + if in == nil { + return nil + } + out := new(PassiveServerHealthCheck) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *ProxyProtocol) DeepCopyInto(out *ProxyProtocol) { *out = *in @@ -1478,6 +1494,11 @@ func (in *ServersLoadBalancer) DeepCopyInto(out *ServersLoadBalancer) { *out = new(ServerHealthCheck) (*in).DeepCopyInto(*out) } + if in.PassiveHealthCheck != nil { + in, out := &in.PassiveHealthCheck, &out.PassiveHealthCheck + *out = new(PassiveServerHealthCheck) + **out = **in + } if in.PassHostHeader != nil { in, out := &in.PassHostHeader, &out.PassHostHeader *out = new(bool) diff --git a/pkg/healthcheck/healthcheck.go b/pkg/healthcheck/healthcheck.go index e2679d73c..e1be3343e 100644 --- a/pkg/healthcheck/healthcheck.go +++ b/pkg/healthcheck/healthcheck.go @@ -1,19 +1,24 @@ package healthcheck import ( + "bufio" "context" "errors" "fmt" "net" "net/http" + "net/http/httptrace" "net/url" "strconv" + "sync" "time" gokitmetrics "github.com/go-kit/kit/metrics" "github.com/rs/zerolog/log" + ptypes "github.com/traefik/paerser/types" "github.com/traefik/traefik/v3/pkg/config/dynamic" "github.com/traefik/traefik/v3/pkg/config/runtime" + "golang.org/x/sync/singleflight" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials/insecure" @@ -322,3 +327,160 @@ func (shc *ServiceHealthChecker) checkHealthGRPC(ctx context.Context, serverURL return nil } + +type PassiveServiceHealthChecker struct { + serviceName string + balancer StatusSetter + metrics metricsHealthCheck + + maxFailedAttempts int + failureWindow ptypes.Duration + hasActiveHealthCheck bool + + failuresMu sync.RWMutex + failures map[string][]time.Time + + timersGroup singleflight.Group + timers sync.Map +} + +func NewPassiveHealthChecker(serviceName string, balancer StatusSetter, maxFailedAttempts int, failureWindow ptypes.Duration, hasActiveHealthCheck bool, metrics metricsHealthCheck) *PassiveServiceHealthChecker { + return &PassiveServiceHealthChecker{ + serviceName: serviceName, + balancer: balancer, + failures: make(map[string][]time.Time), + maxFailedAttempts: maxFailedAttempts, 
+ failureWindow: failureWindow, + hasActiveHealthCheck: hasActiveHealthCheck, + metrics: metrics, + } +} + +func (p *PassiveServiceHealthChecker) WrapHandler(ctx context.Context, next http.Handler, targetURL string) http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + var backendCalled bool + trace := &httptrace.ClientTrace{ + WroteHeaders: func() { + backendCalled = true + }, + WroteRequest: func(httptrace.WroteRequestInfo) { + backendCalled = true + }, + } + clientTraceCtx := httptrace.WithClientTrace(req.Context(), trace) + + codeCatcher := &codeCatcher{ + ResponseWriter: rw, + } + + next.ServeHTTP(codeCatcher, req.WithContext(clientTraceCtx)) + + if backendCalled && codeCatcher.statusCode < http.StatusInternalServerError { + p.failuresMu.Lock() + p.failures[targetURL] = nil + p.failuresMu.Unlock() + return + } + + p.failuresMu.Lock() + p.failures[targetURL] = append(p.failures[targetURL], time.Now()) + p.failuresMu.Unlock() + + if p.healthy(targetURL) { + return + } + + // We need to guarantee that only one goroutine (request) will update the status and create a timer for the target. + _, _, _ = p.timersGroup.Do(targetURL, func() (interface{}, error) { + // A timer is already running for this target; + // it means that the target is already considered unhealthy. + if _, ok := p.timers.Load(targetURL); ok { + return nil, nil + } + + p.balancer.SetStatus(ctx, targetURL, false) + p.metrics.ServiceServerUpGauge().With("service", p.serviceName, "url", targetURL).Set(0) + + // If the service has an active health check, the passive health checker should not reset the status. + // The active health check will handle the status updates. 
+ if p.hasActiveHealthCheck {
+ return nil, nil
+ }
+
+ go func() {
+ timer := time.NewTimer(time.Duration(p.failureWindow))
+ defer timer.Stop()
+
+ p.timers.Store(targetURL, timer)
+
+ select {
+ case <-ctx.Done():
+ case <-timer.C:
+ p.timers.Delete(targetURL)
+
+ p.balancer.SetStatus(ctx, targetURL, true)
+ p.metrics.ServiceServerUpGauge().With("service", p.serviceName, "url", targetURL).Set(1)
+ }
+ }()
+
+ return nil, nil
+ })
+ })
+}
+
+func (p *PassiveServiceHealthChecker) healthy(targetURL string) bool {
+ windowStart := time.Now().Add(-time.Duration(p.failureWindow))
+
+ p.failuresMu.Lock()
+ defer p.failuresMu.Unlock()
+
+ // Filter failures within the sliding window.
+ failures := p.failures[targetURL]
+ for i, t := range failures {
+ if t.After(windowStart) {
+ p.failures[targetURL] = failures[i:]
+ break
+ }
+ }
+
+ // The server is healthy while the number of failures within the window stays below maxFailedAttempts.
+ return len(p.failures[targetURL]) < p.maxFailedAttempts
+}
+
+type codeCatcher struct {
+ http.ResponseWriter
+
+ statusCode int
+}
+
+func (c *codeCatcher) WriteHeader(statusCode int) {
+ // Here we allow the overriding of the status code,
+ // for the health check we care about the last status code written.
+ c.statusCode = statusCode
+ c.ResponseWriter.WriteHeader(statusCode)
+}
+
+func (c *codeCatcher) Write(bytes []byte) (int, error) {
+ // If no status code has been written yet,
+ // or only an informational (1xx) status code was set,
+ // we record it as http.StatusOK (200).
+ if c.statusCode < http.StatusOK { + c.statusCode = http.StatusOK + } + + return c.ResponseWriter.Write(bytes) +} + +func (c *codeCatcher) Flush() { + if flusher, ok := c.ResponseWriter.(http.Flusher); ok { + flusher.Flush() + } +} + +func (c *codeCatcher) Hijack() (net.Conn, *bufio.ReadWriter, error) { + if h, ok := c.ResponseWriter.(http.Hijacker); ok { + return h.Hijack() + } + + return nil, nil, fmt.Errorf("not a hijacker: %T", c.ResponseWriter) +} diff --git a/pkg/provider/kubernetes/crd/kubernetes_http.go b/pkg/provider/kubernetes/crd/kubernetes_http.go index 59dfbd4a5..09c206a74 100644 --- a/pkg/provider/kubernetes/crd/kubernetes_http.go +++ b/pkg/provider/kubernetes/crd/kubernetes_http.go @@ -392,6 +392,21 @@ func (c configBuilder) buildServersLB(namespace string, svc traefikv1alpha1.Load } } + if svc.PassiveHealthCheck != nil { + lb.PassiveHealthCheck = &dynamic.PassiveServerHealthCheck{} + lb.PassiveHealthCheck.SetDefaults() + + if svc.PassiveHealthCheck.MaxFailedAttempts != nil { + lb.PassiveHealthCheck.MaxFailedAttempts = *svc.PassiveHealthCheck.MaxFailedAttempts + } + + if svc.PassiveHealthCheck.FailureWindow != nil { + if err := lb.PassiveHealthCheck.FailureWindow.Set(svc.PassiveHealthCheck.FailureWindow.String()); err != nil { + return nil, err + } + } + } + conf := svc lb.PassHostHeader = conf.PassHostHeader if lb.PassHostHeader == nil { diff --git a/pkg/provider/kubernetes/crd/traefikio/v1alpha1/ingressroute.go b/pkg/provider/kubernetes/crd/traefikio/v1alpha1/ingressroute.go index a717d6cdd..fb5513e4c 100644 --- a/pkg/provider/kubernetes/crd/traefikio/v1alpha1/ingressroute.go +++ b/pkg/provider/kubernetes/crd/traefikio/v1alpha1/ingressroute.go @@ -144,6 +144,8 @@ type LoadBalancerSpec struct { NodePortLB bool `json:"nodePortLB,omitempty"` // Healthcheck defines health checks for ExternalName services. HealthCheck *ServerHealthCheck `json:"healthCheck,omitempty"` + // PassiveHealthCheck defines passive health checks for ExternalName services. 
+ PassiveHealthCheck *PassiveServerHealthCheck `json:"passiveHealthCheck,omitempty"` } type ResponseForwarding struct { @@ -189,6 +191,13 @@ type ServerHealthCheck struct { Headers map[string]string `json:"headers,omitempty"` } +type PassiveServerHealthCheck struct { + // FailureWindow defines the time window during which the failed attempts must occur for the server to be marked as unhealthy. It also defines for how long the server will be considered unhealthy. + FailureWindow *intstr.IntOrString `json:"failureWindow,omitempty"` + // MaxFailedAttempts is the number of consecutive failed attempts allowed within the failure window before marking the server as unhealthy. + MaxFailedAttempts *int `json:"maxFailedAttempts,omitempty"` +} + // Service defines an upstream HTTP service to proxy traffic to. type Service struct { LoadBalancerSpec `json:",inline"` diff --git a/pkg/provider/kubernetes/crd/traefikio/v1alpha1/zz_generated.deepcopy.go b/pkg/provider/kubernetes/crd/traefikio/v1alpha1/zz_generated.deepcopy.go index 3e53cfae5..6b99e8fa7 100644 --- a/pkg/provider/kubernetes/crd/traefikio/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/provider/kubernetes/crd/traefikio/v1alpha1/zz_generated.deepcopy.go @@ -657,6 +657,11 @@ func (in *LoadBalancerSpec) DeepCopyInto(out *LoadBalancerSpec) { *out = new(ServerHealthCheck) (*in).DeepCopyInto(*out) } + if in.PassiveHealthCheck != nil { + in, out := &in.PassiveHealthCheck, &out.PassiveHealthCheck + *out = new(PassiveServerHealthCheck) + (*in).DeepCopyInto(*out) + } return } @@ -1047,6 +1052,32 @@ func (in *ObjectReference) DeepCopy() *ObjectReference { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *PassiveServerHealthCheck) DeepCopyInto(out *PassiveServerHealthCheck) { + *out = *in + if in.FailureWindow != nil { + in, out := &in.FailureWindow, &out.FailureWindow + *out = new(intstr.IntOrString) + **out = **in + } + if in.MaxFailedAttempts != nil { + in, out := &in.MaxFailedAttempts, &out.MaxFailedAttempts + *out = new(int) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PassiveServerHealthCheck. +func (in *PassiveServerHealthCheck) DeepCopy() *PassiveServerHealthCheck { + if in == nil { + return nil + } + out := new(PassiveServerHealthCheck) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RateLimit) DeepCopyInto(out *RateLimit) { *out = *in diff --git a/pkg/server/service/service.go b/pkg/server/service/service.go index b1bc6d8e9..6967f6e7b 100644 --- a/pkg/server/service/service.go +++ b/pkg/server/service/service.go @@ -341,7 +341,7 @@ func (m *Manager) getLoadBalancerServiceHandler(ctx context.Context, serviceName var lb serverBalancer switch service.Strategy { // Here we are handling the empty value to comply with providers that are not applying defaults (e.g. REST provider) - // TODO: remove this when all providers apply default values. + // TODO: remove this empty check when all providers apply default values. 
case dynamic.BalancerStrategyWRR, "": lb = wrr.New(service.Sticky, service.HealthCheck != nil) case dynamic.BalancerStrategyP2C: @@ -350,6 +350,17 @@ func (m *Manager) getLoadBalancerServiceHandler(ctx context.Context, serviceName return nil, fmt.Errorf("unsupported load-balancer strategy %q", service.Strategy) } + var passiveHealthChecker *healthcheck.PassiveServiceHealthChecker + if service.PassiveHealthCheck != nil { + passiveHealthChecker = healthcheck.NewPassiveHealthChecker( + serviceName, + lb, + service.PassiveHealthCheck.MaxFailedAttempts, + service.PassiveHealthCheck.FailureWindow, + service.HealthCheck != nil, + m.observabilityMgr.MetricsRegistry()) + } + healthCheckTargets := make(map[string]*url.URL) for i, server := range shuffle(service.Servers, m.rand) { @@ -368,6 +379,11 @@ func (m *Manager) getLoadBalancerServiceHandler(ctx context.Context, serviceName return nil, fmt.Errorf("error building proxy for server URL %s: %w", server.URL, err) } + if passiveHealthChecker != nil { + // If passive health check is enabled, we wrap the proxy with the passive health checker. + proxy = passiveHealthChecker.WrapHandler(ctx, proxy, target.String()) + } + // The retry wrapping must be done just before the proxy handler, // to make sure that the retry will not be triggered/disabled by // middlewares in the chain. 
diff --git a/pkg/server/service/service_test.go b/pkg/server/service/service_test.go index 7d00a69fb..4952dde12 100644 --- a/pkg/server/service/service_test.go +++ b/pkg/server/service/service_test.go @@ -10,9 +10,11 @@ import ( "net/textproto" "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + ptypes "github.com/traefik/paerser/types" "github.com/traefik/traefik/v3/pkg/config/dynamic" "github.com/traefik/traefik/v3/pkg/config/runtime" "github.com/traefik/traefik/v3/pkg/proxy/httputil" @@ -67,6 +69,19 @@ func TestGetLoadBalancer(t *testing.T) { fwd: &forwarderMock{}, expectError: false, }, + { + desc: "Succeeds when passive health checker is set", + serviceName: "test", + service: &dynamic.ServersLoadBalancer{ + Strategy: dynamic.BalancerStrategyWRR, + PassiveHealthCheck: &dynamic.PassiveServerHealthCheck{ + FailureWindow: ptypes.Duration(30 * time.Second), + MaxFailedAttempts: 3, + }, + }, + fwd: &forwarderMock{}, + expectError: false, + }, } for _, test := range testCases {