From fe63664164aa641fa34430d555fe5def39bd455e Mon Sep 17 00:00:00 2001
From: Anis Eleuch
Date: Mon, 13 Nov 2023 00:59:48 -0800
Subject: [PATCH] prom: Add drive failure tolerance per erasure set (#18424)

---
 cmd/erasure-server-pool.go | 53 ++++++++++++++++----------------------
 cmd/metrics-v2.go          | 23 +++++++++++++++++
 2 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go
index 2b3e9752f..0397be813 100644
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@@ -2253,11 +2253,12 @@ type HealthOptions struct {
 // additionally with any specific heuristic information which
 // was queried
 type HealthResult struct {
-	Healthy        bool
-	HealingDrives  int
-	UnhealthyPools []struct {
+	Healthy       bool
+	HealingDrives int
+	ESHealth      []struct {
 		Maintenance   bool
 		PoolID, SetID int
+		HealthyDrives int
 		WriteQuorum   int
 	}
 	WriteQuorum   int
@@ -2372,50 +2373,40 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
 	}
 
 	result := HealthResult{
-		HealingDrives: len(aggHealStateResult.HealDisks),
+		Healthy:       true,
 		WriteQuorum:   maximumWriteQuorum,
 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 	}
 
 	for poolIdx := range erasureSetUpCount {
 		for setIdx := range erasureSetUpCount[poolIdx] {
+			result.ESHealth = append(result.ESHealth, struct {
+				Maintenance                bool
+				PoolID, SetID              int
+				HealthyDrives, WriteQuorum int
+			}{
+				Maintenance:   opts.Maintenance,
+				SetID:         setIdx,
+				PoolID:        poolIdx,
+				HealthyDrives: erasureSetUpCount[poolIdx][setIdx],
+				WriteQuorum:   poolWriteQuorums[poolIdx],
+			})
+
 			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
 						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				result.UnhealthyPools = append(result.UnhealthyPools, struct {
-					Maintenance                bool
-					PoolID, SetID, WriteQuorum int
-				}{
-					Maintenance: opts.Maintenance,
-					SetID:       setIdx,
-					PoolID:      poolIdx,
-					WriteQuorum: poolWriteQuorums[poolIdx],
-				})
+				result.Healthy = false
 			}
 		}
-		if len(result.UnhealthyPools) > 0 {
-			// We have unhealthy pools return error.
-			return result
-		}
 	}
 
-	// when maintenance is not specified we don't have
-	// to look at the healing side of the code.
-	if !opts.Maintenance {
-		return HealthResult{
-			Healthy:       true,
-			WriteQuorum:   maximumWriteQuorum,
-			UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-		}
+	if opts.Maintenance {
+		result.Healthy = result.Healthy && len(aggHealStateResult.HealDisks) == 0
+		result.HealingDrives = len(aggHealStateResult.HealDisks)
 	}
 
-	return HealthResult{
-		Healthy:       len(aggHealStateResult.HealDisks) == 0,
-		HealingDrives: len(aggHealStateResult.HealDisks),
-		WriteQuorum:   maximumWriteQuorum,
-		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-	}
+	return result
 }
 
 // PutObjectMetadata - replace or add tags to an existing object
diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go
index ceedb2598..2fdfe533f 100644
--- a/cmd/metrics-v2.go
+++ b/cmd/metrics-v2.go
@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"net/http"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -3187,6 +3188,16 @@ func getClusterHealthStatusMD() MetricDescription {
 	}
 }
 
+func getClusterErasureSetToleranceMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "erasure_set_tolerance",
+		Help:      "Get erasure set tolerance status",
+		Type:      gaugeMetric,
+	}
+}
+
 func getClusterHealthMetrics() *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval: 10 * time.Second,
@@ -3218,6 +3229,18 @@ func getClusterHealthMetrics() *MetricsGroup {
 			Value:       float64(health),
 		})
 
+		for _, h := range result.ESHealth {
+			labels := map[string]string{
+				"pool": strconv.Itoa(h.PoolID),
+				"set":  strconv.Itoa(h.SetID),
+			}
+			metrics = append(metrics, Metric{
+				Description:    getClusterErasureSetToleranceMD(),
+				VariableLabels: labels,
+				Value:          float64(h.HealthyDrives - h.WriteQuorum),
+			})
+		}
+
 		return
 	})
 