From 05bb655efc09f3fb6bf6079a8e58c8b4c63568a0 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Mon, 4 Dec 2023 11:54:13 -0800 Subject: [PATCH] avoid caching metrics for timeout errors per drive (#18584) Bonus: combine the loop for drive/REST registration. --- cmd/storage-rest-server.go | 23 +++++++++-------------- cmd/xl-storage-disk-id-check.go | 23 +++++++++++++++-------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/cmd/storage-rest-server.go b/cmd/storage-rest-server.go index dff9a7233..c0b8fd9c0 100644 --- a/cmd/storage-rest-server.go +++ b/cmd/storage-rest-server.go @@ -1348,6 +1348,7 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin if !endpoint.IsLocal { continue } + driveHandlers[pool][set] = &storageRESTServer{} server := driveHandlers[pool][set] @@ -1392,15 +1393,8 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin Handle: server.WalkDirHandler, OutCapacity: 1, }), "unable to register handler") - } - } - for pool, serverPool := range endpointServerPools { - for set, endpoint := range serverPool.Endpoints { - if !endpoint.IsLocal { - continue - } - createStorage := func(pool, set int, endpoint Endpoint) bool { + createStorage := func(server *storageRESTServer) bool { xl, err := newXLStorage(endpoint, false) if err != nil { // if supported errors don't fail, we proceed to @@ -1410,21 +1404,22 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin } storage := newXLStorageDiskIDCheck(xl, true) storage.SetDiskID(xl.diskID) - driveHandlers[pool][set].setStorage(storage) + server.setStorage(storage) return true } - if createStorage(pool, set, endpoint) { + if createStorage(server) { continue } // Start async goroutine to create storage. - go func(pool, set int, endpoint Endpoint) { + go func(server *storageRESTServer) { for { - time.Sleep(time.Minute) - if createStorage(pool, set, endpoint) { + time.Sleep(5 * time.Second) + if createStorage(server) { return } } - }(pool, set, endpoint) + }(server) + } } } diff --git a/cmd/xl-storage-disk-id-check.go b/cmd/xl-storage-disk-id-check.go index e7d2e73d5..a446b01a1 100644 --- a/cmd/xl-storage-disk-id-check.go +++ b/cmd/xl-storage-disk-id-check.go @@ -78,8 +78,9 @@ const ( // Detects change in underlying disk. type xlStorageDiskIDCheck struct { - totalErrsAvailability uint64 // Captures all data availability errors such as permission denied, faulty disk and timeout errors. - totalErrsTimeout uint64 // Captures all timeout only errors + totalErrsTimeout atomic.Uint64 // Captures all timeout only errors + totalErrsAvailability atomic.Uint64 // Captures all data availability errors such as permission denied, faulty disk and timeout errors. + // apiCalls should be placed first so alignment is guaranteed for atomic operations. apiCalls [storageMetricLast]uint64 apiLatencies [storageMetricLast]*lockedLastMinuteLatency @@ -102,7 +103,7 @@ type xlStorageDiskIDCheck struct { func (p *xlStorageDiskIDCheck) getMetrics() DiskMetrics { p.metricsCache.Once.Do(func() { - p.metricsCache.TTL = 1 * time.Second + p.metricsCache.TTL = 5 * time.Second p.metricsCache.Update = func() (interface{}, error) { diskMetric := DiskMetrics{ LastMinute: make(map[string]AccElem, len(p.apiLatencies)), @@ -114,13 +115,19 @@ func (p *xlStorageDiskIDCheck) getMetrics() DiskMetrics { for i := range p.apiCalls { diskMetric.APICalls[storageMetric(i).String()] = atomic.LoadUint64(&p.apiCalls[i]) } - diskMetric.TotalErrorsAvailability = atomic.LoadUint64(&p.totalErrsAvailability) - diskMetric.TotalErrorsTimeout = atomic.LoadUint64(&p.totalErrsTimeout) return diskMetric, nil } }) m, _ := p.metricsCache.Get() - return m.(DiskMetrics) + diskMetric := DiskMetrics{} + if m != nil { + diskMetric = m.(DiskMetrics) + } + + // Do not need this value to be cached. + diskMetric.TotalErrorsTimeout = p.totalErrsTimeout.Load() + diskMetric.TotalErrorsAvailability = p.totalErrsAvailability.Load() + return diskMetric } // lockedLastMinuteLatency accumulates totals lockless for each second. @@ -746,9 +753,9 @@ func (p *xlStorageDiskIDCheck) updateStorageMetrics(s storageMetric, paths ...st context.DeadlineExceeded, context.Canceled, }...) { - atomic.AddUint64(&p.totalErrsAvailability, 1) + p.totalErrsAvailability.Add(1) if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) { - atomic.AddUint64(&p.totalErrsTimeout, 1) + p.totalErrsTimeout.Add(1) } } p.apiLatencies[s].add(duration)