From 1e3fef6ab0578834a91728ef4fdd1532fa39913d Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Mon, 14 Aug 2023 15:39:25 +0100 Subject: [PATCH] scraping: limit detail on dropped targets, to save memory (#12647) It's possible (quite common on Kubernetes) to have a service discovery return thousands of targets then drop most of them in relabel rules. The main place this data is used is to display in the web UI, where you don't want thousands of lines of display. The new limit is `keep_dropped_targets`, which defaults to 0 for backwards-compatibility. Signed-off-by: Bryan Boreham --- config/config.go | 9 +++++++++ docs/configuration/configuration.md | 8 ++++++++ docs/querying/api.md | 1 + .../examples/prometheus-kubernetes.yml | 5 +++++ scrape/manager.go | 13 ++++++++++++- scrape/scrape.go | 18 +++++++++++++++--- scrape/scrape_test.go | 1 + web/api/v1/api.go | 9 +++++++-- web/api/v1/api_test.go | 11 +++++++++++ web/api/v1/errors_test.go | 5 +++++ .../src/pages/serviceDiscovery/Services.tsx | 19 ++++++++++--------- 11 files changed, 84 insertions(+), 15 deletions(-) diff --git a/config/config.go b/config/config.go index d32fcc33c9..7f7595dcdf 100644 --- a/config/config.go +++ b/config/config.go @@ -409,6 +409,9 @@ type GlobalConfig struct { // More than this label value length post metric-relabeling will cause the // scrape to fail. 0 means no limit. LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"` + // Keep no more than this many dropped targets per job. + // 0 means no limit. + KeepDroppedTargets uint `yaml:"keep_dropped_targets,omitempty"` } // SetDirectory joins any relative file paths with dir. @@ -514,6 +517,9 @@ type ScrapeConfig struct { // More than this many buckets in a native histogram will cause the scrape to // fail. NativeHistogramBucketLimit uint `yaml:"native_histogram_bucket_limit,omitempty"` + // Keep no more than this many dropped targets per job. + // 0 means no limit. 
+ KeepDroppedTargets uint `yaml:"keep_dropped_targets,omitempty"` // We cannot do proper Go type embedding below as the parser will then parse // values arbitrarily into the overflow maps of further-down types. @@ -608,6 +614,9 @@ func (c *ScrapeConfig) Validate(globalConfig GlobalConfig) error { if c.LabelValueLengthLimit == 0 { c.LabelValueLengthLimit = globalConfig.LabelValueLengthLimit } + if c.KeepDroppedTargets == 0 { + c.KeepDroppedTargets = globalConfig.KeepDroppedTargets + } return nil } diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index 6691902579..f15a9f914d 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -106,6 +106,10 @@ global: # change in the future. [ target_limit: <int> | default = 0 ] + # Limit per scrape config on the number of targets dropped by relabeling + # that will be kept in memory. 0 means no limit. + [ keep_dropped_targets: <int> | default = 0 ] + # Rule files specifies a list of globs. Rules and alerts are read from # all matching files. rule_files: @@ -415,6 +419,10 @@ metric_relabel_configs: # change in the future. [ target_limit: <int> | default = 0 ] +# Per-job limit on the number of targets dropped by relabeling +# that will be kept in memory. 0 means no limit. +[ keep_dropped_targets: <int> | default = 0 ] + # Limit on total number of positive and negative buckets allowed in a single # native histogram. If this is exceeded, the entire scrape will be treated as # failed. 0 means no limit. diff --git a/docs/querying/api.md b/docs/querying/api.md index 8ddb834ef7..408d32cdab 100644 --- a/docs/querying/api.md +++ b/docs/querying/api.md @@ -543,6 +543,7 @@ GET /api/v1/targets ``` Both the active and dropped targets are part of the response by default. +Dropped targets are subject to the `keep_dropped_targets` limit, if set. `labels` represents the label set after relabeling has occurred. 
`discoveredLabels` represent the unmodified labels retrieved during service discovery before relabeling has occurred. diff --git a/documentation/examples/prometheus-kubernetes.yml b/documentation/examples/prometheus-kubernetes.yml index 9a62287342..ad7451c2d7 100644 --- a/documentation/examples/prometheus-kubernetes.yml +++ b/documentation/examples/prometheus-kubernetes.yml @@ -8,6 +8,11 @@ # If you are using Kubernetes 1.7.2 or earlier, please take note of the comments # for the kubernetes-cadvisor job; you will need to edit or remove this job. +# Keep at most 100 sets of details of targets dropped by relabeling. +# This information is used to display in the UI for troubleshooting. +global: + keep_dropped_targets: 100 + # Scrape config for API servers. # # Kubernetes exposes API servers as endpoints to the default/kubernetes diff --git a/scrape/manager.go b/scrape/manager.go index d7cf6792c2..427b9f2be1 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -357,7 +357,7 @@ func (m *Manager) TargetsActive() map[string][]*Target { return targets } -// TargetsDropped returns the dropped targets during relabelling. +// TargetsDropped returns the dropped targets during relabelling, subject to KeepDroppedTargets limit. func (m *Manager) TargetsDropped() map[string][]*Target { m.mtxScrape.Lock() defer m.mtxScrape.Unlock() @@ -368,3 +368,14 @@ func (m *Manager) TargetsDropped() map[string][]*Target { } return targets } + +func (m *Manager) TargetsDroppedCounts() map[string]int { + m.mtxScrape.Lock() + defer m.mtxScrape.Unlock() + + counts := make(map[string]int, len(m.scrapePools)) + for tset, sp := range m.scrapePools { + counts[tset] = sp.DroppedTargetsCount() // Use the accessor so the read happens under sp.targetMtx. + } + return counts +} diff --git a/scrape/scrape.go b/scrape/scrape.go index df729b4489..40836afc20 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -242,8 +242,9 @@ type scrapePool struct { targetMtx sync.Mutex // activeTargets and loops must always be synchronized to have the same // set of hashes. 
- activeTargets map[uint64]*Target - droppedTargets []*Target + activeTargets map[uint64]*Target + droppedTargets []*Target // Subject to KeepDroppedTargets limit. + droppedTargetsCount int // Count of all dropped targets. // Constructor for new scrape loops. This is settable for testing convenience. newLoop func(scrapeLoopOptions) loop @@ -354,12 +355,19 @@ func (sp *scrapePool) ActiveTargets() []*Target { return tActive } +// DroppedTargets returns the dropped targets, subject to the KeepDroppedTargets limit. func (sp *scrapePool) DroppedTargets() []*Target { sp.targetMtx.Lock() defer sp.targetMtx.Unlock() return sp.droppedTargets } +func (sp *scrapePool) DroppedTargetsCount() int { + sp.targetMtx.Lock() + defer sp.targetMtx.Unlock() + return sp.droppedTargetsCount +} + // stop terminates all scrape loops and returns after they all terminated. func (sp *scrapePool) stop() { sp.mtx.Lock() @@ -506,6 +514,7 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) { var targets []*Target lb := labels.NewBuilder(labels.EmptyLabels()) sp.droppedTargets = []*Target{} + sp.droppedTargetsCount = 0 for _, tg := range tgs { targets, failures := TargetsFromGroup(tg, sp.config, sp.noDefaultPort, targets, lb) for _, err := range failures { @@ -520,7 +529,10 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) { case nonEmpty: all = append(all, t) case !t.discoveredLabels.IsEmpty(): - sp.droppedTargets = append(sp.droppedTargets, t) + if sp.config.KeepDroppedTargets == 0 || uint(len(sp.droppedTargets)) < sp.config.KeepDroppedTargets { // 0 means no limit. + sp.droppedTargets = append(sp.droppedTargets, t) + } + sp.droppedTargetsCount++ } } } diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index 3f119b94dd..8578f1bec6 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -88,6 +88,7 @@ func TestDroppedTargetsList(t *testing.T) { SourceLabels: model.LabelNames{"job"}, }, }, + KeepDroppedTargets: 1, } tgs = []*targetgroup.Group{ { diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 
99589ac46f..227027e462 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -100,6 +100,7 @@ type ScrapePoolsRetriever interface { type TargetRetriever interface { TargetsActive() map[string][]*scrape.Target TargetsDropped() map[string][]*scrape.Target + TargetsDroppedCounts() map[string]int } // AlertmanagerRetriever provides a list of all/dropped AlertManager URLs. @@ -898,8 +899,9 @@ type DroppedTarget struct { // TargetDiscovery has all the active targets. type TargetDiscovery struct { - ActiveTargets []*Target `json:"activeTargets"` - DroppedTargets []*DroppedTarget `json:"droppedTargets"` + ActiveTargets []*Target `json:"activeTargets"` + DroppedTargets []*DroppedTarget `json:"droppedTargets"` + DroppedTargetCounts map[string]int `json:"droppedTargetCounts"` } // GlobalURLOptions contains fields used for deriving the global URL for local targets. @@ -1039,6 +1041,9 @@ func (api *API) targets(r *http.Request) apiFuncResult { } else { res.ActiveTargets = []*Target{} } + if showDropped { + res.DroppedTargetCounts = api.targetRetriever(r.Context()).TargetsDroppedCounts() + } if showDropped { targetsDropped := api.targetRetriever(r.Context()).TargetsDropped() droppedKeys, numTargets := sortKeys(targetsDropped) diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index 99e3b292e8..742ca09ba6 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -137,6 +137,14 @@ func (t testTargetRetriever) TargetsDropped() map[string][]*scrape.Target { return t.droppedTargets } +func (t testTargetRetriever) TargetsDroppedCounts() map[string]int { + r := make(map[string]int) + for k, v := range t.droppedTargets { + r[k] = len(v) + } + return r +} + func (t *testTargetRetriever) SetMetadataStoreForTargets(identifier string, metadata scrape.MetricMetadataStore) error { targets, ok := t.activeTargets[identifier] @@ -1384,6 +1392,7 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E }, }, }, + DroppedTargetCounts: 
map[string]int{"blackbox": 1}, }, }, { @@ -1436,6 +1445,7 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E }, }, }, + DroppedTargetCounts: map[string]int{"blackbox": 1}, }, }, { @@ -1498,6 +1508,7 @@ func testEndpoints(t *testing.T, api *API, tr *testTargetRetriever, es storage.E }, }, }, + DroppedTargetCounts: map[string]int{"blackbox": 1}, }, }, // With a matching metric. diff --git a/web/api/v1/errors_test.go b/web/api/v1/errors_test.go index afdd673375..8d194a0581 100644 --- a/web/api/v1/errors_test.go +++ b/web/api/v1/errors_test.go @@ -229,6 +229,11 @@ func (DummyTargetRetriever) TargetsDropped() map[string][]*scrape.Target { return map[string][]*scrape.Target{} } +// TargetsDroppedCounts implements targetRetriever. +func (DummyTargetRetriever) TargetsDroppedCounts() map[string]int { + return nil +} + // DummyAlertmanagerRetriever implements AlertmanagerRetriever. type DummyAlertmanagerRetriever struct{} diff --git a/web/ui/react-app/src/pages/serviceDiscovery/Services.tsx b/web/ui/react-app/src/pages/serviceDiscovery/Services.tsx index 21bf2259b9..79d88fbe4f 100644 --- a/web/ui/react-app/src/pages/serviceDiscovery/Services.tsx +++ b/web/ui/react-app/src/pages/serviceDiscovery/Services.tsx @@ -14,6 +14,7 @@ import SearchBar from '../../components/SearchBar'; interface ServiceMap { activeTargets: Target[]; droppedTargets: DroppedTarget[]; + droppedTargetCounts: Record<string, number>; } export interface TargetLabels { @@ -34,7 +35,7 @@ const droppedTargetKVSearch = new KVSearch({ export const processSummary = ( activeTargets: Target[], - droppedTargets: DroppedTarget[] + droppedTargetCounts: Record<string, number> ): Record<string, { active: number; total: number }> => { const targets: Record<string, { active: number; total: number }> = {}; @@ -50,15 +51,15 @@ export const processSummary = ( targets[name].total++; targets[name].active++; } - for (const target of droppedTargets) { - const { job: name } = target.discoveredLabels; + for (const name in droppedTargetCounts) { if (!targets[name]) { targets[name] = { - total: 0, + total: 
droppedTargetCounts[name], active: 0, }; + } else { + targets[name].total += droppedTargetCounts[name]; } - targets[name].total++; } return targets; @@ -94,10 +95,10 @@ export const processTargets = (activeTargets: Target[], droppedTargets: DroppedT return labels; }; -export const ServiceDiscoveryContent: FC<ServiceMap> = ({ activeTargets, droppedTargets }) => { +export const ServiceDiscoveryContent: FC<ServiceMap> = ({ activeTargets, droppedTargets, droppedTargetCounts }) => { const [activeTargetList, setActiveTargetList] = useState(activeTargets); const [droppedTargetList, setDroppedTargetList] = useState(droppedTargets); - const [targetList, setTargetList] = useState(processSummary(activeTargets, droppedTargets)); + const [targetList, setTargetList] = useState(processSummary(activeTargets, droppedTargetCounts)); const [labelList, setLabelList] = useState(processTargets(activeTargets, droppedTargets)); const handleSearchChange = useCallback( @@ -118,9 +119,9 @@ export const ServiceDiscoveryContent: FC<ServiceMap> = ({ activeTargets, dropped const defaultValue = useMemo(getQuerySearchFilter, []); useEffect(() => { - setTargetList(processSummary(activeTargetList, droppedTargetList)); + setTargetList(processSummary(activeTargetList, droppedTargetCounts)); setLabelList(processTargets(activeTargetList, droppedTargetList)); - }, [activeTargetList, droppedTargetList]); + }, [activeTargetList, droppedTargetList, droppedTargetCounts]); return ( <>