prometheus/model/textparse/promparse.go
György Krajcsovits b3edeaad7d Rework with state functions and curr+next
The idea is to load metrics into curr, but if we see a change in metric
name or labels we load them into next, return curr as detected and
continue from next.

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
2024-10-07 10:49:36 +02:00

826 lines
21 KiB
Go

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:generate go get -u modernc.org/golex
//go:generate golex -o=promlex.l.go promlex.l
package textparse
import (
"bytes"
"errors"
"fmt"
"io"
// "math"
"strconv"
"strings"
// "unicode/utf8"
"unsafe"
//"github.com/prometheus/common/model"
// "github.com/prometheus/prometheus/model/exemplar"
// "github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
//"github.com/prometheus/prometheus/model/value"
)
// promlexer holds the scanning state used to tokenize the input buffer.
type promlexer struct {
	// b is the input being tokenized.
	b []byte
	// i is the current read position in b.
	i int
	// start is the position in b where the current token begins; buf returns
	// b[start:i].
	start int
	// err records the last error: io.EOF when next runs off the end of b, or
	// the message passed to Error.
	err error
	// state is the current lexer state; next compares it against sLValue,
	// sMeta2 and sComment to decide whether null bytes may be skipped.
	state int
}
// token identifies the kind of lexeme returned by the lexer.
type token int

// Token kinds. tInvalid and tEOF have explicit values; the remaining kinds
// are numbered with iota.
const (
	tInvalid token = -1
	tEOF token = 0
	// iota counts every entry of this const block starting from zero, so it is
	// 2 here: tLinebreak is 2, the following kinds continue from 3, and the
	// value 1 is not assigned to any token.
	tLinebreak token = iota
	tWhitespace
	tHelp
	tType
	tUnit
	tEOFWord
	tText
	tComment
	tBlank
	tMName
	tQString
	tBraceOpen
	tBraceClose
	tLName
	tLValue
	tComma
	tEqual
	tTimestamp
	tValue
)
// tokenNames maps every known token kind to its display name.
var tokenNames = map[token]string{
	tInvalid:    "INVALID",
	tEOF:        "EOF",
	tLinebreak:  "LINEBREAK",
	tWhitespace: "WHITESPACE",
	tHelp:       "HELP",
	tType:       "TYPE",
	tUnit:       "UNIT",
	tEOFWord:    "EOFWORD",
	tText:       "TEXT",
	tComment:    "COMMENT",
	tBlank:      "BLANK",
	tMName:      "MNAME",
	tQString:    "QSTRING",
	tBraceOpen:  "BOPEN",
	tBraceClose: "BCLOSE",
	tLName:      "LNAME",
	tLValue:     "LVALUE",
	tEqual:      "EQUAL",
	tComma:      "COMMA",
	tTimestamp:  "TIMESTAMP",
	tValue:      "VALUE",
}

// String returns the display name of the token kind. Values that do not
// correspond to a known kind are rendered as "<invalid: N>".
func (t token) String() string {
	if name, known := tokenNames[t]; known {
		return name
	}
	return fmt.Sprintf("<invalid: %d>", t)
}
// buf returns the bytes of the current token, i.e. the part of the input
// between the token start and the current read position. The result aliases
// the lexer's buffer.
func (l *promlexer) buf() []byte {
	from, to := l.start, l.i
	return l.b[from:to]
}
// cur returns the byte at the current read position without advancing.
func (l *promlexer) cur() byte {
	pos := l.i
	return l.b[pos]
}
// next advances the promlexer by one byte and returns the byte at the new
// position. When the end of the input is reached it records io.EOF in l.err
// and returns byte(tEOF).
func (l *promlexer) next() byte {
	l.i++
	if l.i >= len(l.b) {
		l.err = io.EOF
		return byte(tEOF)
	}
	// Lex struggles with null bytes. If we are in a label value or help string, where
	// they are allowed, consume them here immediately.
	if l.state == sLValue || l.state == sMeta2 || l.state == sComment {
		for l.b[l.i] == 0 {
			l.i++
			// A run of null bytes may extend to the very end of the input;
			// report EOF instead of indexing past the buffer.
			if l.i >= len(l.b) {
				l.err = io.EOF
				return byte(tEOF)
			}
		}
	}
	return l.b[l.i]
}
// Error records es as the lexer's current error, replacing any error stored
// previously.
func (l *promlexer) Error(es string) {
	lexErr := errors.New(es)
	l.err = lexErr
}
// A stateFn is a function that represents a state in a state machine. By
// executing it, the state is progressed to the next state. The stateFn returns
// another stateFn, which represents the new state. The returned bool indicates
// that a metric is found and the Parser.Next() can return. The error is set if
// an error occurred; the states in this file return a nil stateFn together
// with every error, including io.EOF at the end of input.
type stateFn func() (stateFn, bool, error)
// exposedValue collects what the parser has detected about one metric:
// metadata located by offsets into the lexer buffer, and parsed sample values.
type exposedValue struct {
	// Meta.
	// nameStart and nameEnd delimit the metric name in the lexer buffer. Equal
	// offsets mean that no name has been recorded.
	nameStart int
	nameEnd int
	// detectedType is the type taken from a TYPE line; the zero value is
	// detectedUntyped.
	detectedType detectedType
	// helpStart and helpEnd delimit the help text in the lexer buffer. Help
	// reports no help text when helpStart >= helpEnd.
	helpStart int
	helpEnd int
	// Values.
	// hasTimestamp is set when timestamp was parsed from a sample line.
	hasTimestamp bool
	timestamp int64
	// hasSumValue is set when sumValue was parsed; for gauges sumValue holds
	// the sample value.
	hasSumValue bool
	sumValue float64
	// hasCountValue and countValue are not written by the parser states
	// visible in this file (hasCountValue is only reset); presumably reserved
	// for histogram/summary support — TODO confirm.
	hasCountValue bool
	countValue float64
	//hasInfValue bool
}
// PromParser parses samples from a byte slice of samples in the official
// Prometheus text exposition format.
//
// Parsing is driven by a state machine: state holds the function to run next,
// and curr/next point at the two exposedValue buffers (ev1, ev2) that are
// swapped whenever a new metric starts.
type PromParser struct {
	// l is the lexer over the input buffer.
	l *promlexer
	// The commented-out fields below belong to the previous implementation,
	// which is kept commented out further down in this file.
	// builder labels.ScratchBuilder
	// series []byte
	// text []byte
	// mtype model.MetricType
	// val float64
	// ts int64
	// hasTS bool
	// start int
	// offsets is a list of offsets into series that describe the positions
	// of the metric name and label names and values for this series.
	// p.offsets[0] is the start character of the metric name.
	// p.offsets[1] is the end of the metric name.
	// Subsequently, p.offsets is a pair of pair of offsets for the positions
	// of the label name and value start and end characters.
	//offsets []int
	// state is the state function executed by the next step of Next.
	state stateFn
	// Use these two struct to store the detected metric values.
	// One is always the current metric, the other will be used to store the next metric.
	ev1 exposedValue
	ev2 exposedValue
	// curr points at the metric being collected; next receives the data of the
	// metric that follows it. resetState swaps the two.
	curr *exposedValue
	next *exposedValue
	// Cached interface objects.
	// exposedCounterMetric is the value returned by Next; its ev pointer is
	// set to curr before it is handed out.
	exposedCounterMetric promFloat
}
// Integer state identifiers. They are not referenced by the code visible in
// this file, which uses stateFn values instead; presumably left over from an
// earlier version of the state machine — TODO confirm before removing.
const (
	stateStart = iota
	stateFoundMetric
	stateError
)
// detectedType is the metric type recognized from a TYPE line.
type detectedType int

// Recognized metric types. detectedUntyped is the zero value and therefore
// also the type of a metric for which no TYPE line has been seen.
const (
	detectedUntyped detectedType = iota
	detectedCounter
	detectedGauge
	detectedHistogram
	detectedSummary
)
// promBase provides the accessors shared by exposed metrics. It resolves the
// offsets stored in ev against buffer.
type promBase struct {
	// buffer is the input the offsets in ev refer to; NewPromParser sets it to
	// the lexer's buffer.
	buffer []byte
	// ev is the detected metric this value exposes.
	ev *exposedValue
}
// Name returns the metric name. The string is a zero-copy view of the parser's
// buffer (see yoloString), not an independent copy.
func (p promBase) Name() string {
	ev := p.ev
	name := p.buffer[ev.nameStart:ev.nameEnd]
	return yoloString(name)
}
// Help returns the help text of the metric and true, or "" and false when no
// help text was recorded (helpStart >= helpEnd). The string is a zero-copy
// view of the parser's buffer.
func (p promBase) Help() (string, bool) {
	ev := p.ev
	if ev.helpStart < ev.helpEnd {
		return yoloString(p.buffer[ev.helpStart:ev.helpEnd]), true
	}
	return "", false
}
// promFloat exposes a detected metric that carries a single float value. It
// embeds promBase for the Name and Help accessors.
type promFloat struct {
	promBase
}
// Value returns the float value stored for the metric (the sumValue field of
// the underlying exposedValue).
func (p promFloat) Value() float64 {
	ev := p.ev
	return ev.sumValue
}
// NewPromParser returns a new parser of the byte slice.
//
// A newline is appended to b before lexing. The parser starts in startState
// with ev1 as the current metric and ev2 as the next one. st is currently
// unused.
func NewPromParser(b []byte, st *labels.SymbolTable) Parser {
	lexer := &promlexer{b: append(b, '\n')}
	p := &PromParser{
		l: lexer,
		//builder: labels.NewScratchBuilderWithSymbolTable(st, 16),
	}
	p.curr, p.next = &p.ev1, &p.ev2
	p.state = p.startState
	// The exposed metric resolves its offsets against the lexer's buffer.
	p.exposedCounterMetric = promFloat{promBase: promBase{buffer: lexer.b}}
	return p
}
// Next runs the state machine until a state reports a detected metric or an
// error.
//
// On success it returns the cached promFloat pointing at the current metric;
// the returned value is only meaningful until the next call, because the
// following state may reset and recycle that metric. At the end of the input
// the error is io.EOF.
//
// Once a state has returned an error, every later call returns that same error
// again. The parameters d and keepClassicHistogramSeries are currently unused.
func (p *PromParser) Next(d DropperCache, keepClassicHistogramSeries bool) (interface{}, error) {
	for {
		state, found, err := p.state()
		if err != nil {
			// States return a nil stateFn together with an error. Storing that
			// nil would make the next call panic, so install a terminal state
			// that keeps reporting the error instead.
			p.state = func() (stateFn, bool, error) { return nil, false, err }
			return nil, err
		}
		p.state = state
		if found {
			p.exposedCounterMetric.ev = p.curr
			return p.exposedCounterMetric, nil
		}
	}
}
// startState is the state at the beginning of a line. It reads the next token
// and either consumes a metadata line (HELP/TYPE), skips a blank or comment
// line, or hands a sample line over to seriesState.
//
// Metadata is stored in p.curr as long as it refers to the metric already
// being collected. When a HELP or TYPE line names a different metric, the
// metadata is stored in p.next instead and resetState is returned so that the
// new metric becomes current; the returned bool then reports whether the
// metric collected so far is complete (see isValid).
//
// A nil stateFn is returned together with every error.
func (p *PromParser) startState() (stateFn, bool, error) {
	// Shorthand to the lexer.
	l := p.l
	// Shorthand for the current and next metric.
	curr, next := p.curr, p.next

	switch t := p.nextToken(); t {
	case tInvalid:
		return nil, false, errors.New("invalid token")
	case tEOF:
		return p.eofState, p.isValid(), nil
	case tLinebreak, tWhitespace, tComment:
		// Blank lines and plain comments carry no metric data. nextToken
		// already filters out whitespace; it is listed only for completeness.
		return p.startState, false, nil
	case tHelp:
		if t = p.nextToken(); t != tMName {
			// Next token wasn't a metric name as expected. Skip the line.
			if l.consumeComment() == tEOF {
				return p.eofState, p.isValid(), nil
			}
			return p.startState, false, nil
		}
		// Remember the position of the name for later.
		nameStart, nameEnd := l.start, l.i
		t = p.nextToken()
		if t == tEOF {
			return p.eofState, p.isValid(), nil
		}
		if t != tText {
			// We are supposed to have text here.
			return nil, false, errors.New("expected text")
		}
		// The first byte of the text token is skipped, as the previous parser
		// did with buf()[1:] (presumably the separator before the text). For
		// an empty token this yields helpStart > helpEnd, which Help reports
		// as "no help text".
		helpStart, helpEnd := l.start+1, l.i
		if curr.nameStart != curr.nameEnd {
			if !bytes.Equal(l.b[nameStart:nameEnd], l.b[curr.nameStart:curr.nameEnd]) {
				// The help text belongs to a different metric: keep both the
				// name and the help text for the metric that comes next.
				next.nameStart, next.nameEnd = nameStart, nameEnd
				next.helpStart, next.helpEnd = helpStart, helpEnd
				return p.resetState, p.isValid(), nil
			}
		} else {
			// No name recorded yet; this line names the current metric.
			curr.nameStart, curr.nameEnd = nameStart, nameEnd
		}
		curr.helpStart, curr.helpEnd = helpStart, helpEnd
		return p.startState, false, nil
	case tType:
		if t = p.nextToken(); t != tMName {
			// Next token wasn't a metric name as expected. Skip the line.
			if l.consumeComment() == tEOF {
				return p.eofState, p.isValid(), nil
			}
			return p.startState, false, nil
		}
		// Remember the position of the name for later.
		nameStart, nameEnd := l.start, l.i
		if t = p.nextToken(); t != tText {
			// We are supposed to have text here.
			return nil, false, errors.New("expected text")
		}
		// Skip the first byte of the text token as for help text, but guard
		// against an empty token, where start+1 would exceed the end offset.
		var typeText []byte
		if l.i > l.start+1 {
			typeText = l.b[l.start+1 : l.i]
		}
		var detected detectedType
		switch string(typeText) {
		case "counter":
			detected = detectedCounter
		case "gauge":
			detected = detectedGauge
		case "histogram":
			detected = detectedHistogram
		case "summary":
			detected = detectedSummary
		case "untyped":
			detected = detectedUntyped
		default:
			// We don't know this type. Skip.
			return p.startState, false, nil
		}
		// Only decide where the type goes once we know which metric it is for;
		// otherwise the type of the current metric would be overwritten by a
		// TYPE line that belongs to the following one.
		if curr.nameStart != curr.nameEnd {
			if !bytes.Equal(l.b[nameStart:nameEnd], l.b[curr.nameStart:curr.nameEnd]) {
				// The type belongs to a different metric: keep both the name
				// and the type for the metric that comes next.
				next.nameStart, next.nameEnd = nameStart, nameEnd
				next.detectedType = detected
				return p.resetState, p.isValid(), nil
			}
		} else {
			// No name recorded yet; this line names the current metric.
			curr.nameStart, curr.nameEnd = nameStart, nameEnd
		}
		curr.detectedType = detected
		return p.startState, false, nil
	case tMName:
		// A sample line; the lexer is positioned on the metric name.
		return p.seriesState()
	}
	return nil, false, errors.New("unhandled state")
}
// isValid reports whether the current metric is complete enough to be
// returned from Next. Only gauges are supported so far: a gauge is valid once
// its value has been parsed; every other type is reported as not valid.
func (p *PromParser) isValid() bool {
	m := p.curr
	if m.detectedType != detectedGauge {
		return false
	}
	return m.hasSumValue
}
// eofState is the state entered once the input is exhausted. It yields no
// further state and no metric, only io.EOF.
func (p *PromParser) eofState() (stateFn, bool, error) {
	var none stateFn // nil: nothing follows the end of the input
	return none, false, io.EOF
}
// resetState retires the current metric and makes the next one current.
//
// The values of the current metric are no longer used, so its flags, type,
// name and help text are cleared, and then the curr and next pointers are
// swapped. Parsing continues in startState; no metric is reported and no
// error is returned.
func (p *PromParser) resetState() (stateFn, bool, error) {
	done := p.curr
	// Clear every "has value" flag, including the timestamp flag, so that the
	// struct carries no stale data when it is reused for a later metric.
	done.hasTimestamp = false
	done.hasSumValue = false
	done.hasCountValue = false
	done.detectedType = detectedUntyped
	// Equal start and end offsets mark the name and the help text as unset.
	done.nameStart = done.nameEnd
	done.helpStart = done.helpEnd
	p.curr, p.next = p.next, done
	return p.startState, false, nil
}
// seriesState parses a sample line. It is entered with the lexer positioned on
// the metric name token.
//
// This is the most complicated bit. We have to prepare for the happy case where we continue
// a metric family and also for the case where we start a new metric family.
//
// Only gauges are handled so far; any other detected type results in an
// error. If the sample name differs from the current metric name, the name is
// stored in p.next and retrySeries is returned, together with whether the
// current metric is complete. Otherwise the value and the optional timestamp
// are parsed into p.curr and the metric is reported as found.
func (p *PromParser) seriesState() (stateFn, bool, error) {
	var err error
	// Shorthand to the lexer.
	l := p.l
	// Shorthand for the current and next metric.
	curr, next := p.curr, p.next
	// Remember the position of the name for later.
	nameStart, nameEnd := l.start, l.i

	if curr.detectedType != detectedGauge {
		return nil, false, errors.New("unhandled metric type")
	}
	// The name should match the current metric name.
	if !bytes.Equal(l.b[nameStart:nameEnd], l.b[curr.nameStart:curr.nameEnd]) {
		// The sample belongs to a different metric. The lexer is not advanced
		// here, so retrySeries can re-enter this state on the same name token
		// after the metrics have been swapped.
		next.nameStart, next.nameEnd = nameStart, nameEnd
		return p.retrySeries, p.isValid(), nil
	}
	// We have a gauge, try to parse the value.
	if t := p.nextToken(); t != tValue {
		// We are supposed to have a value here.
		return nil, false, errors.New("expected value")
	}
	curr.sumValue, err = parseFloat(yoloString(l.buf()))
	if err != nil {
		return nil, false, fmt.Errorf("%w while parsing: %q", err, l.buf())
	}
	curr.hasSumValue = true
	// The timestamp is optional: forget any timestamp left over from an
	// earlier sample that was stored in this struct.
	curr.hasTimestamp = false
	switch p.nextToken() {
	case tTimestamp:
		curr.timestamp, err = strconv.ParseInt(yoloString(l.buf()), 10, 64)
		if err != nil {
			return nil, false, fmt.Errorf("%w while parsing: %q", err, l.buf())
		}
		curr.hasTimestamp = true
		// Done with this line.
		return p.resetState, true, nil
	case tLinebreak:
		// Done with this line.
		return p.resetState, true, nil
	default:
		// Consume the rest of the line.
		if l.consumeComment() == tEOF {
			return p.eofState, true, nil
		}
		return p.resetState, true, nil
	}
}
// retrySeries swaps the current and next metric like resetState does, but
// continues with seriesState instead of startState, so that the sample line
// that triggered the swap is parsed again for the new current metric.
func (p *PromParser) retrySeries() (stateFn, bool, error) {
	// resetState always returns (startState, false, nil); only its side effect
	// of resetting and swapping the metrics is wanted here.
	p.resetState()
	return p.seriesState, false, nil
}
// nextToken returns the next token from the promlexer that is not
// tWhitespace, so callers never see whitespace tokens.
func (p *PromParser) nextToken() token {
	tok := p.l.Lex()
	for tok == tWhitespace {
		tok = p.l.Lex()
	}
	return tok
}
/*
// Series returns the bytes of the series, the timestamp if set, and the value
// of the current sample.
func (p *PromParser) Series() ([]byte, *int64, float64) {
if p.hasTS {
return p.series, &p.ts, p.val
}
return p.series, nil, p.val
}
// Histogram returns (nil, nil, nil, nil) for now because the Prometheus text
// format does not support sparse histograms yet.
func (p *PromParser) Histogram() ([]byte, *int64, *histogram.Histogram, *histogram.FloatHistogram) {
return nil, nil, nil, nil
}
// Help returns the metric name and help text in the current entry.
// Must only be called after Next returned a help entry.
// The returned byte slices become invalid after the next call to Next.
func (p *PromParser) Help() ([]byte, []byte) {
m := p.l.b[p.offsets[0]:p.offsets[1]]
// Replacer causes allocations. Replace only when necessary.
if strings.IndexByte(yoloString(p.text), byte('\\')) >= 0 {
return m, []byte(helpReplacer.Replace(string(p.text)))
}
return m, p.text
}
// Type returns the metric name and type in the current entry.
// Must only be called after Next returned a type entry.
// The returned byte slices become invalid after the next call to Next.
func (p *PromParser) Type() ([]byte, model.MetricType) {
return p.l.b[p.offsets[0]:p.offsets[1]], p.mtype
}
// Unit returns the metric name and unit in the current entry.
// Must only be called after Next returned a unit entry.
// The returned byte slices become invalid after the next call to Next.
func (p *PromParser) Unit() ([]byte, []byte) {
// The Prometheus format does not have units.
return nil, nil
}
// Comment returns the text of the current comment.
// Must only be called after Next returned a comment entry.
// The returned byte slice becomes invalid after the next call to Next.
func (p *PromParser) Comment() []byte {
return p.text
}
// Metric writes the labels of the current sample into the passed labels.
// It returns the string from which the metric was parsed.
func (p *PromParser) Metric(l *labels.Labels) string {
// Copy the buffer to a string: this is only necessary for the return value.
s := string(p.series)
p.builder.Reset()
metricName := unreplace(s[p.offsets[0]-p.start : p.offsets[1]-p.start])
p.builder.Add(labels.MetricName, metricName)
for i := 2; i < len(p.offsets); i += 4 {
a := p.offsets[i] - p.start
b := p.offsets[i+1] - p.start
label := unreplace(s[a:b])
c := p.offsets[i+2] - p.start
d := p.offsets[i+3] - p.start
value := unreplace(s[c:d])
p.builder.Add(label, value)
}
p.builder.Sort()
*l = p.builder.Labels()
return s
}
// Exemplar implements the Parser interface. However, since the classic
// Prometheus text format does not support exemplars, this implementation simply
// returns false and does nothing else.
func (p *PromParser) Exemplar(*exemplar.Exemplar) bool {
return false
}
// CreatedTimestamp returns nil as it's not implemented yet.
// TODO(bwplotka): https://github.com/prometheus/prometheus/issues/12980
func (p *PromParser) CreatedTimestamp() *int64 {
return nil
}
func (p *PromParser) parseError(exp string, got token) error {
e := p.l.i + 1
if len(p.l.b) < e {
e = len(p.l.b)
}
return fmt.Errorf("%s, got %q (%q) while parsing: %q", exp, p.l.b[p.l.start:e], got, p.l.b[p.start:e])
}
// Next advances the parser to the next sample.
// It returns (EntryInvalid, io.EOF) if no samples were read.
func (p *PromParser) Next() (Entry, error) {
var err error
p.start = p.l.i
p.offsets = p.offsets[:0]
switch t := p.nextToken(); t {
case tEOF:
return EntryInvalid, io.EOF
case tLinebreak:
// Allow full blank lines.
return p.Next()
case tHelp, tType:
switch t2 := p.nextToken(); t2 {
case tMName:
mStart := p.l.start
mEnd := p.l.i
if p.l.b[mStart] == '"' && p.l.b[mEnd-1] == '"' {
mStart++
mEnd--
}
p.offsets = append(p.offsets, mStart, mEnd)
default:
return EntryInvalid, p.parseError("expected metric name after "+t.String(), t2)
}
switch t2 := p.nextToken(); t2 {
case tText:
if len(p.l.buf()) > 1 {
p.text = p.l.buf()[1:]
} else {
p.text = []byte{}
}
default:
return EntryInvalid, fmt.Errorf("expected text in %s, got %v", t.String(), t2.String())
}
switch t {
case tType:
switch s := yoloString(p.text); s {
case "counter":
p.mtype = model.MetricTypeCounter
case "gauge":
p.mtype = model.MetricTypeGauge
case "histogram":
p.mtype = model.MetricTypeHistogram
case "summary":
p.mtype = model.MetricTypeSummary
case "untyped":
p.mtype = model.MetricTypeUnknown
default:
return EntryInvalid, fmt.Errorf("invalid metric type %q", s)
}
case tHelp:
if !utf8.Valid(p.text) {
return EntryInvalid, fmt.Errorf("help text %q is not a valid utf8 string", p.text)
}
}
if t := p.nextToken(); t != tLinebreak {
return EntryInvalid, p.parseError("linebreak expected after metadata", t)
}
switch t {
case tHelp:
return EntryHelp, nil
case tType:
return EntryType, nil
}
case tComment:
p.text = p.l.buf()
if t := p.nextToken(); t != tLinebreak {
return EntryInvalid, p.parseError("linebreak expected after comment", t)
}
return EntryComment, nil
case tBraceOpen:
// We found a brace, so make room for the eventual metric name. If these
// values aren't updated, then the metric name was not set inside the
// braces and we can return an error.
if len(p.offsets) == 0 {
p.offsets = []int{-1, -1}
}
if err := p.parseLVals(); err != nil {
return EntryInvalid, err
}
p.series = p.l.b[p.start:p.l.i]
return p.parseMetricSuffix(p.nextToken())
case tMName:
p.offsets = append(p.offsets, p.start, p.l.i)
p.series = p.l.b[p.start:p.l.i]
t2 := p.nextToken()
// If there's a brace, consume and parse the label values.
if t2 == tBraceOpen {
if err := p.parseLVals(); err != nil {
return EntryInvalid, err
}
p.series = p.l.b[p.start:p.l.i]
t2 = p.nextToken()
}
return p.parseMetricSuffix(t2)
default:
err = p.parseError("expected a valid start token", t)
}
return EntryInvalid, err
}
// parseLVals parses the contents inside the braces.
func (p *PromParser) parseLVals() error {
t := p.nextToken()
for {
curTStart := p.l.start
curTI := p.l.i
switch t {
case tBraceClose:
return nil
case tLName:
case tQString:
default:
return p.parseError("expected label name", t)
}
t = p.nextToken()
// A quoted string followed by a comma or brace is a metric name. Set the
// offsets and continue processing.
if t == tComma || t == tBraceClose {
if p.offsets[0] != -1 || p.offsets[1] != -1 {
return fmt.Errorf("metric name already set while parsing: %q", p.l.b[p.start:p.l.i])
}
p.offsets[0] = curTStart + 1
p.offsets[1] = curTI - 1
if t == tBraceClose {
return nil
}
t = p.nextToken()
continue
}
// We have a label name, and it might be quoted.
if p.l.b[curTStart] == '"' {
curTStart++
curTI--
}
p.offsets = append(p.offsets, curTStart, curTI)
if t != tEqual {
return p.parseError("expected equal", t)
}
if t := p.nextToken(); t != tLValue {
return p.parseError("expected label value", t)
}
if !utf8.Valid(p.l.buf()) {
return fmt.Errorf("invalid UTF-8 label value: %q", p.l.buf())
}
// The promlexer ensures the value string is quoted. Strip first
// and last character.
p.offsets = append(p.offsets, p.l.start+1, p.l.i-1)
// Free trailing commas are allowed. NOTE: this allows spaces between label
// names, unlike in OpenMetrics. It is not clear if this is intended or an
// accidental bug.
if t = p.nextToken(); t == tComma {
t = p.nextToken()
}
}
}
// parseMetricSuffix parses the end of the line after the metric name and
// labels. It starts parsing with the provided token.
func (p *PromParser) parseMetricSuffix(t token) (Entry, error) {
if p.offsets[0] == -1 {
return EntryInvalid, fmt.Errorf("metric name not set while parsing: %q", p.l.b[p.start:p.l.i])
}
if t != tValue {
return EntryInvalid, p.parseError("expected value after metric", t)
}
var err error
if p.val, err = parseFloat(yoloString(p.l.buf())); err != nil {
return EntryInvalid, fmt.Errorf("%w while parsing: %q", err, p.l.b[p.start:p.l.i])
}
// Ensure canonical NaN value.
if math.IsNaN(p.val) {
p.val = math.Float64frombits(value.NormalNaN)
}
p.hasTS = false
switch t := p.nextToken(); t {
case tLinebreak:
break
case tTimestamp:
p.hasTS = true
if p.ts, err = strconv.ParseInt(yoloString(p.l.buf()), 10, 64); err != nil {
return EntryInvalid, fmt.Errorf("%w while parsing: %q", err, p.l.b[p.start:p.l.i])
}
if t2 := p.nextToken(); t2 != tLinebreak {
return EntryInvalid, p.parseError("expected next entry after timestamp", t2)
}
default:
return EntryInvalid, p.parseError("expected timestamp or new record", t)
}
return EntrySeries, nil
}
var lvalReplacer = strings.NewReplacer(
`\"`, "\"",
`\\`, "\\",
`\n`, "\n",
)
var helpReplacer = strings.NewReplacer(
`\\`, "\\",
`\n`, "\n",
)
func unreplace(s string) string {
// Replacer causes allocations. Replace only when necessary.
if strings.IndexByte(s, byte('\\')) >= 0 {
return lvalReplacer.Replace(s)
}
return s
}
*/
// parseFloat parses s as a 64-bit float. Inputs containing 'p', 'P' or '_' are
// rejected up front, which excludes the hexadecimal-exponent and
// digit-separator syntaxes that strconv.ParseFloat accepts since Go 1.13; all
// other inputs are handed to strconv.ParseFloat unchanged.
func parseFloat(s string) (float64, error) {
	// Keep to pre-Go 1.13 float formats.
	if strings.IndexAny(s, "pP_") >= 0 {
		return 0, fmt.Errorf("unsupported character in float")
	}
	v, err := strconv.ParseFloat(s, 64)
	return v, err
}
// yoloString returns a string that shares its memory with b instead of
// copying it. The caller must not modify b for as long as the string is in
// use, because the string would change along with it.
func yoloString(b []byte) string {
	n := len(b)
	if n == 0 {
		return ""
	}
	return unsafe.String(&b[0], n)
}