diff --git a/promql/engine.go b/promql/engine.go
index 1e01f21db6..3ea663e2f9 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -2998,7 +2998,6 @@ type groupedAggregation struct {
 	hasHistogram           bool // Has at least 1 histogram sample aggregated.
 	incompatibleHistograms bool // If true, group has seen mixed exponential and custom buckets, or incompatible custom buckets.
 	groupAggrComplete      bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group.
-	incrementalMean        bool // True after reverting to incremental calculation of the mean value.
 }

 // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix.
@@ -3097,6 +3096,21 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 		}

 	case parser.AVG:
+		// For the average calculation, we use incremental mean
+		// calculation. In particular in combination with Kahan
+		// summation (which we do for floats, but not yet for
+		// histograms, see issue #14105), this is quite accurate
+		// and only breaks in extreme cases (see testdata for
+		// avg_over_time). One might assume that simple direct
+		// mean calculation works better in some cases, but so
+		// far, our conclusion is that we fare best with the
+		// incremental approach plus Kahan summation (for
+		// floats). For a relevant discussion, see
+		// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+		// Additional note: For even better numerical accuracy,
+		// we would need to process the values in a particular
+		// order, but that would be very hard to implement given
+		// how the PromQL engine works.
 		group.groupCount++
 		if h != nil {
 			group.hasHistogram = true
@@ -3121,45 +3135,11 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			// point in copying the histogram in that case.
 		} else {
 			group.hasFloat = true
-			if !group.incrementalMean {
-				newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC)
-				if !math.IsInf(newV, 0) {
-					// The sum doesn't overflow, so we propagate it to the
-					// group struct and continue with the regular
-					// calculation of the mean value.
-					group.floatValue, group.floatKahanC = newV, newC
-					break
-				}
-				// If we are here, we know that the sum _would_ overflow. So
-				// instead of continue to sum up, we revert to incremental
-				// calculation of the mean value from here on.
-				group.incrementalMean = true
-				group.floatMean = group.floatValue / (group.groupCount - 1)
-				group.floatKahanC /= group.groupCount - 1
-			}
-			if math.IsInf(group.floatMean, 0) {
-				if math.IsInf(f, 0) && (group.floatMean > 0) == (f > 0) {
-					// The `floatMean` and `s.F` values are `Inf` of the same sign. They
-					// can't be subtracted, but the value of `floatMean` is correct
-					// already.
-					break
-				}
-				if !math.IsInf(f, 0) && !math.IsNaN(f) {
-					// At this stage, the mean is an infinite. If the added
-					// value is neither an Inf or a Nan, we can keep that mean
-					// value.
-					// This is required because our calculation below removes
-					// the mean value, which would look like Inf += x - Inf and
-					// end up as a NaN.
-					break
-				}
-			}
-			currentMean := group.floatMean + group.floatKahanC
+			q := (group.groupCount - 1) / group.groupCount
 			group.floatMean, group.floatKahanC = kahanSumInc(
-				// Divide each side of the `-` by `group.groupCount` to avoid float64 overflows.
-				f/group.groupCount-currentMean/group.groupCount,
-				group.floatMean,
-				group.floatKahanC,
+				f/group.groupCount,
+				q*group.floatMean,
+				q*group.floatKahanC,
 			)
 		}

@@ -3232,10 +3212,8 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix
 			continue
 		case aggr.hasHistogram:
 			aggr.histogramValue = aggr.histogramValue.Compact(0)
-		case aggr.incrementalMean:
-			aggr.floatValue = aggr.floatMean + aggr.floatKahanC
 		default:
-			aggr.floatValue = (aggr.floatValue + aggr.floatKahanC) / aggr.groupCount
+			aggr.floatValue = aggr.floatMean + aggr.floatKahanC
 		}

 	case parser.COUNT:
diff --git a/promql/functions.go b/promql/functions.go
index e4699f3914..7e5205844a 100644
--- a/promql/functions.go
+++ b/promql/functions.go
@@ -671,6 +671,20 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		metricName := firstSeries.Metric.Get(labels.MetricName)
 		return enh.Out, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, args[0].PositionRange()))
 	}
+	// For the average calculation, we use incremental mean calculation. In
+	// particular in combination with Kahan summation (which we do for
+	// floats, but not yet for histograms, see issue #14105), this is quite
+	// accurate and only breaks in extreme cases (see testdata). One might
+	// assume that simple direct mean calculation works better in some
+	// cases, but so far, our conclusion is that we fare best with the
+	// incremental approach plus Kahan summation (for floats). For a
+	// relevant discussion, see
+	// https://stackoverflow.com/questions/61665473/is-it-beneficial-for-precision-to-calculate-the-incremental-mean-average
+	// Additional note: For even better numerical accuracy, we would need to
+	// process the values in a particular order. For avg_over_time, that
+	// would be more or less feasible, but it would be more expensive, and
+	// it would also be much harder for the avg aggregator, given how the
+	// PromQL engine works.
 	if len(firstSeries.Floats) == 0 {
 		// The passed values only contain histograms.
 		vec, err := aggrHistOverTime(vals, enh, func(s Series) (*histogram.FloatHistogram, error) {
@@ -702,52 +716,13 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode
 		return vec, nil
 	}
 	return aggrOverTime(vals, enh, func(s Series) float64 {
-		var (
-			sum, mean, count, kahanC float64
-			incrementalMean          bool
-		)
-		for _, f := range s.Floats {
-			count++
-			if !incrementalMean {
-				newSum, newC := kahanSumInc(f.F, sum, kahanC)
-				// Perform regular mean calculation as long as
-				// the sum doesn't overflow and (in any case)
-				// for the first iteration (even if we start
-				// with ±Inf) to not run into division-by-zero
-				// problems below.
-				if count == 1 || !math.IsInf(newSum, 0) {
-					sum, kahanC = newSum, newC
-					continue
-				}
-				// Handle overflow by reverting to incremental calculation of the mean value.
-				incrementalMean = true
-				mean = sum / (count - 1)
-				kahanC /= count - 1
-			}
-			if math.IsInf(mean, 0) {
-				if math.IsInf(f.F, 0) && (mean > 0) == (f.F > 0) {
-					// The `mean` and `f.F` values are `Inf` of the same sign. They
-					// can't be subtracted, but the value of `mean` is correct
-					// already.
-					continue
-				}
-				if !math.IsInf(f.F, 0) && !math.IsNaN(f.F) {
-					// At this stage, the mean is an infinite. If the added
-					// value is neither an Inf or a Nan, we can keep that mean
-					// value.
-					// This is required because our calculation below removes
-					// the mean value, which would look like Inf += x - Inf and
-					// end up as a NaN.
-					continue
-				}
-			}
-			correctedMean := mean + kahanC
-			mean, kahanC = kahanSumInc(f.F/count-correctedMean/count, mean, kahanC)
+		var mean, kahanC float64
+		for i, f := range s.Floats {
+			count := float64(i + 1)
+			q := float64(i) / count
+			mean, kahanC = kahanSumInc(f.F/count, q*mean, q*kahanC)
 		}
-		if incrementalMean {
-			return mean + kahanC
-		}
-		return (sum + kahanC) / count
+		return mean + kahanC
 	}), nil
 }

diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test
index 83c49320ed..beb216b93c 100644
--- a/promql/promqltest/testdata/functions.test
+++ b/promql/promqltest/testdata/functions.test
@@ -1013,6 +1013,79 @@ eval instant at 1m sum_over_time(metric[2m])
 eval instant at 1m avg_over_time(metric[2m])
   {} 0.5

+# More tests for extreme values.
+clear
+# All positive values with varying magnitudes.
+load 5s
+  metric1 1e10 1e-6 1e-6 1e-6 1e-6 1e-6
+  metric2 5.30921651659898 0.961118537914768 1.62091361305318 0.865089463758091 0.323055185914577 0.951811357687154
+  metric3 1.78264e50 0.76342771 1.9592258 7.69805412 458.90154
+  metric4 0.76342771 1.9592258 7.69805412 1.78264e50 458.90154
+  metric5 1.78264E+50 0.76342771 1.9592258 2.258E+220 7.69805412 458.90154
+
+eval instant at 55s avg_over_time(metric1[1m])
+  {} 1.6666666666666675e+09
+
+eval instant at 55s avg_over_time(metric2[1m])
+  {} 1.67186744582113
+
+eval instant at 55s avg_over_time(metric3[1m])
+  {} 3.56528E+49
+
+eval instant at 55s avg_over_time(metric4[1m])
+  {} 3.56528E+49
+
+eval instant at 55s avg_over_time(metric5[1m])
+  {} 3.76333333333333E+219
+
+# Contains negative values; result is dominated by a very large value.
+load 5s
+  metric6 -1.78264E+50 0.76342771 1.9592258 2.258E+220 7.69805412 -458.90154
+  metric7 -1.78264E+50 0.76342771 1.9592258 -2.258E+220 7.69805412 -458.90154
+  metric8 -1.78264E+215 0.76342771 1.9592258 2.258E+220 7.69805412 -458.90154
+  metric9 -1.78264E+215 0.76342771 1.9592258 2.258E+220 7.69805412 1.78264E+215 -458.90154
+  metric10 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
+
+eval instant at 55s avg_over_time(metric6[1m])
+  {} 3.76333333333333E+219
+
+eval instant at 55s avg_over_time(metric7[1m])
+  {} -3.76333333333333E+219
+
+eval instant at 55s avg_over_time(metric8[1m])
+  {} 3.76330362266667E+219
+
+eval instant at 55s avg_over_time(metric9[1m])
+  {} 3.225714285714286e+219
+
+# Interestingly, before PR #16569, this test failed with a result of
+# 3.225714285714286e+219, so the incremental calculation combined with
+# Kahan summation (in the improved way as introduced by PR #16569)
+# seems to be more accurate than the simple mean calculation with
+# Kahan summation.
+eval instant at 55s avg_over_time(metric10[1m])
+  {} 2.5088888888888888e+219
+
+# Large values of different magnitude, combined with small values. The
+# large values, however, all cancel each other exactly, so that the actual
+# average here is determined by only the small values. Therefore, the correct
+# outcome is -44.848083237000004.
+load 5s
+  metric11 -2.258E+220 -1.78264E+219 0.76342771 1.9592258 2.3757689E+217 -2.3757689E+217 2.258E+220 7.69805412 1.78264E+219 -458.90154
+
+# However, the result here is very far off! With the different orders of
+# magnitude involved, even Kahan summation cannot cope.
+# Interestingly, with the simple mean calculation (still including
+# Kahan summation) prior to PR #16569, the result is 0. That's
+# arguably more accurate, but it still shows that Kahan summation
+# doesn't work well in this situation. To solve this properly, we
+# would need to do something like sorting the values (which is hard
+# given how the PromQL engine works). The question is how practically
+# relevant this scenario is.
+eval instant at 55s avg_over_time(metric11[1m])
+  {} -1.881783551706252e+203
+# {} -44.848083237000004 <- This is the correct value.
+
 # Test per-series aggregation on dense samples.
 clear
 load 1ms
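
The core of the change, in both engine.go and functions.go, is the same identity: after i+1 samples, mean_new = mean_old*(i/(i+1)) + f/(i+1), with the Kahan compensation term rescaled alongside the mean. Below is a minimal, self-contained Go sketch of that technique. It is illustrative only: avg is a stand-in name, and kahanSumInc is written here as a generic Neumaier-style compensated add rather than a copy of the engine's helper, so the low-order bits of the output may differ from the testdata values.

package main

import (
	"fmt"
	"math"
)

// kahanSumInc adds inc to sum while carrying a compensation term c
// (Neumaier-style compensated summation). It plays the role of the
// engine's kahanSumInc helper; the upstream implementation may differ
// in details.
func kahanSumInc(inc, sum, c float64) (newSum, newC float64) {
	t := sum + inc
	switch {
	case math.IsInf(t, 0):
		c = 0 // Once the sum overflows, a finite compensation is meaningless.
	case math.Abs(sum) >= math.Abs(inc):
		c += (sum - t) + inc
	default:
		c += (inc - t) + sum
	}
	return t, c
}

// avg mirrors the rewritten loop in funcAvgOverTime: the running value
// always stays near the magnitude of the mean, so it cannot overflow
// for finite inputs even where the plain sum would.
func avg(values []float64) float64 {
	var mean, kahanC float64
	for i, f := range values {
		count := float64(i + 1)
		q := float64(i) / count
		mean, kahanC = kahanSumInc(f/count, q*mean, q*kahanC)
	}
	return mean + kahanC
}

func main() {
	// metric4 from the new testdata: one huge value among small ones.
	// Expected per the testdata: 3.56528E+49.
	fmt.Println(avg([]float64{0.76342771, 1.9592258, 7.69805412, 1.78264e50, 458.90154}))

	// metric11: huge values that cancel exactly, so the true mean is
	// -44.848083237000004. With these spreads of magnitude the result
	// is far off (the testdata records -1.881783551706252e+203).
	fmt.Println(avg([]float64{
		-2.258e220, -1.78264e219, 0.76342771, 1.9592258,
		2.3757689e217, -2.3757689e217, 2.258e220, 7.69805412,
		1.78264e219, -458.90154,
	}))
}

Note how q*mean and q*kahanC rescale the compensation term together with the mean at every step; dividing f by count before adding keeps each operand near the mean's magnitude, which avoids the float64 overflow that the deleted incrementalMean fallback used to guard against.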