Compare commits

...

2 Commits

Author SHA1 Message Date
Ashwin Bhatkal
e0cc7300e6 fix: guard widgets against undefined in handleRowCollapse 2026-03-24 20:11:47 +05:30
Pandey
234716df53 fix(querier): return proper HTTP status for PromQL timeout errors (#10689)
Some checks are pending
build-staging / prepare (push) Waiting to run
build-staging / js-build (push) Blocked by required conditions
build-staging / go-build (push) Blocked by required conditions
build-staging / staging (push) Blocked by required conditions
Release Drafter / update_release_draft (push) Waiting to run
* fix(querier): return proper HTTP status for PromQL timeout errors

PromQL queries hitting the context deadline were incorrectly returning
400 Bad Request with "invalid_input" because enhancePromQLError
unconditionally wrapped all errors as TypeInvalidInput. Extract
tryEnhancePromQLExecError to properly classify timeout, cancellation,
and storage errors before falling through to parse error handling.

Also make the PromQL engine timeout configurable via prometheus.timeout
config (default 2m) instead of hardcoding it.

* chore: refactor files

* fix(prometheus): validate timeout config and fix test setups

Add validation in prometheus.Config to reject zero timeout. Update all
test files to explicitly set Timeout: 2 * time.Minute in prometheus.Config
literals to avoid immediate query timeouts.
2026-03-24 13:32:45 +00:00
15 changed files with 73 additions and 50 deletions

View File

@@ -144,6 +144,8 @@ telemetrystore:
##################### Prometheus #####################
prometheus:
# The maximum time a PromQL query is allowed to run before being aborted.
timeout: 2m
active_query_tracker:
# Whether to enable the active query tracker.
enabled: true

View File

@@ -257,7 +257,7 @@ func TestManager_TestNotification_SendUnmatched_PromRule(t *testing.T) {
WillReturnRows(samplesRows)
// Create Prometheus provider for this test
promProvider = prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, store)
promProvider = prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, store)
},
ManagerOptionsHook: func(opts *rules.ManagerOptions) {
// Set Prometheus provider for PromQL queries

View File

@@ -337,9 +337,9 @@ function GraphLayout(props: GraphLayoutProps): JSX.Element {
for (let j = idxCurrentRow + 1; j < dashboardLayout.length; j++) {
updatedDashboardLayout[j].y += maxY;
if (updatedPanelMap[updatedDashboardLayout[j].i]) {
updatedPanelMap[updatedDashboardLayout[j].i].widgets = updatedPanelMap[
updatedDashboardLayout[j].i
].widgets.map((w) => ({
updatedPanelMap[updatedDashboardLayout[j].i].widgets = (
updatedPanelMap[updatedDashboardLayout[j].i].widgets ?? []
).map((w) => ({
...w,
y: w.y + maxY,
}));
@@ -376,9 +376,9 @@ function GraphLayout(props: GraphLayoutProps): JSX.Element {
for (let j = currentIdx + 1; j < updatedDashboardLayout.length; j++) {
updatedDashboardLayout[j].y += maxY;
if (updatedPanelMap[updatedDashboardLayout[j].i]) {
updatedPanelMap[updatedDashboardLayout[j].i].widgets = updatedPanelMap[
updatedDashboardLayout[j].i
].widgets.map((w) => ({
updatedPanelMap[updatedDashboardLayout[j].i].widgets = (
updatedPanelMap[updatedDashboardLayout[j].i].widgets ?? []
).map((w) => ({
...w,
y: w.y + maxY,
}));

View File

@@ -3,6 +3,7 @@ package prometheus
import (
"time"
"github.com/SigNoz/signoz/pkg/errors"
"github.com/SigNoz/signoz/pkg/factory"
)
@@ -20,6 +21,9 @@ type Config struct {
//
// If not set, the prometheus default is used (currently 5m).
LookbackDelta time.Duration `mapstructure:"lookback_delta"`
// Timeout is the maximum time a query is allowed to run before being aborted.
Timeout time.Duration `mapstructure:"timeout"`
}
func NewConfigFactory() factory.ConfigFactory {
@@ -33,10 +37,14 @@ func newConfig() factory.Config {
Path: "",
MaxConcurrent: 20,
},
Timeout: 2 * time.Minute,
}
}
func (c Config) Validate() error {
if c.Timeout <= 0 {
return errors.Newf(errors.TypeInvalidInput, errors.CodeInvalidInput, "prometheus::timeout must be greater than 0")
}
return nil
}

View File

@@ -1,3 +0,0 @@
package prometheus
const FingerprintAsPromLabelName = "fingerprint"

View File

@@ -2,7 +2,6 @@ package prometheus
import (
"log/slog"
"time"
"github.com/prometheus/prometheus/promql"
)
@@ -21,7 +20,7 @@ func NewEngine(logger *slog.Logger, cfg Config) *Engine {
Logger: logger,
Reg: nil,
MaxSamples: 5_0000_000,
Timeout: 2 * time.Minute,
Timeout: cfg.Timeout,
ActiveQueryTracker: activeQueryTracker,
LookbackDelta: cfg.LookbackDelta,
})

View File

@@ -6,6 +6,8 @@ import (
"github.com/prometheus/prometheus/promql"
)
const FingerprintAsPromLabelName string = "fingerprint"
func RemoveExtraLabels(res *promql.Result, labelsToRemove ...string) error {
if len(labelsToRemove) == 0 || res == nil {
return nil

View File

@@ -36,6 +36,28 @@ var unquotedDottedNamePattern = regexp.MustCompile(`(?:^|[{,(\s])([a-zA-Z_][a-zA
// This is a common mistake when migrating to UTF-8 syntax.
var quotedMetricOutsideBracesPattern = regexp.MustCompile(`"([^"]+)"\s*\{`)
// tryEnhancePromQLExecError attempts to convert a PromQL execution error into
// a properly typed error. Returns nil if the error is not a recognized execution error.
func tryEnhancePromQLExecError(execErr error) error {
var eqc promql.ErrQueryCanceled
var eqt promql.ErrQueryTimeout
var es promql.ErrStorage
switch {
case errors.As(execErr, &eqc):
return errors.Newf(errors.TypeCanceled, errors.CodeCanceled, "query canceled").WithAdditional(eqc.Error())
case errors.As(execErr, &eqt):
return errors.Newf(errors.TypeTimeout, errors.CodeTimeout, "query timed out").WithAdditional(eqt.Error())
case errors.Is(execErr, context.DeadlineExceeded):
return errors.Newf(errors.TypeTimeout, errors.CodeTimeout, "query timed out")
case errors.Is(execErr, context.Canceled):
return errors.Newf(errors.TypeCanceled, errors.CodeCanceled, "query canceled")
case errors.As(execErr, &es):
return errors.Newf(errors.TypeInternal, errors.CodeInternal, "query execution error: %v", execErr)
default:
return nil
}
}
// enhancePromQLError adds helpful context to PromQL parse errors,
// particularly for UTF-8 syntax migration issues where metric and label
// names containing dots need to be quoted.
@@ -213,27 +235,20 @@ func (q *promqlQuery) Execute(ctx context.Context) (*qbv5.Result, error) {
time.Unix(0, end),
q.query.Step.Duration,
)
if err != nil {
// NewRangeQuery can fail with execution errors (e.g. context deadline exceeded)
// during the query queue/scheduling stage, not just parse errors.
if err := tryEnhancePromQLExecError(err); err != nil {
return nil, err
}
return nil, enhancePromQLError(query, err)
}
res := qry.Exec(ctx)
if res.Err != nil {
var eqc promql.ErrQueryCanceled
var eqt promql.ErrQueryTimeout
var es promql.ErrStorage
switch {
case errors.As(res.Err, &eqc):
return nil, errors.Newf(errors.TypeCanceled, errors.CodeCanceled, "query canceled")
case errors.As(res.Err, &eqt):
return nil, errors.Newf(errors.TypeTimeout, errors.CodeTimeout, "query timeout")
case errors.As(res.Err, &es):
return nil, errors.Newf(errors.TypeInternal, errors.CodeInternal, "query execution error: %v", res.Err)
}
if errors.Is(res.Err, context.Canceled) {
return nil, errors.Newf(errors.TypeCanceled, errors.CodeCanceled, "query canceled")
if err := tryEnhancePromQLExecError(res.Err); err != nil {
return nil, err
}
return nil, errors.Newf(errors.TypeInternal, errors.CodeInternal, "query execution error: %v", res.Err)

View File

@@ -1407,7 +1407,7 @@ func Test_querier_Traces_runWindowBasedListQueryDesc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -1633,7 +1633,7 @@ func Test_querier_Traces_runWindowBasedListQueryAsc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -1934,7 +1934,7 @@ func Test_querier_Logs_runWindowBasedListQueryDesc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -2162,7 +2162,7 @@ func Test_querier_Logs_runWindowBasedListQueryAsc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,

View File

@@ -1459,7 +1459,7 @@ func Test_querier_Traces_runWindowBasedListQueryDesc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -1685,7 +1685,7 @@ func Test_querier_Traces_runWindowBasedListQueryAsc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -1985,7 +1985,7 @@ func Test_querier_Logs_runWindowBasedListQueryDesc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -2213,7 +2213,7 @@ func Test_querier_Logs_runWindowBasedListQueryAsc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,

View File

@@ -698,7 +698,7 @@ func TestBaseRule_FilterNewSeries(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), settings, prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), settings, prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Second,
nil,

View File

@@ -253,7 +253,7 @@ func TestManager_TestNotification_SendUnmatched_PromRule(t *testing.T) {
WillReturnRows(samplesRows)
// Create Prometheus provider for this test
promProvider = prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, store)
promProvider = prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, store)
},
ManagerOptionsHook: func(opts *ManagerOptions) {
// Set Prometheus provider for PromQL queries

View File

@@ -99,7 +99,7 @@ func NewTestManager(t *testing.T, testOpts *TestManagerOptions) *Manager {
options := clickhouseReader.NewOptions("", "", "archiveNamespace")
providerSettings := instrumentationtest.New().ToProviderSettings()
prometheus := prometheustest.New(context.Background(), providerSettings, prometheus.Config{}, telemetryStore)
prometheus := prometheustest.New(context.Background(), providerSettings, prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
reader := clickhouseReader.NewReader(
instrumentationtest.New().Logger(),
nil,

View File

@@ -940,7 +940,7 @@ func TestPromRuleUnitCombinations(t *testing.T) {
).
WillReturnRows(samplesRows)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
postableRule.RuleCondition.CompareOp = ruletypes.CompareOp(c.compareOp)
postableRule.RuleCondition.MatchType = ruletypes.MatchType(c.matchType)
@@ -1061,7 +1061,7 @@ func _Enable_this_after_9146_issue_fix_is_merged_TestPromRuleNoData(t *testing.T
WithArgs("test_metric", "__name__", "test_metric").
WillReturnRows(fingerprintRows)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
var target float64 = 0
postableRule.RuleCondition.Thresholds = &ruletypes.RuleThresholdData{
@@ -1281,7 +1281,7 @@ func TestMultipleThresholdPromRule(t *testing.T) {
).
WillReturnRows(samplesRows)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
postableRule.RuleCondition.CompareOp = ruletypes.CompareOp(c.compareOp)
postableRule.RuleCondition.MatchType = ruletypes.MatchType(c.matchType)
@@ -1441,7 +1441,7 @@ func TestPromRule_NoData(t *testing.T) {
promProvider := prometheustest.New(
context.Background(),
instrumentationtest.New().ToProviderSettings(),
prometheus.Config{},
prometheus.Config{Timeout: 2 * time.Minute},
telemetryStore,
)
defer func() {
@@ -1590,7 +1590,7 @@ func TestPromRule_NoData_AbsentFor(t *testing.T) {
promProvider := prometheustest.New(
context.Background(),
instrumentationtest.New().ToProviderSettings(),
prometheus.Config{},
prometheus.Config{Timeout: 2 * time.Minute},
telemetryStore,
)
defer func() {
@@ -1748,7 +1748,7 @@ func TestPromRuleEval_RequireMinPoints(t *testing.T) {
promProvider := prometheustest.New(
context.Background(),
instrumentationtest.New().ToProviderSettings(),
prometheus.Config{LookbackDelta: lookBackDelta},
prometheus.Config{Timeout: 2 * time.Minute, LookbackDelta: lookBackDelta},
telemetryStore,
)
defer func() {

View File

@@ -779,7 +779,7 @@ func TestThresholdRuleUnitCombinations(t *testing.T) {
},
)
require.NoError(t, err)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Duration(time.Second), nil, readerCache, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Duration(time.Second), nil, readerCache, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
"signoz_calls_total": {
@@ -894,7 +894,7 @@ func TestThresholdRuleNoData(t *testing.T) {
)
assert.NoError(t, err)
options := clickhouseReader.NewOptions("", "", "archiveNamespace")
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Duration(time.Second), nil, readerCache, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Duration(time.Second), nil, readerCache, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
@@ -1014,7 +1014,7 @@ func TestThresholdRuleTracesLink(t *testing.T) {
}
options := clickhouseReader.NewOptions("", "", "archiveNamespace")
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Duration(time.Second), nil, nil, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Duration(time.Second), nil, nil, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
@@ -1151,7 +1151,7 @@ func TestThresholdRuleLogsLink(t *testing.T) {
}
options := clickhouseReader.NewOptions("", "", "archiveNamespace")
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Duration(time.Second), nil, nil, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Duration(time.Second), nil, nil, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
@@ -1418,7 +1418,7 @@ func TestMultipleThresholdRule(t *testing.T) {
},
)
require.NoError(t, err)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Second, nil, readerCache, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Second, nil, readerCache, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
"signoz_calls_total": {
@@ -2220,7 +2220,7 @@ func TestThresholdEval_RequireMinPoints(t *testing.T) {
)
require.NoError(t, err)
prometheusProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore)
prometheusProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheusProvider, "", time.Second, nil, readerCache, options)
rule, err := NewThresholdRule("some-id", valuer.GenerateUUID(), &postableRule, reader, nil, logger)