Compare commits

...

3 Commits

Author SHA1 Message Date
Ashwin Bhatkal
3047c79a0c test: add tests for errorDetails with APIError and plain Error 2026-03-24 21:33:02 +05:30
Ashwin Bhatkal
aa40929d73 fix: guard getErrorDetails call against non-APIError instances in GridCard 2026-03-24 20:14:37 +05:30
Pandey
234716df53 fix(querier): return proper HTTP status for PromQL timeout errors (#10689)
Some checks are pending
build-staging / prepare (push) Waiting to run
build-staging / js-build (push) Blocked by required conditions
build-staging / go-build (push) Blocked by required conditions
build-staging / staging (push) Blocked by required conditions
Release Drafter / update_release_draft (push) Waiting to run
* fix(querier): return proper HTTP status for PromQL timeout errors

PromQL queries hitting the context deadline were incorrectly returning
400 Bad Request with "invalid_input" because enhancePromQLError
unconditionally wrapped all errors as TypeInvalidInput. Extract
tryEnhancePromQLExecError to properly classify timeout, cancellation,
and storage errors before falling through to parse error handling.

Also make the PromQL engine timeout configurable via prometheus.timeout
config (default 2m) instead of hardcoding it.

* chore: refactor files

* fix(prometheus): validate timeout config and fix test setups

Add validation in prometheus.Config to reject zero timeout. Update all
test files to explicitly set Timeout: 2 * time.Minute in prometheus.Config
literals to avoid immediate query timeouts.
2026-03-24 13:32:45 +00:00
16 changed files with 138 additions and 49 deletions

View File

@@ -144,6 +144,8 @@ telemetrystore:
##################### Prometheus #####################
prometheus:
# The maximum time a PromQL query is allowed to run before being aborted.
timeout: 2m
active_query_tracker:
# Whether to enable the active query tracker.
enabled: true

View File

@@ -257,7 +257,7 @@ func TestManager_TestNotification_SendUnmatched_PromRule(t *testing.T) {
WillReturnRows(samplesRows)
// Create Prometheus provider for this test
promProvider = prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, store)
promProvider = prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, store)
},
ManagerOptionsHook: func(opts *rules.ManagerOptions) {
// Set Prometheus provider for PromQL queries

View File

@@ -0,0 +1,65 @@
import APIError from 'types/api/error';
import { errorDetails } from '../utils';
function makeAPIError(
message: string,
code = 'SOME_CODE',
errors: { message: string }[] = [],
): APIError {
return new APIError({
httpStatusCode: 500,
error: { code, message, url: '', errors },
});
}
describe('errorDetails', () => {
describe('when passed an APIError', () => {
it('returns the error message', () => {
const error = makeAPIError('something went wrong');
expect(errorDetails(error)).toBe('something went wrong');
});
it('appends details when errors array is non-empty', () => {
const error = makeAPIError('query failed', 'QUERY_ERROR', [
{ message: 'field X is invalid' },
{ message: 'field Y is missing' },
]);
const result = errorDetails(error);
expect(result).toContain('query failed');
expect(result).toContain('field X is invalid');
expect(result).toContain('field Y is missing');
});
it('does not append details when errors array is empty', () => {
const error = makeAPIError('simple error', 'CODE', []);
const result = errorDetails(error);
expect(result).toBe('simple error');
expect(result).not.toContain('Details');
});
});
describe('when passed a plain Error (not an APIError)', () => {
it('does not throw', () => {
const error = new Error('timeout exceeded');
expect(() => errorDetails(error)).not.toThrow();
});
it('returns the plain error message', () => {
const error = new Error('timeout exceeded');
expect(errorDetails(error)).toBe('timeout exceeded');
});
it('returns fallback when plain Error has no message', () => {
const error = new Error('');
expect(errorDetails(error)).toBe('Unknown error occurred');
});
});
describe('fallback behaviour', () => {
it('returns "Unknown error occurred" when message is undefined', () => {
const error = makeAPIError('');
expect(errorDetails(error)).toBe('Unknown error occurred');
});
});
});

View File

@@ -249,13 +249,14 @@ export const handleGraphClick = async ({
}
};
export const errorDetails = (error: APIError): string => {
const { message, errors } = error.getErrorDetails()?.error || {};
export const errorDetails = (error: APIError | Error): string => {
const { message, errors } =
(error instanceof APIError ? error.getErrorDetails()?.error : null) || {};
const details =
errors?.length > 0
errors && errors.length > 0
? `\n\nDetails: ${errors.map((e) => e.message).join('\n')}`
: '';
const errorDetails = `${message} ${details}`;
return errorDetails || 'Unknown error occurred';
const errorDetails = `${message ?? error.message} ${details}`;
return errorDetails.trim() || 'Unknown error occurred';
};

View File

@@ -3,6 +3,7 @@ package prometheus
import (
"time"
"github.com/SigNoz/signoz/pkg/errors"
"github.com/SigNoz/signoz/pkg/factory"
)
@@ -20,6 +21,9 @@ type Config struct {
//
// If not set, the prometheus default is used (currently 5m).
LookbackDelta time.Duration `mapstructure:"lookback_delta"`
// Timeout is the maximum time a query is allowed to run before being aborted.
Timeout time.Duration `mapstructure:"timeout"`
}
func NewConfigFactory() factory.ConfigFactory {
@@ -33,10 +37,14 @@ func newConfig() factory.Config {
Path: "",
MaxConcurrent: 20,
},
Timeout: 2 * time.Minute,
}
}
func (c Config) Validate() error {
if c.Timeout <= 0 {
return errors.Newf(errors.TypeInvalidInput, errors.CodeInvalidInput, "prometheus::timeout must be greater than 0")
}
return nil
}

View File

@@ -1,3 +0,0 @@
package prometheus
const FingerprintAsPromLabelName = "fingerprint"

View File

@@ -2,7 +2,6 @@ package prometheus
import (
"log/slog"
"time"
"github.com/prometheus/prometheus/promql"
)
@@ -21,7 +20,7 @@ func NewEngine(logger *slog.Logger, cfg Config) *Engine {
Logger: logger,
Reg: nil,
MaxSamples: 5_0000_000,
Timeout: 2 * time.Minute,
Timeout: cfg.Timeout,
ActiveQueryTracker: activeQueryTracker,
LookbackDelta: cfg.LookbackDelta,
})

View File

@@ -6,6 +6,8 @@ import (
"github.com/prometheus/prometheus/promql"
)
const FingerprintAsPromLabelName string = "fingerprint"
func RemoveExtraLabels(res *promql.Result, labelsToRemove ...string) error {
if len(labelsToRemove) == 0 || res == nil {
return nil

View File

@@ -36,6 +36,28 @@ var unquotedDottedNamePattern = regexp.MustCompile(`(?:^|[{,(\s])([a-zA-Z_][a-zA
// This is a common mistake when migrating to UTF-8 syntax.
var quotedMetricOutsideBracesPattern = regexp.MustCompile(`"([^"]+)"\s*\{`)
// tryEnhancePromQLExecError attempts to convert a PromQL execution error into
// a properly typed error. Returns nil if the error is not a recognized execution error.
func tryEnhancePromQLExecError(execErr error) error {
var eqc promql.ErrQueryCanceled
var eqt promql.ErrQueryTimeout
var es promql.ErrStorage
switch {
case errors.As(execErr, &eqc):
return errors.Newf(errors.TypeCanceled, errors.CodeCanceled, "query canceled").WithAdditional(eqc.Error())
case errors.As(execErr, &eqt):
return errors.Newf(errors.TypeTimeout, errors.CodeTimeout, "query timed out").WithAdditional(eqt.Error())
case errors.Is(execErr, context.DeadlineExceeded):
return errors.Newf(errors.TypeTimeout, errors.CodeTimeout, "query timed out")
case errors.Is(execErr, context.Canceled):
return errors.Newf(errors.TypeCanceled, errors.CodeCanceled, "query canceled")
case errors.As(execErr, &es):
return errors.Newf(errors.TypeInternal, errors.CodeInternal, "query execution error: %v", execErr)
default:
return nil
}
}
// enhancePromQLError adds helpful context to PromQL parse errors,
// particularly for UTF-8 syntax migration issues where metric and label
// names containing dots need to be quoted.
@@ -213,27 +235,20 @@ func (q *promqlQuery) Execute(ctx context.Context) (*qbv5.Result, error) {
time.Unix(0, end),
q.query.Step.Duration,
)
if err != nil {
// NewRangeQuery can fail with execution errors (e.g. context deadline exceeded)
// during the query queue/scheduling stage, not just parse errors.
if err := tryEnhancePromQLExecError(err); err != nil {
return nil, err
}
return nil, enhancePromQLError(query, err)
}
res := qry.Exec(ctx)
if res.Err != nil {
var eqc promql.ErrQueryCanceled
var eqt promql.ErrQueryTimeout
var es promql.ErrStorage
switch {
case errors.As(res.Err, &eqc):
return nil, errors.Newf(errors.TypeCanceled, errors.CodeCanceled, "query canceled")
case errors.As(res.Err, &eqt):
return nil, errors.Newf(errors.TypeTimeout, errors.CodeTimeout, "query timeout")
case errors.As(res.Err, &es):
return nil, errors.Newf(errors.TypeInternal, errors.CodeInternal, "query execution error: %v", res.Err)
}
if errors.Is(res.Err, context.Canceled) {
return nil, errors.Newf(errors.TypeCanceled, errors.CodeCanceled, "query canceled")
if err := tryEnhancePromQLExecError(res.Err); err != nil {
return nil, err
}
return nil, errors.Newf(errors.TypeInternal, errors.CodeInternal, "query execution error: %v", res.Err)

View File

@@ -1407,7 +1407,7 @@ func Test_querier_Traces_runWindowBasedListQueryDesc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -1633,7 +1633,7 @@ func Test_querier_Traces_runWindowBasedListQueryAsc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -1934,7 +1934,7 @@ func Test_querier_Logs_runWindowBasedListQueryDesc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -2162,7 +2162,7 @@ func Test_querier_Logs_runWindowBasedListQueryAsc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,

View File

@@ -1459,7 +1459,7 @@ func Test_querier_Traces_runWindowBasedListQueryDesc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -1685,7 +1685,7 @@ func Test_querier_Traces_runWindowBasedListQueryAsc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -1985,7 +1985,7 @@ func Test_querier_Logs_runWindowBasedListQueryDesc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
@@ -2213,7 +2213,7 @@ func Test_querier_Logs_runWindowBasedListQueryAsc(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,

View File

@@ -698,7 +698,7 @@ func TestBaseRule_FilterNewSeries(t *testing.T) {
slog.Default(),
nil,
telemetryStore,
prometheustest.New(context.Background(), settings, prometheus.Config{}, telemetryStore),
prometheustest.New(context.Background(), settings, prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Second,
nil,

View File

@@ -253,7 +253,7 @@ func TestManager_TestNotification_SendUnmatched_PromRule(t *testing.T) {
WillReturnRows(samplesRows)
// Create Prometheus provider for this test
promProvider = prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, store)
promProvider = prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, store)
},
ManagerOptionsHook: func(opts *ManagerOptions) {
// Set Prometheus provider for PromQL queries

View File

@@ -99,7 +99,7 @@ func NewTestManager(t *testing.T, testOpts *TestManagerOptions) *Manager {
options := clickhouseReader.NewOptions("", "", "archiveNamespace")
providerSettings := instrumentationtest.New().ToProviderSettings()
prometheus := prometheustest.New(context.Background(), providerSettings, prometheus.Config{}, telemetryStore)
prometheus := prometheustest.New(context.Background(), providerSettings, prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
reader := clickhouseReader.NewReader(
instrumentationtest.New().Logger(),
nil,

View File

@@ -940,7 +940,7 @@ func TestPromRuleUnitCombinations(t *testing.T) {
).
WillReturnRows(samplesRows)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
postableRule.RuleCondition.CompareOp = ruletypes.CompareOp(c.compareOp)
postableRule.RuleCondition.MatchType = ruletypes.MatchType(c.matchType)
@@ -1061,7 +1061,7 @@ func _Enable_this_after_9146_issue_fix_is_merged_TestPromRuleNoData(t *testing.T
WithArgs("test_metric", "__name__", "test_metric").
WillReturnRows(fingerprintRows)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
var target float64 = 0
postableRule.RuleCondition.Thresholds = &ruletypes.RuleThresholdData{
@@ -1281,7 +1281,7 @@ func TestMultipleThresholdPromRule(t *testing.T) {
).
WillReturnRows(samplesRows)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore)
promProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
postableRule.RuleCondition.CompareOp = ruletypes.CompareOp(c.compareOp)
postableRule.RuleCondition.MatchType = ruletypes.MatchType(c.matchType)
@@ -1441,7 +1441,7 @@ func TestPromRule_NoData(t *testing.T) {
promProvider := prometheustest.New(
context.Background(),
instrumentationtest.New().ToProviderSettings(),
prometheus.Config{},
prometheus.Config{Timeout: 2 * time.Minute},
telemetryStore,
)
defer func() {
@@ -1590,7 +1590,7 @@ func TestPromRule_NoData_AbsentFor(t *testing.T) {
promProvider := prometheustest.New(
context.Background(),
instrumentationtest.New().ToProviderSettings(),
prometheus.Config{},
prometheus.Config{Timeout: 2 * time.Minute},
telemetryStore,
)
defer func() {
@@ -1748,7 +1748,7 @@ func TestPromRuleEval_RequireMinPoints(t *testing.T) {
promProvider := prometheustest.New(
context.Background(),
instrumentationtest.New().ToProviderSettings(),
prometheus.Config{LookbackDelta: lookBackDelta},
prometheus.Config{Timeout: 2 * time.Minute, LookbackDelta: lookBackDelta},
telemetryStore,
)
defer func() {

View File

@@ -779,7 +779,7 @@ func TestThresholdRuleUnitCombinations(t *testing.T) {
},
)
require.NoError(t, err)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Duration(time.Second), nil, readerCache, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Duration(time.Second), nil, readerCache, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
"signoz_calls_total": {
@@ -894,7 +894,7 @@ func TestThresholdRuleNoData(t *testing.T) {
)
assert.NoError(t, err)
options := clickhouseReader.NewOptions("", "", "archiveNamespace")
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Duration(time.Second), nil, readerCache, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Duration(time.Second), nil, readerCache, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
@@ -1014,7 +1014,7 @@ func TestThresholdRuleTracesLink(t *testing.T) {
}
options := clickhouseReader.NewOptions("", "", "archiveNamespace")
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Duration(time.Second), nil, nil, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Duration(time.Second), nil, nil, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
@@ -1151,7 +1151,7 @@ func TestThresholdRuleLogsLink(t *testing.T) {
}
options := clickhouseReader.NewOptions("", "", "archiveNamespace")
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Duration(time.Second), nil, nil, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Duration(time.Second), nil, nil, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
@@ -1418,7 +1418,7 @@ func TestMultipleThresholdRule(t *testing.T) {
},
)
require.NoError(t, err)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore), "", time.Second, nil, readerCache, options)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore), "", time.Second, nil, readerCache, options)
rule, err := NewThresholdRule("69", valuer.GenerateUUID(), &postableRule, reader, nil, logger)
rule.TemporalityMap = map[string]map[v3.Temporality]bool{
"signoz_calls_total": {
@@ -2220,7 +2220,7 @@ func TestThresholdEval_RequireMinPoints(t *testing.T) {
)
require.NoError(t, err)
prometheusProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{}, telemetryStore)
prometheusProvider := prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore)
reader := clickhouseReader.NewReader(slog.Default(), nil, telemetryStore, prometheusProvider, "", time.Second, nil, readerCache, options)
rule, err := NewThresholdRule("some-id", valuer.GenerateUUID(), &postableRule, reader, nil, logger)