Compare commits

..

2 Commits

Author SHA1 Message Date
srikanthccv
7c85c63555 chore: update integration test 2026-06-17 18:01:47 +05:30
srikanthccv
f74ca236fa chore: send warning instead of error for unseen metrics and missing (metric, key) 2026-06-17 16:25:55 +05:30
17 changed files with 756 additions and 122 deletions

View File

@@ -10,6 +10,7 @@ import (
"go.opentelemetry.io/contrib/instrumentation/github.com/gorilla/mux/otelmux"
"go.opentelemetry.io/otel/propagation"
"github.com/SigNoz/signoz/pkg/cache/memorycache"
"github.com/SigNoz/signoz/pkg/errors"
"github.com/gorilla/handlers"
@@ -19,6 +20,7 @@ import (
"github.com/SigNoz/signoz/ee/query-service/app/api"
"github.com/SigNoz/signoz/ee/query-service/usage"
"github.com/SigNoz/signoz/pkg/cache"
"github.com/SigNoz/signoz/pkg/http/middleware"
"github.com/SigNoz/signoz/pkg/signoz"
"github.com/SigNoz/signoz/pkg/web"
@@ -57,12 +59,25 @@ type Server struct {
// NewServer creates and initializes Server
func NewServer(config signoz.Config, signoz *signoz.SigNoz) (*Server, error) {
cacheForTraceDetail, err := memorycache.New(context.TODO(), signoz.Instrumentation.ToProviderSettings(), cache.Config{
Provider: "memory",
Memory: cache.Memory{
NumCounters: 10 * 10000,
MaxCost: 1 << 27, // 128 MB
},
})
if err != nil {
return nil, err
}
reader := clickhouseReader.NewReader(
signoz.Instrumentation.Logger(),
signoz.SQLStore,
signoz.TelemetryStore,
signoz.Prometheus,
signoz.TelemetryStore.Cluster(),
config.Querier.FluxInterval,
cacheForTraceDetail,
signoz.Cache,
nil,
)

View File

@@ -119,11 +119,7 @@ func (q *querier) QueryRange(ctx context.Context, orgID valuer.UUID, req *qbtype
queries := make(map[string]qbtypes.Query)
steps := make(map[string]qbtypes.Step)
// Resolve metric metadata once per request: patches each metric-aggregation
// query's spec in place, returns the queries whose every aggregation was
// missing (used for preseeded empty results), and any dormant-metric
// warning string. NotFound errors for never-seen metrics are propagated.
missingMetricQueries, dormantMetricsWarningMsg, err := q.resolveMetricMetadata(ctx, req.CompositeQuery.Queries, req.Start, req.End)
missingMetricQueries, metricWarnings, err := q.resolveMetricMetadata(ctx, req.CompositeQuery.Queries, req.Start, req.End)
if err != nil {
return nil, err
}
@@ -240,13 +236,15 @@ func (q *querier) QueryRange(ctx context.Context, orgID valuer.UUID, req *qbtype
}
}
}
if dormantMetricsWarningMsg != "" {
if len(metricWarnings) > 0 {
if qbResp.Warning == nil {
qbResp.Warning = &qbtypes.QueryWarnData{}
}
qbResp.Warning.Warnings = append(qbResp.Warning.Warnings, qbtypes.QueryWarnDataAdditional{
Message: dormantMetricsWarningMsg,
})
for _, w := range metricWarnings {
qbResp.Warning.Warnings = append(qbResp.Warning.Warnings, qbtypes.QueryWarnDataAdditional{
Message: w,
})
}
}
}
return qbResp, qbErr
@@ -302,12 +300,11 @@ func (q *querier) populateQBEvent(event *qbtypes.QBEvent, queries []qbtypes.Quer
// - missingMetricQueries: names of queries whose every aggregation was
// missing. Used downstream to preseed empty result placeholders so the
// response still has an entry per requested query name.
// - dormantWarning: a human-readable warning describing metrics that exist in
// the store but produced no data within the query window. Empty when no
// such metrics are present.
// - err: NotFound when one or more referenced metrics have never been seen,
// or Internal when a metadata fetch fails.
func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.QueryEnvelope, start, end uint64) (missingMetricQueries []string, dormantWarning string, err error) {
// - metricWarnings: human-readable warnings for metrics that could not be
// resolved: never-seen metrics and dormant metrics (seen but no data in
// the query window).
// - err: Internal when a metadata fetch fails.
func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.QueryEnvelope, start, end uint64) (missingMetricQueries []string, metricWarnings []string, err error) {
metricNames := make([]string, 0)
for idx := range queries {
if queries[idx].Type != qbtypes.QueryTypeBuilder {
@@ -325,13 +322,13 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
}
if len(metricNames) == 0 {
return nil, "", nil
return nil, nil, nil
}
metricTemporality, metricTypes, err := q.metadataStore.FetchTemporalityAndTypeMulti(ctx, start, end, metricNames...)
if err != nil {
q.logger.WarnContext(ctx, "failed to fetch metric temporality", errors.Attr(err), slog.Any("metrics", metricNames))
return nil, "", errors.NewInternalf(errors.CodeInternal, "failed to fetch metrics temporality")
return nil, nil, errors.NewInternalf(errors.CodeInternal, "failed to fetch metrics temporality")
}
q.logger.DebugContext(ctx, "fetched metric temporalities and types", slog.Any("metric_temporality", metricTemporality), slog.Any("metric_types", metricTypes))
@@ -363,7 +360,7 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
}
// Type is resolved now; validate aggregation compatibility against it.
if err := spec.Aggregations[i].ValidateForType(); err != nil {
return nil, "", err
return nil, nil, err
}
presentAggregations = append(presentAggregations, spec.Aggregations[i])
}
@@ -376,7 +373,7 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
}
if len(missingMetrics) == 0 {
return missingMetricQueries, "", nil
return missingMetricQueries, nil, nil
}
isInternalMetric := func(n string) bool { return strings.HasPrefix(n, "signoz.") || strings.HasPrefix(n, "signoz_") }
@@ -387,29 +384,33 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
}
}
if len(externalMissingMetrics) == 0 {
// this means all missing metrics are internal, and since internal metrics
// aren't user-controlled, skip errors/warnings for them since users can't act on them
return missingMetricQueries, "", nil
return missingMetricQueries, nil, nil
}
// Classify each missing metric: never-seen → NotFound error; seen-but-no-
// data-in-window dormant warning.
// Classify each missing metric: never-seen -> warning with empty result;
// seen-but-no-data-in-window -> dormant warning.
lastSeenInfo, _ := q.metadataStore.FetchLastSeenInfoMulti(ctx, externalMissingMetrics...)
nonExistentMetrics := []string{}
var nonExistentMetrics []string
var dormantMetrics []string
for _, name := range externalMissingMetrics {
if ts, ok := lastSeenInfo[name]; ok && ts > 0 {
dormantMetrics = append(dormantMetrics, name)
continue
}
nonExistentMetrics = append(nonExistentMetrics, name)
}
var warnings []string
// Never-seen metrics: the query already gets a preseeded empty result
// via the aggregation-dropping path above; we just attach a warning.
if len(nonExistentMetrics) == 1 {
return nil, "", errors.NewNotFoundf(errors.CodeNotFound, "could not find the metric %s", nonExistentMetrics[0])
}
if len(nonExistentMetrics) > 1 {
return nil, "", errors.NewNotFoundf(errors.CodeNotFound, "the following metrics were not found: %s", strings.Join(nonExistentMetrics, ", "))
warnings = append(warnings, fmt.Sprintf("metric %s has never been received. Check the metric name and instrumentation", nonExistentMetrics[0]))
} else if len(nonExistentMetrics) > 1 {
warnings = append(warnings, fmt.Sprintf("the following metrics have never been received. Check the metric names and instrumentation: %s", strings.Join(nonExistentMetrics, ", ")))
}
// All missing metrics are dormant — assemble the warning string.
// Dormant metrics: seen before but no data in the query window.
lastSeenStr := func(name string) string {
if ts, ok := lastSeenInfo[name]; ok && ts > 0 {
ago := humanize.RelTime(time.UnixMilli(ts), time.Now(), "ago", "from now")
@@ -417,16 +418,16 @@ func (q *querier) resolveMetricMetadata(ctx context.Context, queries []qbtypes.Q
}
return name
}
if len(externalMissingMetrics) == 1 {
dormantWarning = fmt.Sprintf("no data found for the metric %s in the query time range", lastSeenStr(missingMetrics[0]))
} else {
parts := make([]string, len(externalMissingMetrics))
for i, m := range externalMissingMetrics {
if len(dormantMetrics) == 1 {
warnings = append(warnings, fmt.Sprintf("no data found for the metric %s in the query time range", lastSeenStr(dormantMetrics[0])))
} else if len(dormantMetrics) > 1 {
parts := make([]string, len(dormantMetrics))
for i, m := range dormantMetrics {
parts[i] = lastSeenStr(m)
}
dormantWarning = fmt.Sprintf("no data found for the following metrics in the query time range: %s", strings.Join(parts, ", "))
warnings = append(warnings, fmt.Sprintf("no data found for the following metrics in the query time range: %s", strings.Join(parts, ", ")))
}
return missingMetricQueries, dormantWarning, nil
return missingMetricQueries, warnings, nil
}
func (q *querier) QueryRawStream(ctx context.Context, orgID valuer.UUID, req *qbtypes.QueryRangeRequest, client *qbtypes.RawStream) {

View File

@@ -37,7 +37,7 @@ func (m *mockMetricStmtBuilder) Build(_ context.Context, _, _ uint64, _ qbtypes.
func TestQueryRange_MetricTypeMissing(t *testing.T) {
// When a metric has UnspecifiedType and is not found in the metadata store,
// the querier should return a not-found error, even if the request provides a temporality
// the querier should return an empty result with a warning instead of an error.
providerSettings := instrumentationtest.New().ToProviderSettings()
metadataStore := telemetrytypestest.NewMockMetadataStore()
@@ -80,9 +80,19 @@ func TestQueryRange_MetricTypeMissing(t *testing.T) {
},
}
_, err := q.QueryRange(context.Background(), valuer.GenerateUUID(), req)
require.Error(t, err)
assert.Contains(t, err.Error(), "could not find the metric unknown_metric")
resp, err := q.QueryRange(context.Background(), valuer.GenerateUUID(), req)
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.Warning)
found := false
for _, w := range resp.Warning.Warnings {
if assert.ObjectsAreEqual("metric unknown_metric has never been received. Check the metric name and instrumentation", w.Message) {
found = true
break
}
}
assert.True(t, found, "expected never-seen metric warning, got: %v", resp.Warning.Warnings)
}
func TestQueryRange_MetricTypeFromStore(t *testing.T) {

View File

@@ -18,6 +18,7 @@ import (
"github.com/uptrace/bun"
"github.com/SigNoz/signoz/pkg/prometheus"
"github.com/SigNoz/signoz/pkg/query-service/utils/timestamp"
"github.com/SigNoz/signoz/pkg/sqlstore"
"github.com/SigNoz/signoz/pkg/telemetrystore"
"github.com/SigNoz/signoz/pkg/types"
@@ -46,6 +47,7 @@ import (
"github.com/SigNoz/signoz/pkg/query-service/app/resource"
"github.com/SigNoz/signoz/pkg/query-service/app/services"
"github.com/SigNoz/signoz/pkg/query-service/app/traces/smart"
"github.com/SigNoz/signoz/pkg/query-service/app/traces/tracedetail"
"github.com/SigNoz/signoz/pkg/query-service/common"
"github.com/SigNoz/signoz/pkg/query-service/constants"
@@ -159,9 +161,11 @@ type ClickHouseReader struct {
traceResourceTableV3 string
traceSummaryTable string
cache cache.Cache
metadataDB string
metadataTable string
fluxIntervalForTraceDetail time.Duration
cache cache.Cache
cacheForTraceDetail cache.Cache
metadataDB string
metadataTable string
}
// NewTraceReader returns a TraceReader for the database
@@ -171,6 +175,8 @@ func NewReader(
telemetryStore telemetrystore.TelemetryStore,
prometheus prometheus.Prometheus,
cluster string,
fluxIntervalForTraceDetail time.Duration,
cacheForTraceDetail cache.Cache,
cache cache.Cache,
options *Options,
) *ClickHouseReader {
@@ -184,43 +190,45 @@ func NewReader(
traceLocalTableName := options.primary.TraceLocalTableNameV3
return &ClickHouseReader{
db: telemetryStore.ClickhouseDB(),
logger: logger,
prometheus: prometheus,
sqlDB: sqlDB,
TraceDB: options.primary.TraceDB,
operationsTable: options.primary.OperationsTable,
indexTable: options.primary.IndexTable,
errorTable: options.primary.ErrorTable,
usageExplorerTable: options.primary.UsageExplorerTable,
durationTable: options.primary.DurationTable,
SpansTable: options.primary.SpansTable,
spanAttributeTableV2: options.primary.SpanAttributeTableV2,
spanAttributesKeysTable: options.primary.SpanAttributeKeysTable,
dependencyGraphTable: options.primary.DependencyGraphTable,
topLevelOperationsTable: options.primary.TopLevelOperationsTable,
logsDB: options.primary.LogsDB,
logsTable: options.primary.LogsTable,
logsLocalTable: options.primary.LogsLocalTable,
logsAttributeKeys: options.primary.LogsAttributeKeysTable,
logsResourceKeys: options.primary.LogsResourceKeysTable,
logsTagAttributeTableV2: options.primary.LogsTagAttributeTableV2,
liveTailRefreshSeconds: options.primary.LiveTailRefreshSeconds,
cluster: cluster,
queryProgressTracker: queryprogress.NewQueryProgressTracker(logger),
logsTableV2: options.primary.LogsTableV2,
logsLocalTableV2: options.primary.LogsLocalTableV2,
logsResourceTableV2: options.primary.LogsResourceTableV2,
logsResourceLocalTableV2: options.primary.LogsResourceLocalTableV2,
logsTableName: logsTableName,
logsLocalTableName: logsLocalTableName,
traceLocalTableName: traceLocalTableName,
traceTableName: traceTableName,
traceResourceTableV3: options.primary.TraceResourceTableV3,
traceSummaryTable: options.primary.TraceSummaryTable,
cache: cache,
metadataDB: options.primary.MetadataDB,
metadataTable: options.primary.MetadataTable,
db: telemetryStore.ClickhouseDB(),
logger: logger,
prometheus: prometheus,
sqlDB: sqlDB,
TraceDB: options.primary.TraceDB,
operationsTable: options.primary.OperationsTable,
indexTable: options.primary.IndexTable,
errorTable: options.primary.ErrorTable,
usageExplorerTable: options.primary.UsageExplorerTable,
durationTable: options.primary.DurationTable,
SpansTable: options.primary.SpansTable,
spanAttributeTableV2: options.primary.SpanAttributeTableV2,
spanAttributesKeysTable: options.primary.SpanAttributeKeysTable,
dependencyGraphTable: options.primary.DependencyGraphTable,
topLevelOperationsTable: options.primary.TopLevelOperationsTable,
logsDB: options.primary.LogsDB,
logsTable: options.primary.LogsTable,
logsLocalTable: options.primary.LogsLocalTable,
logsAttributeKeys: options.primary.LogsAttributeKeysTable,
logsResourceKeys: options.primary.LogsResourceKeysTable,
logsTagAttributeTableV2: options.primary.LogsTagAttributeTableV2,
liveTailRefreshSeconds: options.primary.LiveTailRefreshSeconds,
cluster: cluster,
queryProgressTracker: queryprogress.NewQueryProgressTracker(logger),
logsTableV2: options.primary.LogsTableV2,
logsLocalTableV2: options.primary.LogsLocalTableV2,
logsResourceTableV2: options.primary.LogsResourceTableV2,
logsResourceLocalTableV2: options.primary.LogsResourceLocalTableV2,
logsTableName: logsTableName,
logsLocalTableName: logsLocalTableName,
traceLocalTableName: traceLocalTableName,
traceTableName: traceTableName,
traceResourceTableV3: options.primary.TraceResourceTableV3,
traceSummaryTable: options.primary.TraceSummaryTable,
fluxIntervalForTraceDetail: fluxIntervalForTraceDetail,
cache: cache,
cacheForTraceDetail: cacheForTraceDetail,
metadataDB: options.primary.MetadataDB,
metadataTable: options.primary.MetadataTable,
}
}
@@ -858,6 +866,206 @@ func (r *ClickHouseReader) GetUsage(ctx context.Context, queryParams *model.GetU
return &usageItems, nil
}
func (r *ClickHouseReader) GetSpansForTrace(ctx context.Context, traceID string, traceDetailsQuery string) ([]model.SpanItemV2, *model.ApiError) {
ctx = ctxtypes.NewContextWithCommentVals(ctx, map[string]string{
instrumentationtypes.TelemetrySignal: telemetrytypes.SignalTraces.StringValue(),
instrumentationtypes.CodeNamespace: "clickhouse-reader",
instrumentationtypes.CodeFunctionName: "GetSpansForTrace",
})
var traceSummary model.TraceSummary
summaryQuery := fmt.Sprintf("SELECT trace_id, min(start) AS start, max(end) AS end, sum(num_spans) AS num_spans FROM %s.%s WHERE trace_id=$1 GROUP BY trace_id", r.TraceDB, r.traceSummaryTable)
err := r.db.QueryRow(ctx, summaryQuery, traceID).Scan(&traceSummary.TraceID, &traceSummary.Start, &traceSummary.End, &traceSummary.NumSpans)
if err != nil {
if err == sql.ErrNoRows {
return []model.SpanItemV2{}, nil
}
r.logger.Error("Error in processing trace summary sql query", errorsV2.Attr(err))
return nil, model.ExecutionError(fmt.Errorf("error in processing trace summary sql query: %w", err))
}
var searchScanResponses []model.SpanItemV2
queryStartTime := time.Now()
err = r.db.Select(ctx, &searchScanResponses, traceDetailsQuery, traceID, strconv.FormatInt(traceSummary.Start.Unix()-1800, 10), strconv.FormatInt(traceSummary.End.Unix(), 10))
r.logger.Info(traceDetailsQuery)
if err != nil {
r.logger.Error("Error in processing sql query", errorsV2.Attr(err))
return nil, model.ExecutionError(fmt.Errorf("error in processing trace data sql query: %w", err))
}
r.logger.Info("trace details query took: ", "duration", time.Since(queryStartTime), "traceID", traceID)
return searchScanResponses, nil
}
func (r *ClickHouseReader) GetFlamegraphSpansForTraceCache(ctx context.Context, orgID valuer.UUID, traceID string) (*model.GetFlamegraphSpansForTraceCache, error) {
cachedTraceData := new(model.GetFlamegraphSpansForTraceCache)
err := r.cacheForTraceDetail.Get(ctx, orgID, strings.Join([]string{"getFlamegraphSpansForTrace", traceID}, "-"), cachedTraceData)
if err != nil {
r.logger.Debug("error in retrieving getFlamegraphSpansForTrace cache", errorsV2.Attr(err), "traceID", traceID)
return nil, err
}
if time.Since(time.UnixMilli(int64(cachedTraceData.EndTime))) < r.fluxIntervalForTraceDetail {
r.logger.Info("the trace end time falls under the flux interval, skipping getFlamegraphSpansForTrace cache", "traceID", traceID)
return nil, errors.Errorf("the trace end time falls under the flux interval, skipping getFlamegraphSpansForTrace cache, traceID: %s", traceID)
}
r.logger.Info("cache is successfully hit, applying cache for getFlamegraphSpansForTrace", "traceID", traceID)
return cachedTraceData, nil
}
func (r *ClickHouseReader) GetFlamegraphSpansForTrace(ctx context.Context, orgID valuer.UUID, traceID string, req *model.GetFlamegraphSpansForTraceParams) (*model.GetFlamegraphSpansForTraceResponse, error) {
trace := new(model.GetFlamegraphSpansForTraceResponse)
var startTime, endTime, durationNano uint64
var spanIdToSpanNodeMap = map[string]*model.FlamegraphSpan{}
// map[traceID][level]span
var selectedSpans = [][]*model.FlamegraphSpan{}
var traceRoots []*model.FlamegraphSpan
// get the trace tree from cache!
cachedTraceData, err := r.GetFlamegraphSpansForTraceCache(ctx, orgID, traceID)
if err == nil {
startTime = cachedTraceData.StartTime
endTime = cachedTraceData.EndTime
durationNano = cachedTraceData.DurationNano
selectedSpans = cachedTraceData.SelectedSpans
traceRoots = cachedTraceData.TraceRoots
}
if err != nil {
r.logger.Info("cache miss for getFlamegraphSpansForTrace", "traceID", traceID)
selectCols := "timestamp, duration_nano, span_id, trace_id, has_error, links as references, resource_string_service$$name, name, events"
if len(req.SelectFields) > 0 {
selectCols += ", attributes_string, attributes_number, attributes_bool, resources_string"
}
flamegraphQuery := fmt.Sprintf("SELECT %s FROM %s.%s WHERE trace_id=$1 and ts_bucket_start>=$2 and ts_bucket_start<=$3 ORDER BY timestamp ASC, name ASC", selectCols, r.TraceDB, r.traceTableName)
searchScanResponses, err := r.GetSpansForTrace(ctx, traceID, flamegraphQuery)
if err != nil {
return nil, err
}
if len(searchScanResponses) == 0 {
return trace, nil
}
for _, item := range searchScanResponses {
ref := []model.OtelSpanRef{}
err := json.Unmarshal([]byte(item.References), &ref)
if err != nil {
r.logger.Error("Error unmarshalling references", errorsV2.Attr(err))
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getFlamegraphSpansForTrace: error in unmarshalling references %s", err.Error())
}
events := make([]model.Event, 0)
for _, event := range item.Events {
var eventMap model.Event
err = json.Unmarshal([]byte(event), &eventMap)
if err != nil {
r.logger.Error("Error unmarshalling events", errorsV2.Attr(err))
return nil, errorsV2.Newf(errorsV2.TypeInternal, errorsV2.CodeInternal, "getFlamegraphSpansForTrace: error in unmarshalling events %s", err.Error())
}
events = append(events, eventMap)
}
jsonItem := model.FlamegraphSpan{
SpanID: item.SpanID,
TraceID: item.TraceID,
ServiceName: item.ServiceName,
Name: item.Name,
DurationNano: item.DurationNano,
HasError: item.HasError,
References: ref,
Events: events,
Children: make([]*model.FlamegraphSpan, 0),
}
if len(req.SelectFields) > 0 {
jsonItem.SetRequestedFields(item, req.SelectFields)
}
// metadata calculation
startTimeUnixNano := uint64(item.TimeUnixNano.UnixNano())
if startTime == 0 || startTimeUnixNano < startTime {
startTime = startTimeUnixNano
}
if endTime == 0 || (startTimeUnixNano+jsonItem.DurationNano) > endTime {
endTime = (startTimeUnixNano + jsonItem.DurationNano)
}
if durationNano == 0 || jsonItem.DurationNano > durationNano {
durationNano = jsonItem.DurationNano
}
jsonItem.TimeUnixNano = uint64(item.TimeUnixNano.UnixNano() / 1000000)
spanIdToSpanNodeMap[jsonItem.SpanID] = &jsonItem
}
// traverse through the map and append each node to the children array of the parent node
// and add missing spans
for _, spanNode := range spanIdToSpanNodeMap {
hasParentSpanNode := false
for _, reference := range spanNode.References {
if reference.RefType == "CHILD_OF" && reference.SpanId != "" {
hasParentSpanNode = true
if parentNode, exists := spanIdToSpanNodeMap[reference.SpanId]; exists {
parentNode.Children = append(parentNode.Children, spanNode)
} else {
// insert the missing spans
missingSpan := model.FlamegraphSpan{
SpanID: reference.SpanId,
TraceID: spanNode.TraceID,
ServiceName: "",
Name: "Missing Span",
TimeUnixNano: spanNode.TimeUnixNano,
DurationNano: spanNode.DurationNano,
HasError: false,
Events: make([]model.Event, 0),
Children: make([]*model.FlamegraphSpan, 0),
}
missingSpan.Children = append(missingSpan.Children, spanNode)
spanIdToSpanNodeMap[missingSpan.SpanID] = &missingSpan
traceRoots = append(traceRoots, &missingSpan)
}
}
}
if !hasParentSpanNode && !tracedetail.ContainsFlamegraphSpan(traceRoots, spanNode) {
traceRoots = append(traceRoots, spanNode)
}
}
selectedSpans = tracedetail.GetAllSpansForFlamegraph(traceRoots, spanIdToSpanNodeMap)
// TODO: set the trace data (model.GetFlamegraphSpansForTraceCache) in cache here
// removed existing cache usage since it was not getting used due to this bug https://github.com/SigNoz/engineering-pod/issues/4648
// and was causing out of memory issues https://github.com/SigNoz/engineering-pod/issues/4638
}
processingPostCache := time.Now()
selectedSpansForRequest := selectedSpans
clientLimit := min(req.Limit, tracedetail.MaxLimitWithoutSampling)
totalSpanCount := tracedetail.GetTotalSpanCount(selectedSpans)
if totalSpanCount > uint64(clientLimit) {
// using trace start and end time if boundary ts are set to zero (or not set)
boundaryStart := max(timestamp.MilliToNano(req.BoundaryStartTS), startTime)
boundaryEnd := timestamp.MilliToNano(req.BoundaryEndTS)
if boundaryEnd == 0 {
boundaryEnd = endTime
}
selectedSpansForRequest = tracedetail.GetSelectedSpansForFlamegraphForRequest(req.SelectedSpanID, selectedSpans, boundaryStart, boundaryEnd)
}
r.logger.Debug("getFlamegraphSpansForTrace: processing post cache", "duration", time.Since(processingPostCache), "traceID", traceID, "totalSpans", totalSpanCount, "limit", clientLimit)
trace.Spans = selectedSpansForRequest
trace.StartTimestampMillis = startTime / 1000000
trace.EndTimestampMillis = endTime / 1000000
trace.HasMore = totalSpanCount > uint64(clientLimit)
return trace, nil
}
func (r *ClickHouseReader) GetDependencyGraph(ctx context.Context, queryParams *model.GetServicesParams) (*[]model.ServiceMapDependencyResponseItem, error) {
ctx = ctxtypes.NewContextWithCommentVals(ctx, map[string]string{

View File

@@ -534,6 +534,8 @@ func (aH *APIHandler) RegisterRoutes(router *mux.Router, am *middleware.AuthZ) {
router.HandleFunc("/api/v2/traces/fields", am.ViewAccess(aH.traceFields)).Methods(http.MethodGet)
router.HandleFunc("/api/v2/traces/fields", am.EditAccess(aH.updateTraceField)).Methods(http.MethodPost)
router.HandleFunc("/api/v2/traces/flamegraph/{traceId}", am.ViewAccess(aH.GetFlamegraphSpansForTrace)).Methods(http.MethodPost)
router.HandleFunc("/api/v1/version", am.OpenAccess(aH.getVersion)).Methods(http.MethodGet)
router.HandleFunc("/api/v1/features", am.ViewAccess(aH.getFeatureFlags)).Methods(http.MethodGet)
@@ -1444,6 +1446,40 @@ func (aH *APIHandler) SearchTraces(w http.ResponseWriter, r *http.Request) {
}
func (aH *APIHandler) GetFlamegraphSpansForTrace(w http.ResponseWriter, r *http.Request) {
claims, err := authtypes.ClaimsFromContext(r.Context())
if err != nil {
render.Error(w, err)
return
}
orgID, err := valuer.NewUUID(claims.OrgID)
if err != nil {
render.Error(w, err)
return
}
traceID := mux.Vars(r)["traceId"]
if traceID == "" {
render.Error(w, errors.NewInvalidInputf(errors.CodeInvalidInput, "traceID is required"))
return
}
req := new(model.GetFlamegraphSpansForTraceParams)
err = json.NewDecoder(r.Body).Decode(&req)
if err != nil {
RespondError(w, model.BadRequest(err), nil)
return
}
result, apiErr := aH.reader.GetFlamegraphSpansForTrace(r.Context(), orgID, traceID, req)
if apiErr != nil {
render.Error(w, apiErr)
return
}
aH.WriteJSON(w, r, result)
}
func (aH *APIHandler) listErrors(w http.ResponseWriter, r *http.Request) {
query, err := parseListErrorsRequest(r)

View File

@@ -10,7 +10,6 @@ import (
"time"
"github.com/DATA-DOG/go-sqlmock"
cmock "github.com/SigNoz/clickhouse-go-mock"
"github.com/SigNoz/signoz/pkg/cache"
"github.com/SigNoz/signoz/pkg/cache/cachetest"
"github.com/SigNoz/signoz/pkg/instrumentation/instrumentationtest"
@@ -27,6 +26,7 @@ import (
"github.com/SigNoz/signoz/pkg/telemetrystore"
"github.com/SigNoz/signoz/pkg/telemetrystore/telemetrystoretest"
"github.com/SigNoz/signoz/pkg/valuer"
cmock "github.com/SigNoz/clickhouse-go-mock"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -1409,6 +1409,8 @@ func Test_querier_Traces_runWindowBasedListQueryDesc(t *testing.T) {
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
nil,
options,
)
@@ -1633,6 +1635,8 @@ func Test_querier_Traces_runWindowBasedListQueryAsc(t *testing.T) {
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
nil,
options,
)
@@ -1932,6 +1936,8 @@ func Test_querier_Logs_runWindowBasedListQueryDesc(t *testing.T) {
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
nil,
options,
)
@@ -2158,6 +2164,8 @@ func Test_querier_Logs_runWindowBasedListQueryAsc(t *testing.T) {
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
nil,
options,
)

View File

@@ -1461,6 +1461,8 @@ func Test_querier_Traces_runWindowBasedListQueryDesc(t *testing.T) {
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
nil,
options,
)
@@ -1685,6 +1687,8 @@ func Test_querier_Traces_runWindowBasedListQueryAsc(t *testing.T) {
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
nil,
options,
)
@@ -1983,6 +1987,8 @@ func Test_querier_Logs_runWindowBasedListQueryDesc(t *testing.T) {
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
nil,
options,
)
@@ -2209,6 +2215,8 @@ func Test_querier_Logs_runWindowBasedListQueryAsc(t *testing.T) {
telemetryStore,
prometheustest.New(context.Background(), instrumentationtest.New().ToProviderSettings(), prometheus.Config{Timeout: 2 * time.Minute}, telemetryStore),
"",
time.Duration(time.Second),
nil,
nil,
options,
)

View File

@@ -7,6 +7,7 @@ import (
"net/http"
"slices"
"github.com/SigNoz/signoz/pkg/cache/memorycache"
"github.com/SigNoz/signoz/pkg/errors"
"github.com/SigNoz/signoz/pkg/queryparser"
@@ -15,6 +16,7 @@ import (
"github.com/rs/cors"
"github.com/soheilhy/cmux"
"github.com/SigNoz/signoz/pkg/cache"
"github.com/SigNoz/signoz/pkg/http/middleware"
"github.com/SigNoz/signoz/pkg/licensing/nooplicensing"
"github.com/SigNoz/signoz/pkg/query-service/agentConf"
@@ -58,12 +60,25 @@ func NewServer(config signoz.Config, signoz *signoz.SigNoz) (*Server, error) {
return nil, err
}
cacheForTraceDetail, err := memorycache.New(context.TODO(), signoz.Instrumentation.ToProviderSettings(), cache.Config{
Provider: "memory",
Memory: cache.Memory{
NumCounters: 10 * 10000,
MaxCost: 1 << 27, // 128 MB
},
})
if err != nil {
return nil, err
}
reader := clickhouseReader.NewReader(
signoz.Instrumentation.Logger(),
signoz.SQLStore,
signoz.TelemetryStore,
signoz.Prometheus,
signoz.TelemetryStore.Cluster(),
config.Querier.FluxInterval,
cacheForTraceDetail,
signoz.Cache,
nil,
)

View File

@@ -0,0 +1,199 @@
package tracedetail
import (
"sort"
"github.com/SigNoz/signoz/pkg/query-service/model"
)
var (
flamegraphSpanLevelLimit float64 = 50
flamegraphSpanLimitPerLevel int = 100
flamegraphSamplingBucketCount int = 50
flamegraphTopLatencySpanCount int = 5
MaxLimitWithoutSampling uint = 120_000
)
func ContainsFlamegraphSpan(slice []*model.FlamegraphSpan, item *model.FlamegraphSpan) bool {
for _, v := range slice {
if v.SpanID == item.SpanID {
return true
}
}
return false
}
func BfsTraversalForTrace(span *model.FlamegraphSpan, level int64) map[int64][]*model.FlamegraphSpan {
bfs := map[int64][]*model.FlamegraphSpan{}
bfs[level] = []*model.FlamegraphSpan{span}
for _, child := range span.Children {
childBfsMap := BfsTraversalForTrace(child, level+1)
for _level, nodes := range childBfsMap {
bfs[_level] = append(bfs[_level], nodes...)
}
}
span.Level = level
span.Children = make([]*model.FlamegraphSpan, 0)
return bfs
}
func FindIndexForSelectedSpan(spans [][]*model.FlamegraphSpan, selectedSpanId string) int {
var selectedSpanLevel int = 0
for index, _spans := range spans {
for _, span := range _spans {
if span.SpanID == selectedSpanId {
selectedSpanLevel = index
break
}
}
}
return selectedSpanLevel
}
// GetAllSpansForFlamegraph groups all spans as per their level
func GetAllSpansForFlamegraph(traceRoots []*model.FlamegraphSpan, spanIdToSpanNodeMap map[string]*model.FlamegraphSpan) [][]*model.FlamegraphSpan {
var traceIdLevelledFlamegraph = map[string]map[int64][]*model.FlamegraphSpan{}
selectedSpans := [][]*model.FlamegraphSpan{}
// sort the trace roots to add missing spans at the right order
sort.Slice(traceRoots, func(i, j int) bool {
if traceRoots[i].TimeUnixNano == traceRoots[j].TimeUnixNano {
return traceRoots[i].Name < traceRoots[j].Name
}
return traceRoots[i].TimeUnixNano < traceRoots[j].TimeUnixNano
})
for _, rootSpanID := range traceRoots {
if rootNode, exists := spanIdToSpanNodeMap[rootSpanID.SpanID]; exists {
bfsMapForTrace := BfsTraversalForTrace(rootNode, 0)
traceIdLevelledFlamegraph[rootSpanID.SpanID] = bfsMapForTrace
}
}
for _, trace := range traceRoots {
keys := make([]int64, 0, len(traceIdLevelledFlamegraph[trace.SpanID]))
for key := range traceIdLevelledFlamegraph[trace.SpanID] {
keys = append(keys, key)
}
sort.Slice(keys, func(i, j int) bool {
return keys[i] < keys[j]
})
for _, level := range keys {
if ok, exists := traceIdLevelledFlamegraph[trace.SpanID][level]; exists {
selectedSpans = append(selectedSpans, ok)
}
}
}
return selectedSpans
}
func getLatencyAndTimestampBucketedSpans(spans []*model.FlamegraphSpan, selectedSpanID string, isSelectedSpanIDPresent bool, startTime uint64, endTime uint64) []*model.FlamegraphSpan {
var sampledSpans []*model.FlamegraphSpan
// sort the spans by latency for latency filtering
sort.Slice(spans, func(i, j int) bool {
return spans[i].DurationNano > spans[j].DurationNano
})
// pick the top 5 latency spans
for idx := range flamegraphTopLatencySpanCount {
sampledSpans = append(sampledSpans, spans[idx])
}
// always add the selectedSpan
if isSelectedSpanIDPresent {
idx := -1
for _idx, span := range spans {
if span.SpanID == selectedSpanID {
idx = _idx
}
}
if idx != -1 {
sampledSpans = append(sampledSpans, spans[idx])
}
}
bucketSize := (endTime - startTime) / uint64(flamegraphSamplingBucketCount)
if bucketSize == 0 {
bucketSize = 1
}
bucketedSpans := make([][]*model.FlamegraphSpan, flamegraphSamplingBucketCount)
for _, span := range spans {
if span.TimeUnixNano >= startTime && span.TimeUnixNano <= endTime {
bucketIndex := int((span.TimeUnixNano - startTime) / bucketSize)
if bucketIndex >= 0 && bucketIndex < flamegraphSamplingBucketCount {
bucketedSpans[bucketIndex] = append(bucketedSpans[bucketIndex], span)
}
}
}
for i := range bucketedSpans {
if len(bucketedSpans[i]) > 2 {
// Keep only the first 2 spans
bucketedSpans[i] = bucketedSpans[i][:2]
}
}
// Flatten the bucketed spans into a single slice
for _, bucket := range bucketedSpans {
sampledSpans = append(sampledSpans, bucket...)
}
return sampledSpans
}
func GetSelectedSpansForFlamegraphForRequest(selectedSpanID string, selectedSpans [][]*model.FlamegraphSpan, startTime uint64, endTime uint64) [][]*model.FlamegraphSpan {
var selectedSpansForRequest = make([][]*model.FlamegraphSpan, 0)
var selectedIndex = 0
if selectedSpanID != "" {
selectedIndex = FindIndexForSelectedSpan(selectedSpans, selectedSpanID)
}
lowerLimit := selectedIndex - int(flamegraphSpanLevelLimit*0.4)
upperLimit := selectedIndex + int(flamegraphSpanLevelLimit*0.6)
if lowerLimit < 0 {
upperLimit = upperLimit - lowerLimit
lowerLimit = 0
}
if upperLimit > len(selectedSpans) {
lowerLimit = lowerLimit - (upperLimit - len(selectedSpans))
upperLimit = len(selectedSpans)
}
if lowerLimit < 0 {
lowerLimit = 0
}
for i := lowerLimit; i < upperLimit; i++ {
if len(selectedSpans[i]) > flamegraphSpanLimitPerLevel {
_spans := getLatencyAndTimestampBucketedSpans(selectedSpans[i], selectedSpanID, i == selectedIndex, startTime, endTime)
selectedSpansForRequest = append(selectedSpansForRequest, _spans)
} else {
selectedSpansForRequest = append(selectedSpansForRequest, selectedSpans[i])
}
}
return selectedSpansForRequest
}
func GetTotalSpanCount(spans [][]*model.FlamegraphSpan) uint64 {
levelCount := len(spans)
spanCount := uint64(0)
for i := range levelCount {
spanCount += uint64(len(spans[i]))
}
return spanCount
}

View File

@@ -43,6 +43,8 @@ type Reader interface {
// Search Interfaces
SearchTraces(ctx context.Context, params *model.SearchTracesParams) (*[]model.SearchSpansResult, error)
GetFlamegraphSpansForTrace(ctx context.Context, orgID valuer.UUID, traceID string, req *model.GetFlamegraphSpansForTraceParams) (*model.GetFlamegraphSpansForTraceResponse, error)
// Setter Interfaces
SetTTL(ctx context.Context, orgID string, ttlParams *retentiontypes.TTLParams) (*retentiontypes.SetTTLResponseItem, *model.ApiError)
SetTTLV2(ctx context.Context, orgID string, params *retentiontypes.CustomRetentionTTLParams) (*retentiontypes.CustomRetentionTTLResponse, error)

View File

@@ -0,0 +1,42 @@
package model
import (
"encoding/json"
"github.com/SigNoz/signoz/pkg/types/cachetypes"
)
type GetFlamegraphSpansForTraceCache struct {
StartTime uint64 `json:"startTime"`
EndTime uint64 `json:"endTime"`
DurationNano uint64 `json:"durationNano"`
SelectedSpans [][]*FlamegraphSpan `json:"selectedSpans"`
TraceRoots []*FlamegraphSpan `json:"traceRoots"`
}
func (c *GetFlamegraphSpansForTraceCache) Clone() cachetypes.Cacheable {
return &GetFlamegraphSpansForTraceCache{
StartTime: c.StartTime,
EndTime: c.EndTime,
DurationNano: c.DurationNano,
SelectedSpans: c.SelectedSpans,
TraceRoots: c.TraceRoots,
}
}
func (c *GetFlamegraphSpansForTraceCache) Cost() int64 {
const perSpanBytes = 128
var spans int64
for _, row := range c.SelectedSpans {
spans += int64(len(row))
}
spans += int64(len(c.TraceRoots))
return spans * perSpanBytes
}
func (c *GetFlamegraphSpansForTraceCache) MarshalBinary() (data []byte, err error) {
return json.Marshal(c)
}
func (c *GetFlamegraphSpansForTraceCache) UnmarshalBinary(data []byte) error {
return json.Unmarshal(data, c)
}

View File

@@ -2,6 +2,8 @@ package model
import (
"time"
"github.com/SigNoz/signoz/pkg/types/telemetrytypes"
)
type InstantQueryMetricsParams struct {
@@ -329,6 +331,14 @@ type SearchTracesParams struct {
MaxSpansInTrace int `json:"maxSpansInTrace"`
}
type GetFlamegraphSpansForTraceParams struct {
SelectedSpanID string `json:"selectedSpanId"`
Limit uint `json:"limit"`
BoundaryStartTS uint64 `json:"boundaryStartTsMilli"`
BoundaryEndTS uint64 `json:"boundarEndTsMilli"`
SelectFields []telemetrytypes.TelemetryFieldKey `json:"selectFields"`
}
type SpanFilterParams struct {
TraceID []string `json:"traceID"`
Status []string `json:"status"`

View File

@@ -7,6 +7,7 @@ import (
"strconv"
"time"
"github.com/SigNoz/signoz/pkg/types/telemetrytypes"
"github.com/pkg/errors"
"github.com/prometheus/prometheus/promql/parser"
"github.com/prometheus/prometheus/util/stats"
@@ -239,6 +240,13 @@ type SearchSpanDBResponseItem struct {
Model string `ch:"model"`
}
type Event struct {
Name string `json:"name,omitempty"`
TimeUnixNano uint64 `json:"timeUnixNano,omitempty"`
AttributeMap map[string]interface{} `json:"attributeMap,omitempty"`
IsError bool `json:"isError,omitempty"`
}
//easyjson:json
type SearchSpanResponseItem struct {
TimeUnixNano uint64 `json:"timestamp"`
@@ -259,6 +267,79 @@ type SearchSpanResponseItem struct {
SpanKind string `json:"spanKind"`
}
type Span struct {
TimeUnixNano uint64 `json:"timestamp"`
DurationNano uint64 `json:"durationNano"`
SpanID string `json:"spanId"`
RootSpanID string `json:"rootSpanId"`
TraceID string `json:"traceId"`
HasError bool `json:"hasError"`
Kind int32 `json:"kind"`
ServiceName string `json:"serviceName"`
Name string `json:"name"`
References []OtelSpanRef `json:"references,omitempty"`
TagMap map[string]string `json:"tagMap"`
Events []Event `json:"event"`
RootName string `json:"rootName"`
StatusMessage string `json:"statusMessage"`
StatusCodeString string `json:"statusCodeString"`
SpanKind string `json:"spanKind"`
Children []*Span `json:"children"`
// the below two fields are for frontend to render the spans
SubTreeNodeCount uint64 `json:"subTreeNodeCount"`
HasChildren bool `json:"hasChildren"`
HasSiblings bool `json:"hasSiblings"`
Level uint64 `json:"level"`
}
type FlamegraphSpan struct {
TimeUnixNano uint64 `json:"timestamp"`
DurationNano uint64 `json:"durationNano"`
SpanID string `json:"spanId"`
TraceID string `json:"traceId"`
HasError bool `json:"hasError"`
ServiceName string `json:"serviceName"`
Name string `json:"name"`
Level int64 `json:"level"`
Events []Event `json:"event"`
References []OtelSpanRef `json:"references,omitempty"`
Children []*FlamegraphSpan `json:"children"`
Attributes map[string]any `json:"attributes,omitempty"`
Resource map[string]string `json:"resource,omitempty"`
}
// SetRequestedFields extracts the requested attribute/resource fields from item into s.
// This can eventually support missing fieldContext by checking both
func (s *FlamegraphSpan) SetRequestedFields(item SpanItemV2, fields []telemetrytypes.TelemetryFieldKey) {
for _, field := range fields {
switch field.FieldContext {
case telemetrytypes.FieldContextResource:
if v, ok := item.Resources_string[field.Name]; ok && v != "" {
if s.Resource == nil {
s.Resource = make(map[string]string)
}
s.Resource[field.Name] = v
}
case telemetrytypes.FieldContextAttribute:
if v := item.AttributeValue(field.Name); v != nil {
if s.Attributes == nil {
s.Attributes = make(map[string]any)
}
s.Attributes[field.Name] = v
}
}
}
}
type GetFlamegraphSpansForTraceResponse struct {
StartTimestampMillis uint64 `json:"startTimestampMillis"`
EndTimestampMillis uint64 `json:"endTimestampMillis"`
DurationNano uint64 `json:"durationNano"`
Spans [][]*FlamegraphSpan `json:"spans"`
HasMore bool `json:"hasMore"`
}
type OtelSpanRef struct {
TraceId string `json:"traceId,omitempty"`
SpanId string `json:"spanId,omitempty"`

View File

@@ -101,9 +101,29 @@ func (b *MetricQueryStatementBuilder) Build(
return nil, err
}
var pairFallbackWarnings []string
for _, sel := range keySelectors {
if _, ok := keys[sel.Name]; !ok {
keys[sel.Name] = []*telemetrytypes.TelemetryFieldKey{{
Name: sel.Name,
FieldContext: telemetrytypes.FieldContextAttribute,
FieldDataType: telemetrytypes.FieldDataTypeString,
Signal: telemetrytypes.SignalMetrics,
}}
pairFallbackWarnings = append(pairFallbackWarnings,
fmt.Sprintf("key `%s` not found on metric %s", sel.Name, query.Aggregations[0].MetricName),
)
}
}
start, end = querybuilder.AdjustedMetricTimeRange(start, end, uint64(query.StepInterval.Seconds()), query)
return b.buildPipelineStatement(ctx, start, end, query, keys, variables)
stmt, err := b.buildPipelineStatement(ctx, start, end, query, keys, variables)
if err != nil {
return nil, err
}
stmt.Warnings = append(stmt.Warnings, pairFallbackWarnings...)
return stmt, nil
}
func (b *MetricQueryStatementBuilder) buildPipelineStatement(

View File

@@ -338,7 +338,6 @@ func isValidLabelValue(v string) bool {
// validate runs during UnmarshalJSON (read + write path).
// Preserves the original pre-existing checks only so that stored rules
// continue to load without errors.
// TODO(srikanthccv): remove this once v1 is deprecated and removed
func (r *PostableRule) validate() error {
var errs []error
@@ -367,13 +366,9 @@ func (r *PostableRule) validate() error {
errs = append(errs, testTemplateParsing(r)...)
if len(errs) > 0 {
messages := make([]string, len(errs))
for i, e := range errs {
messages[i] = e.Error()
}
return errors.NewInvalidInputf(errors.CodeInvalidInput, "alert rule definition is not valid").
WithAdditional(messages...)
joined := errors.Join(errs...)
if joined != nil {
return errors.WrapInvalidInputf(joined, errors.CodeInvalidInput, "validation failed")
}
return nil
}
@@ -471,13 +466,9 @@ func (r *PostableRule) Validate() error {
errs = append(errs, testTemplateParsing(r)...)
if len(errs) > 0 {
messages := make([]string, len(errs))
for i, e := range errs {
messages[i] = e.Error()
}
return errors.NewInvalidInputf(errors.CodeInvalidInput, "alert rule is not valid").
WithAdditional(messages...)
joined := errors.Join(errs...)
if joined != nil {
return errors.WrapInvalidInputf(joined, errors.CodeInvalidInput, "validation failed")
}
return nil
}

View File

@@ -4,23 +4,8 @@ import (
"encoding/json"
"strings"
"testing"
"github.com/SigNoz/signoz/pkg/errors"
)
func errorContains(err error, substr string) bool {
j := errors.AsJSON(err)
if strings.Contains(j.Message, substr) {
return true
}
for _, e := range j.Errors {
if strings.Contains(e.Message, substr) {
return true
}
}
return false
}
// validV1Builder returns a minimal valid v1 builder rule JSON.
func validV1Builder() string {
return `{
@@ -509,7 +494,7 @@ func TestValidate_PostableRule_Common(t *testing.T) {
if tt.wantErr {
if err == nil {
t.Errorf("expected error containing %q, got nil", tt.errSubstr)
} else if tt.errSubstr != "" && !errorContains(err, tt.errSubstr) {
} else if tt.errSubstr != "" && !strings.Contains(err.Error(), tt.errSubstr) {
t.Errorf("expected error containing %q, got: %v", tt.errSubstr, err)
}
} else {
@@ -702,7 +687,7 @@ func TestValidate_V1_ConditionFields(t *testing.T) {
if tt.wantErr {
if validateErr == nil {
t.Errorf("expected Validate() error containing %q, got nil", tt.errSubstr)
} else if tt.errSubstr != "" && !errorContains(validateErr, tt.errSubstr) {
} else if tt.errSubstr != "" && !strings.Contains(validateErr.Error(), tt.errSubstr) {
t.Errorf("expected error containing %q, got: %v", tt.errSubstr, validateErr)
}
} else {
@@ -1044,7 +1029,7 @@ func TestValidate_V2Alpha1(t *testing.T) {
if tt.wantErr {
if err == nil {
t.Errorf("expected error containing %q, got nil", tt.errSubstr)
} else if tt.errSubstr != "" && !errorContains(err, tt.errSubstr) {
} else if tt.errSubstr != "" && !strings.Contains(err.Error(), tt.errSubstr) {
t.Errorf("expected error containing %q, got: %v", tt.errSubstr, err)
}
} else {
@@ -1352,7 +1337,7 @@ func TestValidate_MultipleErrors(t *testing.T) {
t.Fatal("expected unmarshal error for wrong version")
}
// The error should mention version
if !errorContains(err, "version") {
if !strings.Contains(err.Error(), "version") {
t.Errorf("expected error to mention version, got: %v", err)
}
})
@@ -1370,9 +1355,10 @@ func TestValidate_MultipleErrors(t *testing.T) {
if validateErr == nil {
t.Fatal("expected Validate() error")
}
errStr := validateErr.Error()
// Should contain errors for thresholds, evaluation, notificationSettings
for _, substr := range []string{"evaluation", "notificationSettings"} {
if !errorContains(validateErr, substr) {
if !strings.Contains(errStr, substr) {
t.Errorf("expected error to mention %q, got: %v", substr, validateErr)
}
}
@@ -1483,7 +1469,7 @@ func TestValidate_V2Alpha1_CumulativeEvaluation(t *testing.T) {
if tt.wantErr {
if err == nil {
t.Errorf("expected error containing %q, got nil", tt.errSubstr)
} else if !errorContains(err, tt.errSubstr) {
} else if !strings.Contains(err.Error(), tt.errSubstr) {
t.Errorf("expected error containing %q, got: %v", tt.errSubstr, err)
}
} else if err != nil {

View File

@@ -614,7 +614,7 @@ def test_histogram_p90_returns_warning_outside_data_window(
assert warnings[0]["message"].startswith(f"no data found for the metric {metric_name}")
def test_non_existent_metrics_returns_404(
def test_non_existent_metrics_returns_warning(
signoz: types.SigNoz,
create_user_admin: None, # pylint: disable=unused-argument
get_token: Callable[[str, str], str],
@@ -635,9 +635,11 @@ def test_non_existent_metrics_returns_404(
start_2h = int((now - timedelta(hours=2)).timestamp() * 1000)
response = make_query_request(signoz, token, start_2h, end_ms, [query])
assert response.status_code == HTTPStatus.NOT_FOUND
assert response.status_code == HTTPStatus.OK
assert get_error_message(response.json()) == "could not find the metric whatevergoennnsgoeshere"
data = response.json()
warnings = get_all_warnings(data)
assert any("whatevergoennnsgoeshere" in w and "has never been received" in w for w in warnings), f"expected never-seen metric warning, got: {warnings}"
def test_non_existent_internal_metrics_returns_no_warning(